In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

df = pd.read_csv("synthetic_data.csv")
categorical_cols = ['Gender', 'Race', 'Education', 'WorkClass', 'Occupation', 'MaritalStatus', 'NativeCountry']
numerical_cols = ['Age', 'HoursPerWeek', 'CapitalGain', 'CapitalLoss']
X = df.drop(columns=['IncomeClass'])
y = df['IncomeClass']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

label_encoders = {}  
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = X_test[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1) 
    label_encoders[col] = le  

scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

models = {
    "Logistic Regression": LogisticRegression(),
    "Ridge Classifier": RidgeClassifier(),
    "SGD Classifier": SGDClassifier(loss="log_loss", random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "Gradient Boosting (GBM)": GradientBoostingClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Gaussian Naive Bayes": GaussianNB(),
    "Bernoulli Naive Bayes": BernoulliNB(),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else "N/A"

    print(f"\n{name} Metrics:")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall: {rec:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  ROC-AUC: {roc_auc}")


Logistic Regression Metrics:
  Accuracy: 0.6920
  Precision: 0.6980
  Recall: 0.9230
  F1-score: 0.7949
  ROC-AUC: 0.6645103790889242

Ridge Classifier Metrics:
  Accuracy: 0.6937
  Precision: 0.6978
  Recall: 0.9284
  F1-score: 0.7968
  ROC-AUC: N/A

SGD Classifier Metrics:
  Accuracy: 0.6953
  Precision: 0.6921
  Recall: 0.9525
  F1-score: 0.8017
  ROC-AUC: 0.6613275060463952

Random Forest Metrics:
  Accuracy: 0.7097
  Precision: 0.7430
  Recall: 0.8426
  F1-score: 0.7897
  ROC-AUC: 0.7477549260411596

Extra Trees Metrics:
  Accuracy: 0.7058
  Precision: 0.7420
  Recall: 0.8356
  F1-score: 0.7860
  ROC-AUC: 0.7339360696914479

XGBoost Metrics:
  Accuracy: 0.7316
  Precision: 0.7567
  Recall: 0.8622
  F1-score: 0.8060
  ROC-AUC: 0.7740122280763437
[LightGBM] [Info] Number of positive: 25866, number of negative: 14134
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001518 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if