In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# CSV file for each dataset variant; the key is the label that appears in
# the result column names, e.g. "Accuracy (Baseline)".
datasets = dict(
    Baseline="synthetic_data.csv",
    Gaussian="dataset_with_gaussian_noise.csv",
    Laplace="dataset_with_laplace_noise.csv",
)

# Feature columns that get label-encoded before model fitting.
categorical_cols = [
    'Gender',
    'Race',
    'Education',
    'WorkClass',
    'Occupation',
    'MaritalStatus',
    'NativeCountry',
]
# Feature columns that get standardized before model fitting.
numerical_cols = [
    'Age',
    'HoursPerWeek',
    'CapitalGain',
    'CapitalLoss',
]

# Registry of classifiers to benchmark, keyed by the display name used as the
# column header in the results table. Insertion order determines the column
# order of the exported CSV.
models = {
    # --- linear models ---
    "Logistic Regression": LogisticRegression(
        solver='saga',
        penalty='l1',
        C=0.01,
        random_state=42,
    ),
    "Ridge Classifier": RidgeClassifier(alpha=0.1),
    "SGD Classifier": SGDClassifier(
        loss="log_loss",
        alpha=0.01,
        random_state=42,
    ),
    # --- bagged tree ensembles ---
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        min_samples_split=10,
        min_samples_leaf=2,
        max_depth=20,
        random_state=42,
    ),
    "Extra Trees": ExtraTreesClassifier(
        n_estimators=100,
        min_samples_split=5,
        min_samples_leaf=4,
        max_depth=None,
        random_state=42,
    ),
    # --- boosted tree ensembles ---
    "XGBoost": XGBClassifier(
        subsample=0.6,
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        eval_metric='logloss',
        random_state=42,
    ),
    "LightGBM": LGBMClassifier(
        num_leaves=31,
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
    ),
    "CatBoost": CatBoostClassifier(
        learning_rate=0.2,
        iterations=200,
        depth=3,
        verbose=0,
        random_state=42,
    ),
    "Gradient Boosting (GBM)": GradientBoostingClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
    ),
    # --- instance-based and probabilistic models ---
    "K-Nearest Neighbors": KNeighborsClassifier(
        weights='uniform',
        n_neighbors=20,
        metric='manhattan',
    ),
    "Gaussian Naive Bayes": GaussianNB(var_smoothing=1e-09),
    "Bernoulli Naive Bayes": BernoulliNB(alpha=0.1),
}

def _positive_class_scores(model, features):
    """Return continuous positive-class scores for ROC-AUC, or None.

    Uses column 1 of ``predict_proba`` when the model offers it. Otherwise
    falls back to ``decision_function`` so that margin-based classifiers
    without probability output (e.g. the Ridge classifier in ``models``)
    still receive a ROC-AUC instead of "N/A". Returns None when the model
    exposes neither method.
    """
    if hasattr(model, "predict_proba"):
        return model.predict_proba(features)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    return None


# results[model name]["<metric> (<dataset name>)"] -> rounded score
results = {}

for dataset_name, file_path in datasets.items():
    df = pd.read_csv(file_path)
    X = df.drop(columns=['IncomeClass'])
    y = df['IncomeClass']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # The splits are modified column-by-column below; take explicit copies so
    # the assignments never target a view of the original frame (avoids
    # pandas' SettingWithCopyWarning / chained-assignment ambiguity).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Encoders are fitted on the training split only, so no information from
    # the test split leaks into preprocessing.
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        # Build the label -> code lookup once per column rather than calling
        # le.transform() and scanning le.classes_ for every test row.
        # Categories not seen during training are encoded as -1.
        code_by_label = {label: code for code, label in enumerate(le.classes_)}
        X_test[col] = X_test[col].map(code_by_label).fillna(-1).astype(int)
        label_encoders[col] = le

    scaler = StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    for name, model in models.items():
        # fit() is called again for each dataset on the same estimator object.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_score = _positive_class_scores(model, X_test)

        acc = round(accuracy_score(y_test, y_pred), 4)
        prec = round(precision_score(y_test, y_pred), 4)
        rec = round(recall_score(y_test, y_pred), 4)
        f1 = round(f1_score(y_test, y_pred), 4)
        # "N/A" is kept as the placeholder when no continuous score exists.
        roc_auc = round(roc_auc_score(y_test, y_score), 4) if y_score is not None else "N/A"

        model_results = results.setdefault(name, {})
        model_results[f"Accuracy ({dataset_name})"] = acc
        model_results[f"Precision ({dataset_name})"] = prec
        model_results[f"Recall ({dataset_name})"] = rec
        model_results[f"F1-score ({dataset_name})"] = f1
        model_results[f"ROC-AUC ({dataset_name})"] = roc_auc

# One column per model, one row per "<metric> (<dataset>)" entry.
results_df = pd.DataFrame(results)
results_df.to_csv("model_comparison_results.csv")


[LightGBM] [Info] Number of positive: 25866, number of negative: 14134
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 655
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.646650 -> initscore=0.604346
[LightGBM] [Info] Start training from score 0.604346
[LightGBM] [Info] Number of positive: 25866, number of negative: 14134
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1058
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 11
[LightGBM] [Info] [bi