In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load data
df = pd.read_csv("synthetic_data.csv")
categorical_cols = ['Gender', 'Race', 'Education', 'WorkClass', 'Occupation', 'MaritalStatus', 'NativeCountry']
numerical_cols = ['Age', 'HoursPerWeek', 'CapitalGain', 'CapitalLoss']
X = df.drop(columns=['IncomeClass'])
y = df['IncomeClass']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Encode categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = X_test[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)
    label_encoders[col] = le

# Scale numerical features
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Define models with best hyperparameters
models = {
    "Logistic Regression": LogisticRegression(solver='saga', penalty='l1', C=0.01, random_state=42),
    "Ridge Classifier": RidgeClassifier(alpha=0.1),
    "SGD Classifier": SGDClassifier(loss="log_loss", alpha=0.01, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_depth=20, random_state=42),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=4, max_depth=None, random_state=42),
    "XGBoost": XGBClassifier(subsample=0.6, n_estimators=100, max_depth=3, learning_rate=0.1, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(num_leaves=31, n_estimators=100, learning_rate=0.1, random_state=42),
    "CatBoost": CatBoostClassifier(learning_rate=0.2, iterations=200, depth=3, verbose=0, random_state=42),
    "Gradient Boosting (GBM)": GradientBoostingClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(weights='uniform', n_neighbors=20, metric='manhattan'),
    "Gaussian Naive Bayes": GaussianNB(var_smoothing=1e-09),
    "Bernoulli Naive Bayes": BernoulliNB(alpha=0.1),
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else "N/A"

    print(f"\n{name} Metrics:")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall: {rec:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  ROC-AUC: {roc_auc}")



Logistic Regression Metrics:
  Accuracy: 0.6927
  Precision: 0.6984
  Recall: 0.9238
  F1-score: 0.7954
  ROC-AUC: 0.6635109660572469

Ridge Classifier Metrics:
  Accuracy: 0.6937
  Precision: 0.6978
  Recall: 0.9284
  F1-score: 0.7968
  ROC-AUC: N/A

SGD Classifier Metrics:
  Accuracy: 0.6943
  Precision: 0.6961
  Recall: 0.9358
  F1-score: 0.7984
  ROC-AUC: 0.6622808973651901

Random Forest Metrics:
  Accuracy: 0.7327
  Precision: 0.7452
  Recall: 0.8916
  F1-score: 0.8118
  ROC-AUC: 0.7730950107429952

Extra Trees Metrics:
  Accuracy: 0.7313
  Precision: 0.7396
  Recall: 0.9021
  F1-score: 0.8128
  ROC-AUC: 0.7759763463714472

XGBoost Metrics:
  Accuracy: 0.7358
  Precision: 0.7471
  Recall: 0.8941
  F1-score: 0.8140
  ROC-AUC: 0.788857305160196
[LightGBM] [Info] Number of positive: 25866, number of negative: 14134
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if 