In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load dataset
df = pd.read_csv("synthetic_data.csv")
categorical_cols = ['Gender', 'Race', 'Education', 'WorkClass', 'Occupation', 'MaritalStatus', 'NativeCountry']
numerical_cols = ['Age', 'HoursPerWeek', 'CapitalGain', 'CapitalLoss']
X = df.drop(columns=['IncomeClass'])
y = df['IncomeClass']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Encode categorical features
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = X_test[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)
    label_encoders[col] = le

# Scale numerical features
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Hyperparameter tuning
param_grids = {
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "Extra Trees": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 6, 10],
        "learning_rate": [0.01, 0.1, 0.2],
        "subsample": [0.6, 0.8, 1.0],
    },
    "LightGBM": {
        "n_estimators": [50, 100, 200],
        "num_leaves": [31, 50, 100],
        "learning_rate": [0.01, 0.1, 0.2],
    },
    "CatBoost": {
        "iterations": [50, 100, 200],
        "depth": [3, 6, 10],
        "learning_rate": [0.01, 0.1, 0.2],
    },
    "Gradient Boosting": {
        "n_estimators": [50, 100, 200],
        "learning_rate": [0.01, 0.1, 0.2],
        "max_depth": [3, 5, 10],
    },
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear", "saga"],
    },
    "Ridge Classifier": {
        "alpha": [0.1, 1, 10],
    },
    "SGD Classifier": {
        "alpha": [0.0001, 0.001, 0.01],
        "loss": ["hinge", "log_loss"],
    },
    "KNN": {
        "n_neighbors": [3, 5, 10, 20],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },
    "GaussianNB": {
        "var_smoothing": [1e-9, 1e-8, 1e-7],
    },
    "BernoulliNB": {
        "alpha": [0.1, 0.5, 1],
    }
}

best_models = {}

for model_name, param_grid in param_grids.items():
    print(f"\nTuning {model_name}...")
    model = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "Extra Trees": ExtraTreesClassifier(random_state=42),
        "XGBoost": XGBClassifier(eval_metric='logloss', use_label_encoder=False),
        "LightGBM": LGBMClassifier(),
        "CatBoost": CatBoostClassifier(verbose=0),
        "Gradient Boosting": GradientBoostingClassifier(),
        "Logistic Regression": LogisticRegression(),
        "Ridge Classifier": RidgeClassifier(),
        "SGD Classifier": SGDClassifier(random_state=42),
        "KNN": KNeighborsClassifier(),
        "GaussianNB": GaussianNB(),
        "BernoulliNB": BernoulliNB(),
    }[model_name]
    
    search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=3, scoring='accuracy', random_state=42, n_jobs=-1)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_models[model_name] = best_model
    
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else "N/A"
    
    print(f"Best params: {search.best_params_}")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall: {rec:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  ROC-AUC: {roc_auc}")



Tuning Random Forest...
Best params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 20}
  Accuracy: 0.7327
  Precision: 0.7452
  Recall: 0.8916
  F1-score: 0.8118
  ROC-AUC: 0.7730950107429952

Tuning Extra Trees...
Best params: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': None}
  Accuracy: 0.7313
  Precision: 0.7396
  Recall: 0.9021
  F1-score: 0.8128
  ROC-AUC: 0.7759763463714472

Tuning XGBoost...


Parameters: { "use_label_encoder" } are not used.



Best params: {'subsample': 0.6, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}
  Accuracy: 0.7392
  Precision: 0.7544
  Recall: 0.8848
  F1-score: 0.8144
  ROC-AUC: 0.789176393412947

Tuning LightGBM...
[LightGBM] [Info] Number of positive: 25866, number of negative: 14134
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001697 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 655
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.646650 -> initscore=0.604346
[LightGBM] [Info] Start training from score 0.604346
Best params: {'num_leaves': 31, 'n_estimators': 100, 'learning_rate': 0.1}
  Accuracy: 0.7355
  Precision: 0.7616
  Recall: 0.8604
  F1-score: 0.8080
  ROC-AUC: 0.7848247701945267

Tuning CatBoost...


7 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Dell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dell\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_

Best params: {'learning_rate': 0.2, 'iterations': 50, 'depth': 6}
  Accuracy: 0.7360
  Precision: 0.7530
  Recall: 0.8806
  F1-score: 0.8118
  ROC-AUC: 0.7898477458179874

Tuning Gradient Boosting...
Best params: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}
  Accuracy: 0.7318
  Precision: 0.7583
  Recall: 0.8591
  F1-score: 0.8056
  ROC-AUC: 0.7845315924068508

Tuning Logistic Regression...
Best params: {'solver': 'saga', 'penalty': 'l1', 'C': 0.01}
  Accuracy: 0.6927
  Precision: 0.6984
  Recall: 0.9238
  F1-score: 0.7954
  ROC-AUC: 0.663510703451182

Tuning Ridge Classifier...
Best params: {'alpha': 0.1}
  Accuracy: 0.6937
  Precision: 0.6978
  Recall: 0.9284
  F1-score: 0.7968
  ROC-AUC: N/A

Tuning SGD Classifier...




Best params: {'loss': 'log_loss', 'alpha': 0.01}
  Accuracy: 0.6943
  Precision: 0.6961
  Recall: 0.9358
  F1-score: 0.7984
  ROC-AUC: 0.6622808973651901

Tuning KNN...
Best params: {'weights': 'uniform', 'n_neighbors': 20, 'metric': 'manhattan'}
  Accuracy: 0.7184
  Precision: 0.7370
  Recall: 0.8778
  F1-score: 0.8013
  ROC-AUC: 0.7272683703993771

Tuning GaussianNB...
Best params: {'var_smoothing': 1e-09}
  Accuracy: 0.6767
  Precision: 0.7008
  Recall: 0.8727
  F1-score: 0.7774
  ROC-AUC: 0.6349248296704236

Tuning BernoulliNB...




Best params: {'alpha': 0.1}
  Accuracy: 0.6467
  Precision: 0.6467
  Recall: 1.0000
  F1-score: 0.7854
  ROC-AUC: 0.5985993205243141
