In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler,  OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.utils.validation import check_is_fitted

from sklearn.metrics import classification_report, confusion_matrix

## Model search pipeline

In [2]:
# Read the adults dataset, then discard the two columns that are not used
# as model inputs further down.
df = pd.read_csv("../data/data_adults.csv")
df.head()

unused_columns = ["fnlwgt", "education-num"]
df_fil = df.drop(columns=unused_columns)
df_fil.head(2)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [3]:
# Feature matrix: every remaining column except the target.
X = df_fil.drop(columns="income")
X.head(2)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States


In [4]:
# Boolean target: True for rows labelled as earning more than 50K. Both
# spellings of the positive label (with and without a trailing dot) are matched.
positive_labels = ['>50K.', '>50K']
y = df_fil["income"].isin(positive_labels)
y.head(2)

0    False
1    False
Name: income, dtype: bool

In [5]:

# Column groups are derived from the dtypes of the full feature frame.
categorical_features = X.select_dtypes(include=["object"]).columns
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns

# Numeric columns: fill gaps with the column mean, then standardise.
numerical_transformer = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

# Categorical columns: fill gaps with the mode, then one-hot encode. Categories
# unseen during fit are ignored at transform time instead of raising.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
)

# Combined preprocessor, configured to return a DataFrame.
preprocessor = make_column_transformer(
    (numerical_transformer, numerical_features),
    (categorical_transformer, categorical_features),
)
preprocessor = preprocessor.set_output(transform="pandas")

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
class ModelSearchTransformer(BaseEstimator, TransformerMixin):
    """Tune an estimator with ``GridSearchCV`` in ``fit`` and expose its predictions via ``transform``.

    Wrapping the search in a transformer lets several model searches sit side by
    side as branches of a ``ColumnTransformer``.

    Parameters
    ----------
    model : estimator
        Unfitted estimator to tune. It is cloned before the search, so the
        object passed in is never fitted itself.
    param_grid : dict or list of dict
        Hyper-parameter grid forwarded to ``GridSearchCV``. An empty dict
        evaluates the estimator once with its current parameters.
    scoring : str or callable, default='f1'
        Single metric forwarded to ``GridSearchCV``.
    cv : int, cross-validation generator or None, default=None
        Forwarded to ``GridSearchCV``; ``None`` keeps its default splitting.
    n_jobs : int or None, default=-1
        Forwarded to ``GridSearchCV``.
    verbose : int, default=1
        Forwarded to ``GridSearchCV``.

    Attributes
    ----------
    best_estimator_ : estimator
        Estimator refit by the search with the best parameter combination.
    best_params_ : dict
        Parameter combination selected by the search.
    best_score_ : float
        Mean cross-validated score of ``best_estimator_``.
    """

    def __init__(self, model, param_grid, scoring='f1', cv=None, n_jobs=-1, verbose=1):
        # Only constructor arguments are stored here. Fitted state (names ending
        # in "_") is created in fit(), so an unfitted instance is recognisable
        # as such by check_is_fitted.
        self.model = model
        self.param_grid = param_grid
        self.scoring = scoring
        self.cv = cv
        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(self, X, y=None):
        """Run the grid search on ``(X, y)`` and keep the refit best estimator.

        Returns
        -------
        self
        """
        search = GridSearchCV(
            clone(self.model),
            param_grid=self.param_grid,
            scoring=self.scoring,
            cv=self.cv,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        )
        search.fit(X, y)
        self.best_estimator_ = search.best_estimator_
        # Kept so that candidate models can be compared on cross-validated
        # score without touching the held-out test split.
        self.best_params_ = search.best_params_
        self.best_score_ = search.best_score_
        return self

    def transform(self, X):
        """Return the best estimator's predictions for ``X`` as a single column.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        check_is_fitted(self, 'best_estimator_')
        # Reshape to (n_samples, 1): transformers are expected to return 2-D output.
        return self.best_estimator_.predict(X).reshape(-1, 1)


# One seed shared by every stochastic estimator below.
SEARCH_SEED = 20241030

# Hyper-parameters searched for both tree ensembles.
shared_tree_grid = {
    'n_estimators': [120, 150, 180],
    'max_depth': [2, 5, 8],
    'min_samples_split': [4, 6, 8],
    'min_samples_leaf': [1, 2],
}

# Random forest: shared tree grid plus bootstrap sampling.
rf_transformer = ModelSearchTransformer(
    model=RandomForestClassifier(random_state=SEARCH_SEED),
    param_grid={**shared_tree_grid, 'bootstrap': [True]},
)

# Gradient boosting: shared tree grid plus two learning rates.
gb_transformer = ModelSearchTransformer(
    model=GradientBoostingClassifier(random_state=SEARCH_SEED),
    param_grid={**shared_tree_grid, 'learning_rate': [0.1, 0.05]},
)

# Gaussian naive Bayes: nothing to tune, so the grid is empty (one candidate).
nb_transformer = ModelSearchTransformer(model=GaussianNB(), param_grid={})

# Support vector classifier: regularisation strength x kernel.
svc_transformer = ModelSearchTransformer(
    model=SVC(random_state=SEARCH_SEED),
    param_grid={
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
)

# Logistic regression: regularisation strength x solver, with a generous
# iteration cap.
lr_transformer = ModelSearchTransformer(
    model=LogisticRegression(random_state=SEARCH_SEED, max_iter=10000),
    param_grid={
        'C': [0.1, 1, 10],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
)


# Every candidate gets its own branch: preprocessing followed by its grid
# search. Each branch selects all input columns, and the branches are fitted
# in parallel. The order below fixes each branch's position in the fitted
# `transformers_` list.
search_steps = (
    rf_transformer,
    gb_transformer,
    nb_transformer,
    svc_transformer,
    lr_transformer,
)

models_search_pipeline = make_column_transformer(
    *[
        (make_pipeline(preprocessor, search_step), make_column_selector())
        for search_step in search_steps
    ],
    n_jobs=-1,
)

In [7]:
# Fit all five branches on the training split. Each branch's
# ModelSearchTransformer runs its own GridSearchCV with verbose=1, which is
# what produces the "Fitting 5 folds ..." log lines in the output.
models_search_pipeline.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [8]:
# Report, for every fitted branch, the tuned estimator and its scores on the
# held-out test split.
for _branch_name, fitted_pipeline, _branch_columns in models_search_pipeline.transformers_:
    search_step = fitted_pipeline.named_steps['modelsearchtransformer']
    best_model = search_step.best_estimator_
    print(best_model)

    # Transforming with the branch pipeline preprocesses X_test and returns
    # the tuned model's predictions.
    y_pred = fitted_pipeline.transform(X_test)

    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print(f"TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")
    print(classification_report(y_test, y_pred))

RandomForestClassifier(max_depth=8, min_samples_leaf=2, min_samples_split=8,
                       n_estimators=150, random_state=20241030)
TN: 7206, FP: 208, FN: 1237, TP: 1118
              precision    recall  f1-score   support

       False       0.85      0.97      0.91      7414
        True       0.84      0.47      0.61      2355

    accuracy                           0.85      9769
   macro avg       0.85      0.72      0.76      9769
weighted avg       0.85      0.85      0.84      9769

GradientBoostingClassifier(max_depth=8, min_samples_leaf=2, min_samples_split=6,
                           n_estimators=150, random_state=20241030)
TN: 7007, FP: 407, FN: 836, TP: 1519
              precision    recall  f1-score   support

       False       0.89      0.95      0.92      7414
        True       0.79      0.65      0.71      2355

    accuracy                           0.87      9769
   macro avg       0.84      0.80      0.81      9769
weighted avg       0.87      0.87   

In [9]:
# The GradientBoostingClassifier is taken as the best model; its branch is the
# second entry of the fitted column transformer, and its tuned estimator is
# what gets persisted.
# NOTE(review): only the bare estimator is written, without the fitted
# preprocessing step it was trained behind - confirm that whoever loads this
# file applies the same preprocessing before calling predict.
gb_search_pipeline = models_search_pipeline.transformers_[1][1]
best_overall_model = gb_search_pipeline.named_steps['modelsearchtransformer'].best_estimator_
joblib.dump(best_overall_model, 'models_search_pipeline.joblib')

['models_search_pipeline.joblib']