In [1]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

import warnings
warnings.simplefilter("ignore")

In [2]:
# Set a seed value
seed_value = 42
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

In [3]:
DATASET_PATH = os.getenv("DATASET_PATH")

sample = pd.read_csv(DATASET_PATH)
sample.shape

(38000, 15)

In [4]:
numerical_columns = sample.select_dtypes("number").columns.tolist()[2:]

In [5]:
X = sample[numerical_columns]
y = sample["category"]

## Modeling

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed_value, test_size=.2)

In [7]:
cv = StratifiedKFold(n_splits=5)

In [8]:
def search_pipeline(pipeline, search_space, cv, random_state, X, y):
    # create a randomsearch for pipeline, fits and returns the best model
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=search_space,
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=random_state
    )

    search.fit(X_train, y_train)

    best_pipe = search.best_estimator_
    print(best_pipe)
    
    return best_pipe

### Logistic Regression

In [9]:
%%time
# Create a pipeline
pipe_log = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(random_state=seed_value))
    ]
)

search_space_log = [
    {
        "classifier__penalty": ['l2', 'l1'],
        "classifier__C": np.logspace(0, 4, 10),
        "classifier__solver":['newton-cg', 'saga', 'sag', 'liblinear']
    }
]

best_pipe_log = search_pipeline(pipe_log, search_space_log, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('classifier',
                 LogisticRegression(C=7.742636826811269, random_state=42,
                                    solver='sag'))])
CPU times: user 1.79 s, sys: 209 ms, total: 2 s
Wall time: 9.44 s


### Decision Tree

In [10]:
%%time
# Create a pipeline
pipe_tree = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("classifier", DecisionTreeClassifier(random_state=seed_value))
    ]
)

search_space_tree = [
    {
        "classifier__criterion": ['gini', 'entropy'],
        "classifier__max_depth": [2, 4, 6, 8, 10, 12]
    }
]

best_pipe_tree = search_pipeline(pipe_tree, search_space_tree, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('classifier',
                 DecisionTreeClassifier(max_depth=10, random_state=42))])
CPU times: user 524 ms, sys: 95.1 ms, total: 619 ms
Wall time: 2.16 s


### Random Forest

In [11]:
%%time
# Create a pipeline
pipe_rf = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("classifier", RandomForestClassifier(random_state=seed_value))
    ]
)

search_space_rf = [
    {
        "classifier__n_estimators": [10, 100, 1000],
        "classifier__max_depth":[5, 8, 15, 25, 30, None],
        "classifier__min_samples_leaf":[1, 2, 5, 10, 15, 100],
        "classifier__max_leaf_nodes": [2, 5, 10]
    }
]

best_pipe_rf = search_pipeline(pipe_rf, search_space_rf, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('classifier',
                 RandomForestClassifier(max_depth=30, max_leaf_nodes=10,
                                        min_samples_leaf=15, n_estimators=1000,
                                        random_state=42))])
CPU times: user 12 s, sys: 105 ms, total: 12.1 s
Wall time: 40.3 s


### K-Nearest Neighbors

In [12]:
%%time
# Create a pipeline
pipe_knn = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=6))
    ]
)

search_space_knn = [
    {
        'classifier__n_neighbors': [3, 7, 11],
        'classifier__weights': ['uniform', 'distance']
    }
]

best_pipe_knn = search_pipeline(pipe_knn, search_space_knn, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('classifier',
                 KNeighborsClassifier(n_neighbors=11, weights='distance'))])
CPU times: user 382 ms, sys: 51.7 ms, total: 434 ms
Wall time: 5.39 s


### Multilayer Perceptron

In [13]:
%%time
# Create a pipeline
pipe_mlp = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", MLPClassifier(max_iter=100, random_state=seed_value))
    ]
)

search_space_mlp = [
    {
        'classifier__hidden_layer_sizes': [(10, 30, 10),(20,)],
        'classifier__activation': ['tanh', 'relu'],
        'classifier__solver': ['sgd', 'adam'],
        'classifier__alpha': [0.0001, 0.05],
        'classifier__learning_rate': ['constant', 'adaptive']
    }
]

best_pipe_mlp = search_pipeline(pipe_mlp, search_space_mlp, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('classifier',
                 MLPClassifier(activation='tanh', alpha=0.05,
                               hidden_layer_sizes=(10, 30, 10), max_iter=100,
                               random_state=42))])
CPU times: user 11.4 s, sys: 90.6 ms, total: 11.5 s
Wall time: 1min 50s


### XGBoost

In [None]:
%%time
# Create a pipeline
pipe_xgb = Pipeline(
    [
        ("classifier", xgb.XGBClassifier(n_jobs=-1, random_state=seed_value))
    ]
)

search_space_xgb = [
    {
        # 'classifier__max_depth'        : [1, 2, 3, 4, 5, 6, 7],
        # 'classifier__gamma'            : [0, 0.5, 1],
        # 'classifier__learning_rate'    : [0.1, 0.01, 0.001],
        # 'classifier__subsample'        : [0.2, 0.4, 0.5, 0.6, 0.7],
        'classifier__reg_alpha'        : [0, 0.5, 1],
        # 'classifier__reg_lambda'       : [1, 1.5, 2, 3, 4.5]
    }
]

best_pipe_xgb = search_pipeline(pipe_xgb, search_space_xgb, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [14]:
def model_evaluation_folds(pipe, cv):
    print("=" * 100)
    model_name = pipe["classifier"].__class__.__name__
    print(f"{model_name} evaluation\n")
    i = 1
    for train_ix, val_ix in cv.split(X_train, y_train):
        
        # select rows
        train_X, val_X = X_train.iloc[train_ix, :], X_train.iloc[val_ix, :]
        train_y, val_y = y_train.iloc[train_ix], y_train.iloc[val_ix]
        
        y_train_pred = pipe.predict(train_X)
        y_train_proba = pipe.predict_proba(train_X)
        
        y_val_pred = pipe.predict(val_X)
        y_val_proba = pipe.predict_proba(val_X)
        
        train_auc = roc_auc_score(train_y, y_train_proba, multi_class="ovr")
        val_auc = roc_auc_score(val_y, y_val_proba, multi_class="ovr")
        
        train_precision = precision_score(train_y, y_train_pred, average='weighted')
        val_precision = precision_score(val_y, y_val_pred, average='weighted')
        
        train_recall = recall_score(train_y, y_train_pred, average='weighted')
        val_recall = recall_score(val_y, y_val_pred, average='weighted')
        
        train_f1 = f1_score(train_y, y_train_pred, average="weighted")
        val_f1 = f1_score(val_y, y_val_pred, average="weighted")
        
        print("FOLD", i)
        print(f"\t - Train Accuracy: {accuracy_score(train_y, y_train_pred)} ; Validation Accuracy {accuracy_score(val_y, y_val_pred)}")
        print(f"\t - Train ROC AUC: {train_auc} ; Validation ROC AUC {val_auc}")
        print(f"\t - Train Precision-Score: {train_precision} ; Validation Precision-Score {val_precision}")
        print(f"\t - Train Recall-Score: {train_recall} ; Validation Recall-Score {val_recall}")
        print(f"\t - Train F1-Score: {train_f1} ; Validation F1-Score {val_f1}")
        
        i += 1

In [19]:
for model in [best_pipe_log, best_pipe_tree, best_pipe_rf, best_pipe_knn, best_pipe_mlp]:
    model_evaluation_folds(model, cv)

LogisticRegression evaluation

FOLD 1
	 - Train Accuracy: 0.5999177631578947 ; Validation Accuracy 0.5991776315789473
	 - Train ROC AUC: 0.769149739671895 ; Validation ROC AUC 0.7694731904175546
	 - Train Precision-Score: 0.5725089925025241 ; Validation Precision-Score 0.5638384771699546
	 - Train Recall-Score: 0.5999177631578947 ; Validation Recall-Score 0.5991776315789473
	 - Train F1-Score: 0.5272061132955319 ; Validation F1-Score 0.5253723837494726
FOLD 2
	 - Train Accuracy: 0.6009046052631579 ; Validation Accuracy 0.5952302631578947
	 - Train ROC AUC: 0.7686396371472718 ; Validation ROC AUC 0.7715395590345778
	 - Train Precision-Score: 0.5667609445301482 ; Validation Precision-Score 0.5768741048023114
	 - Train Recall-Score: 0.6009046052631579 ; Validation Recall-Score 0.5952302631578947
	 - Train F1-Score: 0.5283915201436086 ; Validation F1-Score 0.5205188425725152
FOLD 3
	 - Train Accuracy: 0.5986842105263158 ; Validation Accuracy 0.6041118421052631
	 - Train ROC AUC: 0.76904171

FOLD 1
	 - Train Accuracy: 0.63125 ; Validation Accuracy 0.6304276315789473
	 - Train ROC AUC: 0.8165450164867712 ; Validation ROC AUC 0.8155220183159427
	 - Train Precision-Score: 0.6286956016706798 ; Validation Precision-Score 0.6344351612293087
	 - Train Recall-Score: 0.63125 ; Validation Recall-Score 0.6304276315789473
	 - Train F1-Score: 0.5899412009336299 ; Validation F1-Score 0.5901711147505994
FOLD 2
	 - Train Accuracy: 0.6310444078947368 ; Validation Accuracy 0.63125
	 - Train ROC AUC: 0.8172617701540087 ; Validation ROC AUC 0.8127683408276335
	 - Train Precision-Score: 0.6285205611870038 ; Validation Precision-Score 0.6339046248138712
	 - Train Recall-Score: 0.6310444078947368 ; Validation Recall-Score 0.63125
	 - Train F1-Score: 0.5899733021076345 ; Validation F1-Score 0.5900219790149497
FOLD 3
	 - Train Accuracy: 0.6307976973684211 ; Validation Accuracy 0.6322368421052632
	 - Train ROC AUC: 0.8158428510237882 ; Validation ROC AUC 0.8184437310724496
	 - Train Precision-Score

In [49]:
def evaluate_models(pipes):
    names = []
    train_acc = []
    test_acc = []
    train_roc = []
    test_roc = []
    train_precision = []
    test_precision = []
    train_recall = []
    test_recall = []
    train_f1 = []
    test_f1 = []
    for pipe in pipes:
        names.append(pipe["classifier"].__class__.__name__)
        train_acc.append(accuracy_score(y_train, pipe.predict(X_train)))
        test_acc.append(accuracy_score(y_test, pipe.predict(X_test)))
        train_roc.append(roc_auc_score(y_train, pipe.predict_proba(X_train), multi_class="ovo"))
        test_roc.append(roc_auc_score(y_test, pipe.predict_proba(X_test), multi_class="ovo"))
        train_precision.append(precision_score(y_train, pipe.predict(X_train), average='weighted'))
        test_precision.append(precision_score(y_test, pipe.predict(X_test), average='weighted'))
        train_recall.append(recall_score(y_train, pipe.predict(X_train), average='weighted'))
        test_recall.append(recall_score(y_test, pipe.predict(X_test), average='weighted'))
        train_f1.append(f1_score(y_train, pipe.predict(X_train), average='weighted'))
        test_f1.append(f1_score(y_test, pipe.predict(X_test), average='weighted'))
    
    return pd.DataFrame(
    {
        "model_name": names,
        "train_accuracy": train_acc,
        "test_accuracy": test_acc,
        "train_roc_auc": train_roc,
        "test_roc_auc": test_roc,
        "train_precision": train_precision,
        "test_precision": test_precision,
        "train_recall": train_recall,
        "test_recall": test_recall,
        "train_f1_score": train_f1,
        "test_f1_score": test_f1,
    }
)

In [50]:
evaluate_models(pipes=[best_pipe_log, best_pipe_tree, best_pipe_rf, best_pipe_knn, best_pipe_mlp])

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_roc_auc,test_roc_auc,train_precision,test_precision,train_recall,test_recall,train_f1_score,test_f1_score
0,LogisticRegression,0.59977,0.586711,0.734905,0.727314,0.569865,0.568371,0.59977,0.586711,0.526837,0.510911
1,DecisionTreeClassifier,0.684474,0.635263,0.829208,0.757241,0.686618,0.604978,0.684474,0.635263,0.652327,0.598991
2,RandomForestClassifier,0.621743,0.608947,0.755462,0.745935,0.546609,0.53408,0.621743,0.608947,0.559167,0.545588
3,KNeighborsClassifier,1.0,0.606053,1.0,0.704387,1.0,0.558744,1.0,0.606053,1.0,0.572591
4,MLPClassifier,0.631086,0.614342,0.773826,0.769141,0.630035,0.588298,0.631086,0.614342,0.589987,0.573137


%%time

# Create dictionary with candidate learning algorithms and their hyperparameters
search_space = [
    {
        "classifier": [LogisticRegression(random_state=seed_value)],
        "classifier__penalty": ['l2', 'l1'],
        "classifier__C": np.logspace(0, 4, 10),
        "classifier__solver":['newton-cg', 'saga', 'sag', 'liblinear']
    },
    {
        "classifier": [DecisionTreeClassifier(random_state=seed_value)],
        "classifier__criterion": ['gini', 'entropy'],
        "classifier__max_depth": [2, 4, 6, 8, 10, 12]
    },
    {
        "classifier": [RandomForestClassifier(random_state=seed_value)],
        "classifier__n_estimators": [10, 100, 1000],
        "classifier__max_depth":[5, 8, 15, 25, 30, None],
        "classifier__min_samples_leaf":[1, 2, 5, 10, 15, 100],
        "classifier__max_leaf_nodes": [2, 5, 10]
    },
    {
        'classifier': [KNeighborsClassifier(n_neighbors=6)],
        'classifier__n_neighbors': [3, 7, 11],
        'classifier__weights': ['uniform', 'distance']
    },
    {
        'classifier': [MLPClassifier(max_iter=100, random_state=seed_value)],
        'classifier__hidden_layer_sizes': [(10, 30, 10),(20,)],
        'classifier__activation': ['tanh', 'relu'],
        'classifier__solver': ['sgd', 'adam'],
        'classifier__alpha': [0.0001, 0.05],
        'classifier__learning_rate': ['constant', 'adaptive']
    },
    {
        "classifier": [xgb.XGBClassifier()],
        'classifier__max_depth'        : [1, 2, 3, 4, 5, 6, 7],
        'classifier__gamma'            : [0, 0.5, 1],
        'classifier__learning_rate'    : [0.1, 0.01, 0.001],
        'classifier__subsample'        : [0.2, 0.4, 0.5, 0.6, 0.7],
        'classifier__reg_alpha'        : [0, 0.5, 1],
        'classifier__reg_lambda'       : [1, 1.5, 2, 3, 4.5]
    }
]



In [51]:
def calculate_feature_importance(pipe):
    x = pd.DataFrame({
        "features": X_train.columns.tolist(),
        "importances": pipe["classifier"].feature_importances_
    }).sort_values("importances", ascending=False)
    
    x["cum_sum"] = np.cumsum(x["importances"])
    
    return x

In [53]:
def save_model(model, path):
    pd.to_pickle(model, path)

In [55]:
save_model(best_pipe_tree, os.getenv("MODEL_PATH"))