In [None]:
import os
import sys
from dotenv import find_dotenv

# Add the directory of the file located by find_dotenv() to sys.path so that
# project modules (e.g. the `lib` package) can be imported from this notebook.
# The membership check keeps the cell idempotent: re-running it does not pile up
# duplicate sys.path entries.
project_dir = os.path.dirname(find_dotenv())
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, precision_recall_curve, ConfusionMatrixDisplay
from xgboost import XGBClassifier

from lib.constants import DATASET_PATH
from lib.config import EDUCATION_MAPPING, OUTCOME_MAPPING, ALL_COLUMNS, JOB_IMPUTATION_PARAMS_DICT, COMMUNICATION_IMPUTATION_PARAMS_DICT, ONE_HOT_ENCODING_COLUMNS, SCALABLE_NUMERIC_COLUMNS
from lib.transformers import CallDurationInMinutesTransformer, CallDurationInSecondsTransformer, OrdinalMappingTransformer, SelectFeaturesTransformer, SimpleImputationTransformer
from lib.trainer import run_classification_grid_search

In [None]:
# Read the raw CSV found at DATASET_PATH into a DataFrame and show its
# (rows, columns) shape as the cell output.
car_insurance_dataset = pd.read_csv(filepath_or_buffer=DATASET_PATH)
car_insurance_dataset.shape

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# 'CarInsurance' target and seeded so it is reproducible.
strat_train_set, strat_test_set = train_test_split(
    car_insurance_dataset,
    test_size=0.2,
    random_state=42,
    stratify=car_insurance_dataset['CarInsurance'],
)

# Display the shapes of both partitions.
(strat_train_set.shape, strat_test_set.shape)

In [None]:
# Build the modelling inputs from the stratified TRAINING split only, so that
# strat_test_set stays untouched by fitting, model selection and
# cross-validation and remains usable as a genuinely held-out evaluation set.
# Features: every column except the 'CarInsurance' target.
car_insurance = strat_train_set.drop('CarInsurance', axis=1)
# Labels: an independent copy of the target column.
car_insurance_labels = strat_train_set['CarInsurance'].copy()

In [None]:
# Preview the first five rows of the feature frame (target column removed).
car_insurance.head(n=5)

In [None]:
# Ordered feature-engineering steps; make_pipeline applies them in this order.
feature_engineering_steps = [
    # Call-duration features (project transformers; see lib.transformers).
    CallDurationInMinutesTransformer(),
    CallDurationInSecondsTransformer(),
    # Map the 'Education' and 'Outcome' columns through their mapping dicts.
    OrdinalMappingTransformer(feature_column='Education', mapping_dict=EDUCATION_MAPPING),
    OrdinalMappingTransformer(feature_column='Outcome', mapping_dict=OUTCOME_MAPPING),
    # Impute the 'Job' and 'Communication' columns with their configured parameters.
    SimpleImputationTransformer(
        feature_column='Job',
        imputation_params_dict=JOB_IMPUTATION_PARAMS_DICT,
    ),
    SimpleImputationTransformer(
        feature_column='Communication',
        imputation_params_dict=COMMUNICATION_IMPUTATION_PARAMS_DICT,
    ),
    # Final column selection.
    SelectFeaturesTransformer(features=ALL_COLUMNS),
]
feature_engineering_pipeline = make_pipeline(*feature_engineering_steps)

# Fit the pipeline on the feature frame and preview the engineered result.
features = feature_engineering_pipeline.fit_transform(car_insurance)
features.head()

In [None]:
# Inspect the engineered features (presumably a DataFrame, since .head()/.info()
# are called on it) to check the result of the feature-engineering pipeline.
features.info()

In [None]:
encoder_pipeline = ColumnTransformer([
    ('categorical_encoding', OneHotEncoder(), ONE_HOT_ENCODING_COLUMNS),
    ('numerical_scaling', StandardScaler(), SCALABLE_NUMERIC_COLUMNS),
], remainder='passthrough')

feature_vector_pipeline = make_pipeline(
    feature_engineering_pipeline,
    encoder_pipeline,
)

In [None]:
# Seed shared by every stochastic estimator below (same value as the train/test
# split) so that a fresh Restart & Run All reproduces the same model ranking.
RANDOM_STATE = 42

# Candidate models: display name -> (estimator, parameter dict passed to the
# grid search). The dicts are empty, so presumably each estimator is evaluated
# with the settings given here - confirm against run_classification_grid_search.
GRID_SEARCH_PARAMS = {
    # SVC defaults to an RBF kernel; request the linear kernel the label promises.
    'Linear SVM': (SVC(kernel='linear'), {}),
    'Decision Tree': (DecisionTreeClassifier(random_state=RANDOM_STATE), {}),
    'Random Forest': (RandomForestClassifier(random_state=RANDOM_STATE), {}),
    'Neural Net': (MLPClassifier(random_state=RANDOM_STATE), {}),
    'AdaBoost': (AdaBoostClassifier(random_state=RANDOM_STATE), {}),
    'Naive Bayes': (GaussianNB(), {}),
    'XGB': (XGBClassifier(objective='binary:logistic', random_state=RANDOM_STATE), {}),
}

# Run the project grid search over all candidates using the preprocessing
# pipeline; it returns the best score, the best classifier, its name and params.
best_score, best_classifier, best_classifier_name, best_params = run_classification_grid_search(
    params_grid=GRID_SEARCH_PARAMS, pipeline=feature_vector_pipeline, X=car_insurance, y=car_insurance_labels)

In [None]:
# Display the object returned as `best_classifier` by run_classification_grid_search.
best_classifier

In [None]:
# Predictions from 3-fold cross-validation of the selected classifier.
car_insurance_labels_pred = cross_val_predict(
    estimator=best_classifier,
    X=car_insurance,
    y=car_insurance_labels,
    cv=3,
)

# Plot the confusion matrix of true labels against the cross-validated predictions.
ConfusionMatrixDisplay.from_predictions(
    y_true=car_insurance_labels,
    y_pred=car_insurance_labels_pred,
)
plt.show()

In [None]:
# Precision of the cross-validated predictions against the true labels.
precision_score(
    y_true=car_insurance_labels,
    y_pred=car_insurance_labels_pred,
)

In [None]:
# Recall of the cross-validated predictions against the true labels.
recall_score(
    y_true=car_insurance_labels,
    y_pred=car_insurance_labels_pred,
)