# Setting up notebook

In [None]:
import os, sys, logging, datetime, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# display/HTML are taken from the public IPython.display module; importing
# them through IPython.core.display is a deprecated internal path.
from IPython.display import display, HTML
# Inject CSS so that `.container` elements take 80% of the page width.
display(HTML("<style>.container { width:80% !important; }</style>"))

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import zscore

# Put <current working directory>/modules at the front of the import search
# path so the project handler modules imported below resolve from there first.
modules_path = os.path.join(os.getcwd(), 'modules')
sys.path[:0] = [modules_path]

from config_handler import ConfigHandling
from data_handler import DataHandling
# from feature_handler import FeatureHandling
from model_handler import ModelHandling

In [None]:
# Load the job configuration from config.json in the current working directory.
cfg = ConfigHandling(
    job_directory=os.getcwd(),
    config_filename='config.json',
)

# Define helper functions

In [None]:
def plot_feature_importance(df, top_n=10):
    """Draw a horizontal bar chart of the most important features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'feature' column (bar labels) and a
        'feature_importance' column (bar lengths).
    top_n : int, default 10
        Number of highest-importance rows to plot.

    Returns
    -------
    None
        The chart is drawn on a newly created matplotlib figure.
    """
    top = df.sort_values('feature_importance', ascending=False).head(top_n)
    # Size the canvas for the bars actually drawn rather than for every row of
    # `df`; otherwise a frame with hundreds of features yields a very tall,
    # mostly empty figure. The floor keeps the figure height strictly positive.
    height = max(0.3 * len(top), 1.0)
    plt.figure(figsize=(6, height))
    sns.barplot(data=top, y='feature', x='feature_importance', orient='h')

# Get training and prediction (test) data
- Output:
    - Training: 80% of provided training data from train.csv
    - Test: 20% of provided training data from train.csv
    - For the purposes of the case study, and because there is only 1 continuous variable, no outlier removal is performed. In production, the pipeline would include steps to remove outliers.

In [None]:
# Project data accessor built from the loaded configuration.
data = DataHandling(cfg)

# Read the two datasets registered under the 'train_clean' and 'test_clean' keys.
data_full = data.read_data(
    dataset='train_clean',
)
data_submission = data.read_data(
    dataset='test_clean',
)

In [None]:
# Columns grouped as categorical features.
categorical_column = [
    'gender_binary',
    'driving_license',
    'previously_insured',
    'vehicle_damage_binary',
]

# One-hot encoded columns are discovered by name: any column of data_full
# whose name contains one of these fragments is selected.
matchers = ['policy_sales_channel_', 'region_code_', 'vehicle_age_']
ohe_column = [
    column
    for column in data_full.columns
    if any(fragment in column for fragment in matchers)
]

numeric_column = ['age', 'annual_premium', 'days_since_insured']

# Final feature order: categorical, then one-hot, then numeric.
features = [*categorical_column, *ohe_column, *numeric_column]

target_column = ['response']

# Keep the customer ids aside before data_submission is narrowed to features.
# NOTE(review): re-running this cell after it has executed once will fail
# here, because the reassignment below leaves only the feature columns in
# data_submission.
id_column = data_submission['cust_id']

# Restrict both frames to the modelling columns, in a fixed order.
data_full = data_full[[*features, *target_column]]
data_submission = data_submission[features]

In [None]:
# Project model wrapper; it provides the split / fit / evaluation helpers used below.
model = ModelHandling()

# Split features and target into training (X_tr, y_tr) and hold-out (X_tt, y_tt) parts.
# target_column is a list, so the target is passed as a one-column DataFrame.
X_tr, X_tt, y_tr, y_tt = model.train_test_split(
    data_full[features],
    data_full[target_column],
)

# Fitting on Training Data

In [None]:
# For model training - latest model has been stored in the model directory.

# trained_model = model.fit(X_tr, y_tr)

In [None]:
# Restore the previously stored model instead of refitting it.
# NOTE(review): the .pck extension suggests a pickle - only load files from a trusted source.
trained_model = model.load_model('model/model_v1.pck')

# In-sample predictions: hard labels, plus column 1 of predict_proba
# (presumably the positive-class probability - TODO confirm class order).
result_train = trained_model.predict(X_tr)
train_proba_matrix = trained_model.predict_proba(X_tr)
result_train_proba = train_proba_matrix[:, 1]

In [None]:
# Evaluate the training-split predictions; the result is unpacked as
# (accuracy, precision, recall, f1, roc_auc).
(
    train_accuracy,
    train_precision,
    train_recall,
    train_f1,
    train_roc_auc,
) = model.eval_results(y_tr, result_train, result_train_proba)

print(f"In sample accuracy: {train_accuracy}\nIn sample precision: {train_precision}\nIn sample recall: {train_recall}\nIn sample f1: {train_f1}\nIn sample roc auc: {train_roc_auc}")

In [None]:
# To save the model if required

# model.save_model(trained_model, 'model/model_v1.pck')

# Evaluating on Test Data

In [None]:
# Hold-out predictions: hard labels and column 1 of predict_proba.
result_test = trained_model.predict(X_tt)
result_test_proba = trained_model.predict_proba(X_tt)[:, 1]

# Same metric bundle as for the training split, computed on the hold-out part.
(
    test_accuracy,
    test_precision,
    test_recall,
    test_f1,
    test_roc_auc,
) = model.eval_results(y_tt, result_test, result_test_proba)
print(f"Out sample accuracy: {test_accuracy}\nOut sample precision: {test_precision}\nOut sample recall: {test_recall}\nOut sample f1: {test_f1}\nOut sample roc auc: {test_roc_auc}")

# Retraining model on full data

In [None]:
# Refit on every labelled row (training and hold-out parts together).
full_model = model.fit(
    data_full[features],
    data_full[target_column],
)

In [None]:
# Feature importances of the model refit on the full data, passed to the
# plotting helper, which reads the 'feature' and 'feature_importance' columns.
fs = full_model.feature_importance()
plot_feature_importance(df=fs)

# Predicting Submission data

In [None]:
# Score the submission set with full_model, the model refit on all labelled
# data. The original cell used trained_model here, which left the refit model
# unused for prediction.
# NOTE(review): assumes model.fit returns an object exposing predict_proba,
# like the one returned by model.load_model - confirm against ModelHandling.
# Column 1 of predict_proba is taken as the propensity (presumably the
# positive-class probability - TODO confirm class order).
result_submission_proba = full_model.predict_proba(data_submission)[:, 1]

In [None]:
# Pair each customer id with its predicted propensity.
# id_column was taken from data_submission before scoring, so the scores are
# expected to line up with it row for row.
submission_columns = dict(
    cust_id=id_column,
    propensity=result_submission_proba,
)
submission_data = pd.DataFrame(submission_columns)

# Persist without the index so the file holds exactly the two columns above.
submission_data.to_csv('data/submission_data.csv', index=False)

# Comparison with Rule Based system
- From an underwriting perspective:
    - Renew if either (1) there is no previous vehicle damage, or (2) there is previous vehicle damage but the customer was previously insured

In [None]:
# Hold-out features with the true label and the model outputs side by side.
data_comparison = X_tt.copy()
data_comparison['response'] = y_tt
data_comparison['response_gb'] = result_test
data_comparison['response_gb_prob'] = result_test_proba

# Rule-based (underwriting) baseline: default to 0, then flag rows that have
# vehicle damage AND were previously insured.
# A single .loc assignment replaces the chained `df[col][mask] = 1`, which
# raises SettingWithCopyWarning and does not modify the frame under pandas
# copy-on-write.
# NOTE(review): the rule described in the markdown also lists the
# "no previous vehicle damage" case; only the damaged-and-insured case is
# flagged here - confirm which is intended.
data_comparison['response_uw'] = 0
data_comparison.loc[
    (data_comparison['vehicle_damage_binary'] == 1)
    & (data_comparison['previously_insured'] == 1),
    'response_uw',
] = 1
# NOTE(review): a constant score carries no ranking information, so any
# ROC AUC derived from this column is not meaningful for the rule.
data_comparison['response_uw_prob'] = 1 # placeholder as there is no probability

In [None]:
# Metrics for the rule-based baseline on the hold-out split.
(
    uw_accuracy,
    uw_precision,
    uw_recall,
    uw_f1,
    uw_roc_auc,
) = model.eval_results(
    y_tt,
    data_comparison['response_uw'],
    data_comparison['response_uw_prob'],
)
print(f"Out sample accuracy: {uw_accuracy}\nOut sample precision: {uw_precision}\nOut sample recall: {uw_recall}\nOut sample f1: {uw_f1}\nOut sample roc auc: {uw_roc_auc}")

In [None]:
# Metrics for the fitted model on the same hold-out split, for comparison with the rule.
(
    gb_accuracy,
    gb_precision,
    gb_recall,
    gb_f1,
    gb_roc_auc,
) = model.eval_results(
    y_tt,
    data_comparison['response_gb'],
    data_comparison['response_gb_prob'],
)
print(f"Out sample accuracy: {gb_accuracy}\nOut sample precision: {gb_precision}\nOut sample recall: {gb_recall}\nOut sample f1: {gb_f1}\nOut sample roc auc: {gb_roc_auc}")

# Additional Steps

## Getting Cross Validation Scores

In [None]:
# eval_means, eval_mins, eval_maxs = model.cross_val(X_tr, y_tr, 'precision')

In [None]:
# print(f"CV mean score: {eval_means} | Max score: {eval_maxs} | Min score: {eval_mins}")