In [1]:
# system
import warnings
import os

# processing
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler

# modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier

# performance 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, r2_score, mean_squared_error,  \
confusion_matrix, precision_recall_fscore_support, classification_report

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
%config InlineBackend.figure_format = 'retina'

In [2]:
df = pd.read_csv('../artifacts/data.csv')
df1 = df.query('Premium_Level == 1')
df2 = df.query('Premium_Level > 1')
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Premium_Level,Premium_Log
0,1,1,44,1,28.0,0,3,1,40454.0,26.0,217,1,3.0,4.606961
1,2,1,76,1,3.0,0,2,0,33536.0,26.0,183,0,3.0,4.525511
2,3,1,47,1,28.0,0,3,1,38294.0,26.0,27,1,3.0,4.583131
3,4,1,21,1,11.0,1,1,0,28619.0,152.0,203,0,3.0,4.456654
4,5,0,29,1,41.0,1,1,0,27496.0,152.0,39,0,3.0,4.43927


# Trial Approach: 
    Predicting health insurance premium using given (vehicle insurance related) parameters, and further employ that for identifying customer interest for health insurance.

In [5]:
X, y = df.drop(['id', 'Premium_Level', 'Premium_Log', 'Annual_Premium', 'Response'], axis=1), df[['Annual_Premium']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=df['Premium_Level'], random_state=4)

regress_perf = pd.DataFrame(columns=['model', 'rmse', 'r2'])
categoric_perf = pd.DataFrame(columns=['model', 'precision', 'f1', 'roc-auc'])

## TRY 1: full dataset for predicting annual premium

In [6]:
# Define the parameter grid
dt_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# grid search with cross-validation on decision tree
grid_search = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=dt_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
dt1 = grid_search.best_estimator_

# Evaluate the best estimator on the test set
ypred = dt1.predict(Xtest)
score = dt1.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['dt1', rmse, r2]

### skip grid search for rf and xgb

In [None]:
# Define the parameter grid
rf_grid = {
    'n_estimators': [None, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=rf_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
rf1 = grid_search.best_estimator_

# Evaluate the best estimator on the test set
ypred = dt1.predict(Xtest)
score = dt1.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf1', rmse, r2]

In [None]:
# Evaluate the best estimator on the test set
ypred = rf1.predict(Xtest)
score = rf1.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf1', rmse, r2]

dt score: 0.26018211734795027


grid search with ensemble learning is consuming 360+ minutes for training. Not a viable option.


```xgb_grid = {
    'n_estimators': [None, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=XGBRegressor(), param_grid=xgb_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
xgb1 = grid_search.best_estimator_

# Evaluate the best estimator on the test set
ypred = xgb1.predict(Xtest)
score = xgb1.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['xgb1', rmse, r2]
```

### skipped

In [None]:
# Try RandomForestRegressor
rf1 = RandomForestRegressor()
rf1.fit(Xtrain, ytrain)
rf1_score = rf1.score(Xtest, ytest)
print(f'RandomForestRegressor R² score: {rf1_score}')

ypred = rf1.predict(Xtest)
rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf1', rmse, r2]


# Try GradientBoostingRegressor
xgb1 = GradientBoostingRegressor()
xgb1.fit(Xtrain, ytrain)
xgb1_score = xgb1.score(Xtest, ytest)
print(f'GradientBoostingRegressor R² score: {xgb1_score}')

ypred = xgb1.predict(Xtest)
rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['xgb1', rmse, r2]

RandomForestRegressor R² score: 0.10834084706163039
GradientBoostingRegressor R² score: 0.2495598683995286


## TRY 2: full dataset for predicting log10 of premium

In [None]:
X, y = df.drop(['id', 'Premium_Level', 'Premium_Log', 'Annual_Premium', 'Response'], axis=1), df[['Premium_Log']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=df['Premium_Level'], random_state=4)

In [None]:
# Define the parameter grid
dt_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# grid search with cross-validation on decision tree
grid_search = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=dt_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
dt2 = grid_search.best_estimator_

# Evaluate the best estimator on the test set
ypred = dt2.predict(Xtest)
score = dt2.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['dt2', rmse, r2]

dt score: 0.3098752356183905


In [None]:
# Try RandomForestRegressor
rf2 = RandomForestRegressor()
rf2.fit(Xtrain, ytrain)
rf2_score = rf2.score(Xtest, ytest)
print(f'RandomForestRegressor R² score: {rf2_score}')

ypred = rf2.predict(Xtest)
rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf2', rmse, r2]


# Try GradientBoostingRegressor
xgb2 = GradientBoostingRegressor()
xgb2.fit(Xtrain, ytrain)
xgb2_score = xgb2.score(Xtest, ytest)
print(f'GradientBoostingRegressor R² score: {xgb2_score}')

ypred = xgb2.predict(Xtest)
rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['xgb2', rmse, r2]

RandomForestRegressor R² score: 0.1982139141502972
GradientBoostingRegressor R² score: 0.3070919729579141


In [None]:
regress_perf

Unnamed: 0,model,rmse,r2
0,dt1,14960.705996,0.255642
1,rf1,16374.220035,0.108341
2,xgb1,15021.704399,0.24956
3,dt2,0.362759,0.309875
4,rf2,0.391006,0.198214
5,xgb2,0.363489,0.307092


Log10 has improved the model performance.

## TRY 3: dataset excluding anomlies premium

In [None]:
X, y = df2.drop(['id', 'Premium_Level', 'Premium_Log', 'Annual_Premium', 'Response'], axis=1), df2[['Premium_Log']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=df2['Premium_Level'], random_state=4)

In [None]:
# Define the parameter grid
dt_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# grid search with cross-validation on decision tree
grid_search = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=dt_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
dt3 = grid_search.best_estimator_

# Evaluate the best estimator on the test set
ypred = dt3.predict(Xtest)
score = dt3.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['dt3', rmse, r2]

dt score: 0.21753082078076968


In [None]:
# Try RandomForestRegressor
rf3 = RandomForestRegressor()
rf3.fit(Xtrain, ytrain)
rf3_score = rf3.score(Xtest, ytest)
print(f'RandomForestRegressor R² score: {rf3_score}')

ypred = rf3.predict(Xtest)
rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf3', rmse, r2]


# Try GradientBoostingRegressor
xgb3 = GradientBoostingRegressor()
xgb3.fit(Xtrain, ytrain)
xgb3_score = xgb3.score(Xtest, ytest)
print(f'GradientBoostingRegressor R² score: {xgb3_score}')

ypred = xgb3.predict(Xtest)
rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['xgb3', rmse, r2]

RandomForestRegressor R² score: 0.05642608188031806
GradientBoostingRegressor R² score: 0.2175443918623583


In [None]:
regress_perf

Unnamed: 0,model,rmse,r2
0,dt1,14960.705996,0.255642
1,rf1,16374.220035,0.108341
2,xgb1,15021.704399,0.24956
3,dt2,0.362759,0.309875
4,rf2,0.391006,0.198214
5,xgb2,0.363489,0.307092
6,dt3,0.112231,0.217531
7,rf3,0.123245,0.056426
8,xgb3,0.11223,0.217544


The model performance without anomoly data is reduced, indicating that the old models were predicting and performing good moslty the annomly records.

## TRY 4: Predict Premium Level followed by annual premium with predicted premium level

In [None]:
X, y = df2.drop(['id', 'Premium_Level', 'Premium_Log', 'Annual_Premium', 'Response'], axis=1), df2[['Premium_Level']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=y, random_state=4)

In [None]:
# parameter options to fit model
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# Initialize the grid search
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
dtc1 = grid_search.best_estimator_

# evaluate the best estimator on the test set
ypred = dtc1.predict(Xtest)
score = dtc1.score(Xtest, ytest)
yproba = dtc1.predict_proba(Xtest)
print(f'Best decision tree f1 score: {score}')

prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 4) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, yproba, multi_class='ovr'), 4)
categoric_perf.loc[categoric_perf.index.size] = ['dtc1', prec, f1, roc]

Best decision tree f1 score: 0.9183514282702646


In [None]:
# Try Random Forest
rfc1 = RandomForestClassifier()
rfc1.fit(Xtrain, ytrain)
rfc1_score = rfc1.score(Xtest, ytest)
print(f'RandomForestClassifier accuracy score: {rfc1_score}')

# evaluate the best estimator on the test set
ypred = rfc1.predict(Xtest)
yproba = rfc1.predict_proba(Xtest)
prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 3) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, yproba, multi_class='ovr'), 4)
categoric_perf.loc[categoric_perf.index.size] = ['rfc1', prec, f1, roc]


# Try Gradient Boosting
gbc1 = GradientBoostingClassifier()
gbc1.fit(Xtrain, ytrain)
gbc1_score = gbc1.score(Xtest, ytest)
print(f'GradientBoostingClassifier accuracy score: {gbc1_score}')

# evaluate the best estimator on the test set
ypred = gbc1.predict(Xtest)
yproba = gbc1.predict_proba(Xtest)
prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 3) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, yproba, multi_class='ovr'), 4)
categoric_perf.loc[categoric_perf.index.size] = ['gbc1', prec, f1, roc]

RandomForestClassifier accuracy score: 0.945767892906082
GradientBoostingClassifier accuracy score: 0.9573416253821018


In [None]:
categoric_perf

Unnamed: 0,model,precision,f1,roc-auc
0,dtc1,"[0.0122, 0.9587, 0.0761, 0.0058]","[0.0144, 0.9575, 0.0765, 0.0063]",0.5135
1,rfc1,"[0.0133, 0.958, 0.0821, 0.0233]","[0.004, 0.972, 0.042, 0.011]",0.5708
2,gbc1,"[0.0, 0.9574, 0.0, 0.3333]","[0.0, 0.978, 0.0, 0.013]",0.7526


In [None]:
y.value_counts()

Premium_Level
3.0              302750
4.0               11672
2.0                1323
5.0                 487
dtype: int64

    The model can further be impproved with hyperparameter tunning, however given the finite resource and time predicting health insurance premium using vehicle insurance related parameters is put on-hold.

# Vehical Insurance Interest Prediction

In [3]:
df2.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Premium_Level,Premium_Log
0,1,1,44,1,28.0,0,3,1,40454.0,26.0,217,1,3.0,4.606961
1,2,1,76,1,3.0,0,2,0,33536.0,26.0,183,0,3.0,4.525511
2,3,1,47,1,28.0,0,3,1,38294.0,26.0,27,1,3.0,4.583131
3,4,1,21,1,11.0,1,1,0,28619.0,152.0,203,0,3.0,4.456654
4,5,0,29,1,41.0,1,1,0,27496.0,152.0,39,0,3.0,4.43927


In [4]:
# customer interest prediction models performance
interest_perf = pd.DataFrame(columns=['model', 'precision', 'f1', 'roc'])
X, y = df2.drop(columns=['Response']), df2['Response']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, stratify=df2[['Response']], random_state=4)

In [5]:
np.unique(ytest)

array([0, 1], dtype=int64)

In [6]:
lr = LogisticRegression()
lr.fit(Xtrain, ytrain)
ypred = lr.predict(Xtest)

score = lr.score(Xtest, ytest)
ypred = lr.predict(Xtest)
print(f'default lr accuracy: {score}')

prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 4) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, ypred, multi_class='ovr'), 4)
interest_perf.loc[interest_perf.index.size] = ['lr', prec, f1, roc]

default lr accuracy: 0.8791879456733126


In [7]:
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [8]:
# estimate time: atleast 72(=3*4*1*3*2) times more than lr (without grid search)
lr_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['lbfgs'],
    'max_iter': [100, 200, 300],
    'class_weight': [None, 'balanced']
}

grid_search = GridSearchCV(LogisticRegression(), param_grid=lr_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
int_lr = grid_search.best_estimator_

# evaluate the best estimator on the test set
ypred = int_lr.predict(Xtest)
acc = accuracy_score(ytest, ypred)
print(f'best lr accuracy: {acc}')

prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 4) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, ypred, multi_class='ovr'), 4)
interest_perf.loc[interest_perf.index.size] = ['int_lr', prec, f1, roc]

best lr accuracy: 0.6311761822695148


In [9]:
interest_perf

Unnamed: 0,model,precision,f1,roc
0,lr,"[0.8792, 0.0]","[0.9357, 0.0]",0.5
1,int_lr,"[0.9128, 0.1752]","[0.7537, 0.2661]",0.5977


In [10]:
# training decision tree on default parameters
dt = DecisionTreeClassifier()
dt.fit(Xtrain, ytrain)

ypred = dt.predict(Xtest)
acc = accuracy_score(ytest, ypred)
print(f'default decision tree accuracy: {acc}')

# performance table
prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 4) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, ypred, multi_class='ovr'), 4)
interest_perf.loc[interest_perf.index.size] = ['dt', prec, f1, roc]

default decision tree accuracy: 0.8261419513969042


In [11]:
# parameter options to fit model
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# Initialize the grid search
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
int_dt1 = grid_search.best_estimator_

# model accuracy
ypred = int_dt1.predict(Xtest)
acc = accuracy_score(ytest, ypred)
print(f'best decision tree accuracy: {acc}')

# record performance
prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 4) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, ypred, multi_class='ovr'), 4)
interest_perf.loc[interest_perf.index.size] = ['int_dt1', prec, f1, roc]

default decision tree accuracy: 0.8259838411308046


In [12]:
interest_perf

Unnamed: 0,model,precision,f1,roc
0,lr,"[0.8792, 0.0]","[0.9357, 0.0]",0.5
1,int_lr,"[0.9128, 0.1752]","[0.7537, 0.2661]",0.5977
2,dt,"[0.9046, 0.2933]","[0.9007, 0.3021]",0.6042
3,int_dt1,"[0.9047, 0.2935]","[0.9006, 0.3029]",0.6047


In [14]:
# Try Random Forest
rfc = RandomForestClassifier()
rfc.fit(Xtrain, ytrain)
rfc_score = rfc.score(Xtest, ytest)
print(f'default rfc accuracy: {rfc_score}')

# record
ypred = rfc.predict(Xtest)
prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 3) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, ypred, multi_class='ovr'), 4)
interest_perf.loc[interest_perf.index.size] = ['rfc', prec, f1, roc]

default rfc accuracy: 0.8758360080320016


In [15]:
# Try Gradient Boosting
gbc = GradientBoostingClassifier()
gbc.fit(Xtrain, ytrain)
gbc_score = gbc.score(Xtest, ytest)
print(f'default gbc accuracy: {gbc_score}')

# evaluate the best estimator on the test set
ypred = gbc.predict(Xtest)
yproba = gbc.predict_proba(Xtest)
prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 3) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, ypred, multi_class='ovr'), 4)
interest_perf.loc[interest_perf.index.size] = ['gbc', prec, f1, roc]

default gbc accuracy: 0.8791879456733126


In [28]:
interest_perf

Unnamed: 0,model,precision,f1,roc
0,lr,"[0.8792, 0.0]","[0.9357, 0.0]",0.5
1,int_lr,"[0.9128, 0.1752]","[0.7537, 0.2661]",0.5977
2,dt,"[0.9046, 0.2933]","[0.9007, 0.3021]",0.6042
3,int_dt1,"[0.9047, 0.2935]","[0.9006, 0.3029]",0.6047
4,rfc,"[0.8861, 0.4107]","[0.933, 0.133]",0.532
5,rfc,"[0.8865, 0.4281]","[0.933, 0.138]",0.5337
6,gbc,"[0.8793, 0.5]","[0.936, 0.003]",0.5006


In [26]:
# Numerical columns
numerical_cols = ['Age', 'Vintage']

# categorical column 
cat_col = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel']

ss = StandardScaler()
df2[numerical_cols] = ss.fit_transform(df2[numerical_cols])

mm = MinMaxScaler()
df2[['Annual_Premium']] = mm.fit_transform(df2[['Annual_Premium']])

df2.head()


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Premium_Level,Premium_Log
0,1,1,0.360427,1,28.0,0,3,1,0.064329,26.0,0.749751,1,3.0,4.606961
1,2,1,2.411722,1,3.0,0,2,0,0.051376,26.0,0.343434,0,3.0,4.525511
2,3,1,0.552736,1,28.0,0,3,1,0.060285,26.0,-1.52084,1,3.0,4.583131
3,4,1,-1.113941,1,11.0,1,1,0,0.042169,152.0,0.582444,0,3.0,4.456654
4,5,0,-0.601117,1,41.0,1,1,0,0.040066,152.0,-1.377434,0,3.0,4.43927


In [31]:
X, y = df2.drop(columns=['Response']), df2['Response']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, stratify=df2[['Response']], random_state=4)

In [37]:
# training decision tree on default parameters
d2 = DecisionTreeClassifier()
d2.fit(Xtrain, ytrain)

ypred = d2.predict(Xtest)
acc = accuracy_score(ytest, ypred)
print(f'default decision tree accuracy: {acc}')

# performance table
prec = [round(scr, 4) for scr in precision_score(ytest, ypred, average=None)]
f1 = [round(scr, 4) for scr in f1_score(ytest, ypred, average=None)]
roc = round(roc_auc_score(ytest, ypred, multi_class='ovr'), 4)
interest_perf.loc[interest_perf.index.size] = ['d2', prec, f1, roc]

default decision tree accuracy: 0.8275333217385805


In [1]:
DecisionTreeClassifier()

NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
cat_cols

In [42]:
from catboost import CatBoostClassifier

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.22, random_state = 22, stratify = y, shuffle = True)
cat_cols = [0, 2, 3, 4, 5, 6, 8]

modelC = CatBoostClassifier()
modelC = modelC.fit(Xtrain, ytrain, eval_set = (Xtest, ytest), early_stopping_rounds = 10, verbose = 100)

predictions = [pred[1] for pred in modelC.predict_proba(Xtest)]
print('Validation ROC AUC Score:', roc_auc_score(ytest, predictions, average = 'weighted'))

Learning rate set to 0.123526
0:	learn: 0.4893159	test: 0.4896925	best: 0.4896925 (0)	total: 220ms	remaining: 3m 39s
100:	learn: 0.2561352	test: 0.2579724	best: 0.2579724 (100)	total: 7.17s	remaining: 1m 3s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.2576436358
bestIteration = 148

Shrink model to first 149 iterations.
Validation ROC AUC Score: 0.8649294102986619


In [58]:
precision_score(ytest, ypred)

0.5358490566037736

In [54]:
conf_matrix = confusion_matrix(ytest, ypred)


array([[61044,   123],
       [ 8263,   142]], dtype=int64)

In [57]:
tn, fp, fn, tp = conf_matrix.ravel()
ffr = fp / (fp+tn)
ffr

0.002010888224042376

In [None]:
df2['Response'].value_counts()

0    278027
1     38205
Name: Response, dtype: int64

In [53]:
precision_recall_fscore_support(ytest, ypred)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94     55547
           1       0.57      0.00      0.00      7700

    accuracy                           0.88     63247
   macro avg       0.72      0.50      0.47     63247
weighted avg       0.84      0.88      0.82     63247

