In [1]:
# import packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
import numpy as np
import pandas as pd
import os
import sys
from sklearn.model_selection import train_test_split

In [2]:
# create data path
DATA_PATH = os.path.abspath(os.path.join(os.getcwd(),os.path.pardir,os.path.pardir,'data','interim'))
RAW_DATA_PATH = os.path.abspath(os.path.join(os.getcwd(),os.path.pardir,os.path.pardir,'data','raw'))

In [3]:
# read dataframe
val_original = pd.read_csv(os.path.join(RAW_DATA_PATH,'val_set.csv'))
df_train = pd.read_csv(os.path.join(DATA_PATH, 'train_set.csv'))
df_test = pd.read_csv(os.path.join(DATA_PATH, 'test_set.csv'))
df_validation = pd.read_csv(os.path.join(DATA_PATH, 'val_set.csv')) # just for bakcup

In [4]:
# function to fit model and show f1 score
def fit_model(df,model, model_name):
    print(model_name)
    print('-'*40)
    df=df.drop(['Unnamed: 0'],axis=1)
    X = df.drop('Claim',axis=1)
    y = df['Claim']
    crs_val_res = cross_validate(model,X,y,scoring =  'f1_macro',cv=5)
    print(crs_val_res)
    crs_val_prc = crs_val_res['test_score'].mean()
    crs_val_prc = crs_val_res['train_score'].mean()
    print('test_score:\t',crs_val_prc)
    print('train_score:\t',crs_val_prc)
    model.fit(X,y)
    return model

In [5]:
# X,y split of validation and test data
X_val = df_validation.drop(['Claim','Unnamed: 0'],axis=1)
y_val = df_validation['Claim']
X_test = df_test.drop(['Claim','Unnamed: 0'],axis=1)
y_test = df_test['Claim']

In [6]:
# record initial time
import time
start = time.time()

In [7]:
# apply Logistic Regression
model = LogisticRegression()
model_name = 'LogisticRegression'
model = fit_model(df_train,model, model_name)
y_pred =  model.predict(X_val)
print(f1_score(y_val,y_pred,average='macro'))

LogisticRegression
----------------------------------------




{'fit_time': array([0.43336153, 0.46822023, 0.55544162, 0.46956229, 0.48122525]), 'score_time': array([0.00210786, 0.00568795, 0.00605774, 0.01008916, 0.        ]), 'test_score': array([0.77242992, 0.76254966, 0.77261012, 0.77050442, 0.7646143 ]), 'train_score': array([0.76737548, 0.77194421, 0.76815706, 0.76825105, 0.77100212])}
test_score:	 0.7693459854784216
train_score:	 0.7693459854784216




0.5181717719831844


In [8]:
# record end time
end = time.time()
print('time:taken:',end-start)

time:taken: 3.3408000469207764


In [9]:
# record start time
start = time.time()

In [10]:
# apply decision tree
model = DecisionTreeClassifier()
model_name = 'DecisionTreeClassifier'
model = fit_model(df_train,model, model_name)
y_pred =  model.predict(X_val)
print(f1_score(y_val,y_pred,average='macro'))

DecisionTreeClassifier
----------------------------------------
{'fit_time': array([0.44214463, 0.47784948, 0.53126669, 0.51266646, 0.49498177]), 'score_time': array([0.01004529, 0.00955844, 0.009974  , 0.00794506, 0.00897932]), 'test_score': array([0.93978008, 0.97029968, 0.97002388, 0.96998741, 0.96988803]), 'train_score': array([0.99997496, 0.99864657, 0.99864657, 0.99852114, 0.99867168])}
test_score:	 0.9988921851629767
train_score:	 0.9988921851629767




0.4810836854891075


In [11]:
# record end time
end = time.time()
print('time:taken:',end-start)

time:taken: 3.44522762298584


In [12]:
# record start time
start = time.time()

In [13]:
# apply random forest
model = RandomForestClassifier()
model_name = 'RandomForestClassifier'
model = fit_model(df_train,model, model_name)
y_pred =  model.predict(X_val)
print(f1_score(y_val,y_pred,average='macro'))

RandomForestClassifier
----------------------------------------




{'fit_time': array([0.57105446, 0.61985135, 0.6071806 , 0.5789001 , 0.58399725]), 'score_time': array([0.03085184, 0.02871442, 0.01555538, 0.03123665, 0.02592897]), 'test_score': array([0.93667531, 0.98141711, 0.97769841, 0.97999635, 0.97759788]), 'train_score': array([0.99922356, 0.9964132 , 0.99671552, 0.99646412, 0.99636447])}
test_score:	 0.9970361752084772
train_score:	 0.9970361752084772




0.5362929061784897


In [14]:
# record end time
end = time.time()
print('time:taken:',end-start)

time:taken: 4.523570537567139


## Observation
1. In logistic regression, decision tree and random forest the f1 score of train set is greater than f1 score of test set. These are classic cases of overfitting.
3. We must try another linear model like linear svc just to compare with logistic regression.

In [15]:
# apply linear SVC
model = LinearSVC()
model_name = 'LinearSVC'
model = fit_model(df_train,model, model_name)
y_pred =  model.predict(X_val)
print(f1_score(y_val,y_pred,average='macro'))

LinearSVC
----------------------------------------




{'fit_time': array([4.52055383, 4.39569139, 4.5003407 , 4.29217005, 4.30210328]), 'score_time': array([0.00598717, 0.        , 0.        , 0.00502658, 0.00601935]), 'test_score': array([0.67138817, 0.73590333, 0.74438116, 0.66236139, 0.54271927]), 'train_score': array([0.66206738, 0.74546277, 0.73946741, 0.6635446 , 0.54543945])}
test_score:	 0.6711963211851214
train_score:	 0.6711963211851214
0.5028539567351153




## Observation
1. After comparing Logistic regression and linear svc, we conclude that logistic regression is best model as diffrence between training and validation score is lesser.
2. To undestand more about the behaviour of models lets analyze the errors made by Logistic regression and Decision trees

In [16]:
# apply Logistic regression
model = LogisticRegression()
model_name = 'LogisticRegression'
model = fit_model(df_train,model, model_name)
y_pred =  model.predict(X_val)
print(f1_score(y_val,y_pred,average='macro'))

# findout probability of each val set
probas = model.predict_proba(X_val)



LogisticRegression
----------------------------------------




{'fit_time': array([0.41970992, 0.45346999, 0.55235577, 0.50908971, 0.5136261 ]), 'score_time': array([0.        , 0.01560736, 0.0156498 , 0.0059886 , 0.00498676]), 'test_score': array([0.77242992, 0.76254966, 0.77261012, 0.77050442, 0.7646143 ]), 'train_score': array([0.76737548, 0.77194421, 0.76815706, 0.76825105, 0.77100212])}
test_score:	 0.7693459854784216
train_score:	 0.7693459854784216




0.5181717719831844


In [17]:
# split probability of claim=0 and claim=1
l1 = []
l2 = []
for a in list(probas):
    l1.append(a[0])
    l2.append(a[1])

In [18]:
# classes considered by Logistic regression
model.classes_

array([0, 1], dtype=int64)

In [19]:
# make copy of validation data
y_val_new = y_val.copy()

In [20]:
# prepare a dtaframe with all info about errors to analyze
y_val = pd.DataFrame(y_val)
y_val.head()

Unnamed: 0,Claim
0,0
1,0
2,0
3,0
4,0


In [21]:
# add predicted data to y_val
y_val['pred_y'] = pd.Series(y_pred)

In [22]:
# add probabilities of each row to y_val
y_val['0_proba'] = pd.Series(l1)
y_val['1_proba'] = pd.Series(l2)

In [23]:
# error records 
y_err = y_val[y_val['Claim']!=y_val['pred_y']]

In [24]:
y_err.Claim.value_counts()

0    1746
1      68
Name: Claim, dtype: int64

In [25]:
# split error when claim ==1 and error when claime == 0 and merge with validation Xs
err_1 = y_err[y_err['Claim']==1].reset_index()
err_1 = err_1.merge(val_original.reset_index(),on='index')
err_1.to_csv('err_1.csv')

err_0 = y_err[y_err['Claim']==0].reset_index()
err_0 = err_0.merge(val_original.reset_index(),on='index')
err_0.to_csv('err_0.csv')

# Error analysis of Deciosion trees

In [26]:
# apply decision tree
model = DecisionTreeClassifier()
model_name = 'DecisionTreeClassifier'
model = fit_model(df_train,model, model_name)
y_pred =  model.predict(X_val)
print(f1_score(y_val_new,y_pred,average='macro'))

# create analysis dataframe for analyzing the errors
probas = model.predict_proba(X_val)
l1 = []
l2 = []
for a in list(probas):
    l1.append(a[0])
    l2.append(a[1])

print(model.classes_)

y_val_tree = y_val_new.copy()
y_val_tree = pd.DataFrame(y_val_tree)
y_val_tree['pred_y'] = pd.Series(y_pred)
y_val_tree['0_proba'] = pd.Series(l1)
y_val_tree['1_proba'] = pd.Series(l2)
# error records 
y_err_tree = y_val_tree[y_val_tree['Claim']!=y_val_tree['pred_y']]
print(y_err.Claim.value_counts())
err_1 = y_err_tree[y_err_tree['Claim']==1].reset_index()
err_1 = err_1.merge(val_original.reset_index(),on='index')
err_1.to_csv('err_tree_1.csv')
err_0 = y_err[y_err['Claim']==0].reset_index()
err_0 = err_0.merge(val_original.reset_index(),on='index')
err_0.to_csv('err_tree_0.csv')

DecisionTreeClassifier
----------------------------------------
{'fit_time': array([0.42639589, 0.46775079, 0.4989078 , 0.48685241, 0.48425794]), 'score_time': array([0.0089767 , 0.0089736 , 0.01562071, 0.01563263, 0.        ]), 'test_score': array([0.93952906, 0.96960241, 0.96972315, 0.97008907, 0.96958014]), 'train_score': array([0.99997496, 0.99864657, 0.99864657, 0.99852114, 0.99867168])}
test_score:	 0.9988921851629767
train_score:	 0.9988921851629767




0.4832499074558902
[0 1]
0    1746
1      68
Name: Claim, dtype: int64


# common errors made by both models

In [27]:
y_err_tree.reset_index(inplace=True)

In [28]:
y_err_tree.head()

Unnamed: 0,index,Claim,pred_y,0_proba,1_proba
0,6,0,1,0.0,1.0
1,22,0,1,0.0,1.0
2,31,0,1,0.0,1.0
3,34,0,1,0.0,1.0
4,38,1,0,1.0,0.0


In [29]:
y_err.reset_index(inplace = True)
y_err.head()

Unnamed: 0,index,Claim,pred_y,0_proba,1_proba
0,12,0,1,0.447287,0.552713
1,34,0,1,0.184854,0.815146
2,38,1,0,0.82515,0.17485
3,48,0,1,0.40287,0.59713
4,49,0,1,0.041933,0.958067


In [30]:
common_errs = y_err_tree.join(y_err, on='index',lsuffix='tree_',how='inner')

In [31]:
print('common errors:',len(common_errs))
print('tree errors:',len(y_err_tree))
print('lr errors:',len(y_err))

common errors: 297
tree errors: 1955
lr errors: 1814


**Insights**
1. only 219 errors are common.


## lets tune both linear regression and decision tree. since most of their errors are different from one another we can have a voting classifier

# tuning of Logistic regression

In [32]:
# function to split dataframe into X and y
def prepare_xy(df):
    df=df.drop(['Unnamed: 0'],axis=1)
    X = df.drop('Claim',axis=1)
    y = df['Claim']
    return X,y

In [33]:
X,y = prepare_xy(df_train)

In [34]:
# turne Logistic Regression
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'C': [1.0,0.7,0.4,0.2], 'class_weight': [None,'balanced']}
]
model = LogisticRegression()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X,y)





GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'C': [1.0, 0.7, 0.4, 0.2], 'class_weight': [None, 'balanced']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0)

In [35]:
grid_search.best_params_

{'C': 0.4, 'class_weight': 'balanced'}

In [36]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

0.7685417887984651 {'C': 1.0, 'class_weight': None}
0.7768625784161537 {'C': 1.0, 'class_weight': 'balanced'}
0.768375902299145 {'C': 0.7, 'class_weight': None}
0.7767660354694212 {'C': 0.7, 'class_weight': 'balanced'}
0.7681542380182959 {'C': 0.4, 'class_weight': None}
0.777234914506086 {'C': 0.4, 'class_weight': 'balanced'}
0.7666955739196789 {'C': 0.2, 'class_weight': None}
0.777115321159856 {'C': 0.2, 'class_weight': 'balanced'}


**Insights** since C is on border lets try grid search cv more

In [37]:
param_grid = [
    {'C': [0.27,0.25,0.2,0.1], 'class_weight': ['balanced']}
]
model = LogisticRegression()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X,y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'C': [0.27, 0.25, 0.2, 0.1], 'class_weight': ['balanced']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0)

In [38]:
grid_search.best_params_

{'C': 0.25, 'class_weight': 'balanced'}

In [39]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

0.7769334717323773 {'C': 0.27, 'class_weight': 'balanced'}
0.7771922791265806 {'C': 0.25, 'class_weight': 'balanced'}
0.777115321159856 {'C': 0.2, 'class_weight': 'balanced'}
0.7760622812006148 {'C': 0.1, 'class_weight': 'balanced'}


**Insights**
1. Final lr model has C=0.2 and class_weight = 'balanced'

In [40]:
model = LogisticRegression(C = 0.25, class_weight = 'balanced')
model.fit(X,y)
y_pred = model.predict(X_val)
print(f1_score(y_val_new,y_pred,average='macro'))



0.4947151529647556


In [41]:
# analyse error of tuned model

probas = model.predict_proba(X_val)

l1 = []
l2 = []
for a in list(probas):
    l1.append(a[0])
    l2.append(a[1])

print(model.classes_)

y_val_lr = y_val_new.copy()
y_val_lr = pd.DataFrame(y_val_tree)
y_val_lr['pred_y'] = pd.Series(y_pred)
y_val_lr['0_proba'] = pd.Series(l1)
y_val_lr['1_proba'] = pd.Series(l2)
# error records 
y_err_lr = y_val_lr[y_val_lr['Claim']!=y_val_lr['pred_y']]
print(y_err_lr.Claim.value_counts())
err_1 = y_err_lr[y_err_lr['Claim']==1].reset_index()
err_1 = err_1.merge(val_original.reset_index(),on='index')
err_1.to_csv('err_lr_1.csv')
err_0 = y_err_lr[y_err_lr['Claim']==0].reset_index()
err_0 = err_0.merge(val_original.reset_index(),on='index')
err_0.to_csv('err_lr_0.csv')

[0 1]
0    2443
1      47
Name: Claim, dtype: int64


In [42]:
y_err_lr.reset_index(inplace = True)
common_errs = y_err_lr.join(y_err, on='index',lsuffix='lr_',how='inner')
print('common errors:',len(common_errs))
print('lr errors:',len(y_err_lr))
print('lr untuned errors:',len(y_err))

common errors: 374
lr errors: 2490
lr untuned errors: 1814


After tuning lr is making new kinds of errors while only 232 errors are same

# Hyper parameter tuning decision tree

In [43]:
# findout criterion first
param_grid = [
    {'criterion':['gini','entropy']}
]
model = DecisionTreeClassifier()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'criterion': ['gini', 'entropy']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0)

In [44]:
grid_search.best_params_

{'criterion': 'entropy'}

In [45]:
param_grid = [
    {'max_depth':[5,7],'min_samples_split':[50,25,15]}
]
model = DecisionTreeClassifier()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'max_depth': [5, 7], 'min_samples_split': [50, 25, 15]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0)

In [46]:
grid_search.best_params_

{'max_depth': 7, 'min_samples_split': 15}

In [47]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(mean_score, params)

0.8310361758231846 {'max_depth': 5, 'min_samples_split': 50}
0.8311067022207412 {'max_depth': 5, 'min_samples_split': 25}
0.8311067022207412 {'max_depth': 5, 'min_samples_split': 15}
0.8476135060970558 {'max_depth': 7, 'min_samples_split': 50}
0.8479186595140831 {'max_depth': 7, 'min_samples_split': 25}
0.8479947397252188 {'max_depth': 7, 'min_samples_split': 15}


In [48]:
model = DecisionTreeClassifier(max_depth=7,min_samples_split=15,criterion='entropy')
model.fit(X,y)
y_pred = model.predict(X_val)
print(f1_score(y_val_new,y_pred,average='macro'))

0.5326088717589601


In [58]:
probas = model.predict_proba(X_val)

l1 = []
l2 = []
for a in list(probas):
    l1.append(a[0])
    l2.append(a[1])

print(model.classes_)

y_val_rtuned = y_val_new.copy()
y_val_rtuned = pd.DataFrame(y_val_rtuned)
y_val_rtuned['pred_y'] = pd.Series(y_pred)
y_val_rtuned['0_proba'] = pd.Series(l1)
y_val_rtuned['1_proba'] = pd.Series(l2)
# error records 
y_err_tuned = y_val_rtuned[y_val_rtuned['Claim']!=y_val_rtuned['pred_y']]
print(y_err_tuned.Claim.value_counts())
err_1 = y_err_tuned[y_err_tuned['Claim']==1].reset_index()
err_1 = err_1.merge(val_original.reset_index(),on='index')
err_1.to_csv('err_dtree_tuned_1.csv')
err_0 = y_err_tuned[y_err_tuned['Claim']==0].reset_index()
err_0 = err_0.merge(val_original.reset_index(),on='index')
err_0.to_csv('err_dtree_tuned_0.csv')

[0 1]
0    572
1    135
Name: Claim, dtype: int64


In [50]:
y_err_tuned.reset_index(inplace = True)
common_errs = y_err_tuned.join(y_err_tree, on='index',lsuffix='lr_',how='inner')
print('common errors:',len(common_errs))
print('dtree errors:',len(y_err_tree))
print('dtree tuned errors:',len(y_err_tuned))

common errors: 226
dtree errors: 1955
dtree tuned errors: 1476


## Analysis:
After analysing the errors since all 4 models are making almost different types of errors, we can build a voting classifier for that

In [51]:
from sklearn.ensemble import VotingClassifier

In [52]:
lr = LogisticRegression()
lr_tuned = LogisticRegression(C = 0.25, class_weight = 'balanced')
dtree = DecisionTreeClassifier()
dtree_tuned = DecisionTreeClassifier(max_depth=7,min_samples_split=15,criterion='entropy')

In [53]:
voting_clf = VotingClassifier(
    estimators =[('lr',lr),('lr_tuned',lr_tuned),('dtree',dtree),('dtree_tuned',dtree_tuned)],
    voting = 'hard'
)

In [54]:
voting_clf.fit(X,y)



VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)), ('lr_tuned', Lo...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [55]:
y_pred = voting_clf.predict(X_val)
print(f1_score(y_val_new,y_pred,average='macro'))

0.5337293065151496


In [56]:
# After trying whole lot of things looks like random forest regressor is best estmator

In [66]:
final_model = RandomForestClassifier()
final_model.fit(X,y)
y_pred = final_model.predict(X_test)
print(f1_score(y_test,y_pred,average='macro'))



0.5508698971900439
