In [26]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder


In [27]:
# Load the training set from the CSV file at the relative path 'data/train.csv'.
data = pd.read_csv('data/train.csv')

In [28]:
# Keep only rows whose Duration is non-negative: Duration is an important
# feature, so rows with a negative value are treated as bad data and removed.
data = data.loc[data['Duration'].ge(0)]

In [29]:
# Features: every column except the first and the last one.
# Target: the 'Claim' column.
X = data.iloc[:, 1:-1]
y = data['Claim']

In [30]:
# 'Distribution Channel' and 'Agency Type' are removed because their
# feature importance is low.
X = X.drop(columns=['Distribution Channel', 'Agency Type'])

In [31]:
le = LabelEncoder()


def label_encode(col_name, df):
    """Replace ``df[col_name]`` with integer codes produced by the shared ``le``.

    Parameters
    ----------
    col_name : label of the column to encode.
    df : DataFrame holding that column; the column is overwritten in place.

    Returns
    -------
    None

    Notes
    -----
    ``le`` is re-fitted on every call, so after several calls it only holds
    the classes of the column encoded last.
    """
    # Use the parameter here. The previous body read the notebook-level loop
    # variable `col`, which only worked because the caller's loop used that name.
    df[col_name] = le.fit_transform(df[col_name])


for col in ['Agency', 'Product Name', 'Destination']:
    label_encode(col, X)
X.head()

Unnamed: 0,Agency,Product Name,Duration,Destination,Net Sales,Commision (in value),Age
0,7,10,61,68,12.0,0.0,41
1,7,10,4,53,17.0,0.0,35
2,6,16,26,84,19.8,11.88,47
3,7,1,15,33,27.0,0.0,48
4,7,1,15,53,37.0,0.0,36


In [32]:
# The target is imbalanced, so the minority class is oversampled with SMOTE.
# Class counts noted when this was written: 0 >> 43590 and 1 >> 8720.

# `fit_resample` is the supported entry point; the older `fit_sample` alias was
# deprecated and later removed from imbalanced-learn. A fixed seed makes the
# synthetic samples reproducible across runs.
smt = SMOTE(random_state=43)
X_new, y_new = smt.fit_resample(X, y)
# Re-wrap the result so it is a DataFrame / Series carrying the original
# column names regardless of what container the resampler returned.
X_new = pd.DataFrame(data=X_new, columns=list(X))
y_new = pd.Series(data=y_new)

In [33]:
y_new.value_counts()

1    43586
0    43586
Name: Claim, dtype: int64

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, recall_score,precision_score
from sklearn.model_selection import train_test_split as tts,GridSearchCV

In [35]:
# Hold out 25% of the encoded (not resampled) X / y as a test set; the seed
# is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.25,
    random_state=43,
)

In [36]:
# Random forest with balanced class weights and a depth cap of 58.
# Only the non-default settings are spelled out: the previous call was a pasted
# estimator repr that also passed `min_impurity_split` and `max_features='auto'`,
# which newer scikit-learn releases no longer accept. Leaving `max_features`
# unset keeps the same square-root rule for classifiers.
# A fixed `random_state` makes the fitted forest reproducible.
rfc = RandomForestClassifier(
    class_weight='balanced',
    criterion='gini',
    max_depth=58,
    n_estimators=100,
    n_jobs=10,
    random_state=43,
)

In [61]:
# Fit the forest on the training split; the fitted estimator's repr is the cell output.
# NOTE(review): the recorded output shows max_depth=76 although `rfc` is built
# with max_depth=58 above. This cell (In [61]) ran after the In [60] cell at the
# bottom of the notebook, which rebinds `rfc` to the grid-search winner, so the
# notebook does not reproduce these results when run top to bottom.
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=76, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=10, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [62]:
# Importance scores of the fitted forest, one value per input feature.
scores = rfc.feature_importances_

In [63]:
# Pair every feature name with its importance score and show the table
# ordered from the most to the least important feature.
feature_imp = pd.DataFrame(
    {
        'features': X.columns.tolist(),
        'score': scores,
    }
)
feature_imp.sort_values(by='score', ascending=False)

Unnamed: 0,features,score
4,Net Sales,0.219939
2,Duration,0.193089
5,Commision (in value),0.151903
0,Agency,0.130318
3,Destination,0.126628
6,Age,0.124055
1,Product Name,0.054069


In [64]:
# Predict the held-out rows and report the plain accuracy on them.
y_pred = rfc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9377533073334863

In [65]:
# Confusion matrix on the test split: rows are true classes, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[10589,   340],
       [  474,  1674]])

In [66]:
# Per-class precision / recall / F1 on the test split, printed as plain text.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.96      0.97      0.96     10929
           1       0.83      0.78      0.80      2148

    accuracy                           0.94     13077
   macro avg       0.89      0.87      0.88     13077
weighted avg       0.94      0.94      0.94     13077



In [67]:
# Display the estimator currently bound to `rfc` (its repr lists the hyper-parameters).
rfc

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=76, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=10, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [68]:
# Hyper-parameter grid for GridSearchCV: two class-weighting modes, two split
# criteria, and max_depth candidates 70, 72, 74, 76 and 78.
params = {
    'class_weight': ['balanced', 'balanced_subsample'],
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(70, 80, 2),
}

In [57]:
# Grid search over `params` with 10-fold cross-validation, starting from an
# otherwise default random forest (n_jobs=10).
# NOTE(review): the search is fitted on the SMOTE-resampled X_new / y_new, so
# synthetic samples can land in the validation folds and the reported CV score
# is likely optimistic — consider resampling inside each fold instead.
rfc_cv = GridSearchCV(
    RandomForestClassifier(n_jobs=10),
    params,
    cv=10,
)
rfc_cv.fit(X_new, y_new)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=10,
                                              oob_score=False,
                                              random

In [58]:
# Show the estimator the grid search selected as best.
rfc_cv.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=76, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=10, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [59]:
# Cross-validated score of the selected parameter combination. No `scoring`
# argument was passed to GridSearchCV, so the estimator's default scorer is used.
rfc_cv.best_score_

0.9671572909864393

In [60]:
# Rebind `rfc` to the grid-search winner.
# NOTE(review): this cell is In [60]; the fit / evaluation cells numbered
# In [61]-In [67] appear earlier in the notebook but were executed after it, so
# they operate on this estimator. Move this cell (and the search) above them
# so a top-to-bottom run gives the same results.
rfc = rfc_cv.best_estimator_