In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import time

In [2]:
ls

 dummy_append_test.csv                   sample_submission.csv
 dummy_append_train.csv                  test.csv
'Janatahack Cross-sell Prediction.pdf'   train.csv
 Janta_Hack_Cross_Sell.ipynb


In [3]:
train       = pd.read_csv("train.csv")

In [4]:
train.shape

(381109, 12)

In [5]:
train.dtypes

id                        int64
Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [7]:
train["Previously_Insured"].unique()

array([0, 1])

In [8]:
train["Region_Code"].nunique()

53

In [9]:
train["Vehicle_Age"].nunique()

3

In [10]:
train["Policy_Sales_Channel"].nunique()

155

In [11]:
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [12]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
lb_columns = ["Gender", "Vehicle_Damage"]
for col in lb_columns:
  enc        = LabelEncoder() 
  train[col] = enc.fit_transform(train[col])

In [13]:
## Label Encoding "Vehicle_Age" feature
dictionary = {"< 1 Year":1,"1-2 Year":2,"> 2 Years":3}
train["Vehicle_Age"] = train["Vehicle_Age"].map(dictionary.get)

In [14]:
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,1,44,1,28.0,0,3,1,40454.0,26.0,217,1
1,2,1,76,1,3.0,0,2,0,33536.0,26.0,183,0
2,3,1,47,1,28.0,0,3,1,38294.0,26.0,27,1
3,4,1,21,1,11.0,1,1,0,28619.0,152.0,203,0
4,5,0,29,1,41.0,1,1,0,27496.0,152.0,39,0


In [15]:
#One hot encoding
ohe_columns = ['Region_Code','Policy_Sales_Channel']

train = pd.get_dummies(train, columns = ohe_columns, drop_first=True)

In [16]:
train.shape

(381109, 216)

In [17]:
# Heuristic approach to create new features by multiplying
# existing columns. This way, you can introduce non-linearity to your data set.
from itertools import combinations
# Should include numerical and One Hot Encoded columns. 
# Be careful before including label encoded columns.
used_cols      = ["Gender", "Age", "Driving_License", "Previously_Insured", 
                  "Vehicle_Damage", "Annual_Premium", "Vintage"]

df             = train.copy()
cc             = list(combinations(used_cols,2))
df_new         = pd.concat([df[c[1]].multiply(df[c[0]]) for c in cc], axis=1, keys=cc)
df_new.columns = df_new.columns.map('_'.join)
df             = pd.concat((df, df_new), axis=1)
train          = df.copy()
del [[df, df_new]]

In [18]:
train.shape

(381109, 237)

In [19]:
train.isnull().sum().sum()

0

In [20]:
# Chi square test for feature selection
from sklearn.feature_selection import chi2

# Label Encoding Needed for chi2 module to work properly
lb_df    = train.copy()   # Copy is important

X = lb_df.drop(['id','Response'],axis=1)
y = lb_df['Response']

chi_scores = chi2(X,y)
p_val      = pd.Series(chi_scores[1],index = X.columns)

# Dependent features according to Chi-Squared Test
# we are using 95% confidence interval for this test
#print("Important fetures are:")
#print("-----------------------------------")
imp_features   = []
unimp_features = []
for col in X.columns:
  if p_val.loc[col] <= 0.05:
    #print(col, "p value:", round(p_val.loc[col],3))
    imp_features.append(col)
  else:
    unimp_features.append(col)

In [21]:
len(unimp_features)

124

In [22]:
train = train.drop(unimp_features, axis =1)

In [23]:
train.shape

(381109, 113)

In [24]:
# Highly imbalanced
train["Response"].value_counts(normalize=True)

0    0.877437
1    0.122563
Name: Response, dtype: float64

In [25]:
# Definig the features and target
X = train.drop(["id", "Response"], axis =1)
y = train["Response"]

In [26]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)

In [27]:
# Scaling The Features
scaler         = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_scaled = y_train.copy()

X_test_scaled  = scaler.transform(X_test)
y_test_scaled  = y_test.copy()

In [28]:
X_smt, y_smt = X_train_scaled.copy(), y_train_scaled.copy()

In [29]:
# SMOTEEN
from imblearn.combine import SMOTEENN
smt          = SMOTEENN(sampling_strategy= 0.5)
X_smt, y_smt = smt.fit_resample(X_train_scaled, y_train_scaled)

y_smt.value_counts(normalize=True)*100

0    71.124024
1    28.875976
Name: Response, dtype: float64

In [30]:
# All Classification Models After SMOTEEN
from sklearn.metrics import roc_auc_score
classifiers = [GaussianNB(), MultinomialNB(),
               SGDClassifier(), KNeighborsClassifier(), DecisionTreeClassifier(),
               RandomForestClassifier(), GradientBoostingClassifier(),
               LGBMClassifier(), XGBClassifier()]

for clf in classifiers:
  print("Solving Model:", str(clf)[:-2], end = ' ')
  start  = time.time()
  model  = clf
  model.fit(X_smt, y_smt)
  y_pred = model.predict(X_test_scaled)
  print("%26s %10s %6.3f"  %(str(clf)[:-2],"AUROC:",roc_auc_score(y_test_scaled,y_pred)) )

  end    = time.time()
  print("Execution Time:",int(end - start), " sec")

Solving Model: GaussianNB                 GaussianNB     AUROC:  0.781
Execution Time: 0  sec
Solving Model: MultinomialNB              MultinomialNB     AUROC:  0.785
Execution Time: 0  sec
Solving Model: SGDClassifier              SGDClassifier     AUROC:  0.787
Execution Time: 1  sec
Solving Model: KNeighborsClassifier       KNeighborsClassifier     AUROC:  0.730
Execution Time: 38  sec
Solving Model: DecisionTreeClassifier     DecisionTreeClassifier     AUROC:  0.676
Execution Time: 9  sec
Solving Model: RandomForestClassifier     RandomForestClassifier     AUROC:  0.724
Execution Time: 62  sec
Solving Model: GradientBoostingClassifier GradientBoostingClassifier     AUROC:  0.779
Execution Time: 245  sec
Solving Model: LGBMClassifier             LGBMClassifier     AUROC:  0.762
Execution Time: 1  sec
Solving Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stop

In [None]:
# Final Model Before Tuning
clf = RandomForestClassifier()
clf.fit(X_smt, y_smt)

submission = pd.read_csv('sample_submission.csv')
final_predictions = clf.predict(X_test_scaled)
submission['Response'] = final_predictions
submission.to_csv('my_submission.csv', index=False)

In [None]:
clf.get_params()

In [None]:
# Random Forest Random Search
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

rs_params = {'max_depth':list(np.arange(10, 100, step=10)) + [None],
              'n_estimators':np.arange(10, 500, step=50),
              'max_features':randint(1,7),
              'criterion':['gini','entropy'],
              'min_samples_leaf':randint(1,4),
              'min_samples_split':np.arange(2, 10, step=2)
          }
rs_cv = RandomizedSearchCV(RandomForestClassifier(), rs_params,
                           cv= 5)

# Train on training data or SMOTEEN
rs_cv.fit(X_smt, y_smt)

# Print the best parameters
print(rs_cv.best_params_)

In [None]:
# Final Model

submission = pd.read_csv('sample_submission.csv')
final_predictions = rs_cv.predict(X_test_scaled)

submission['is_claim'] = final_predictions
submission.to_csv('my_submission.csv', index=False)