In [1]:
from google.colab import files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import time

In [2]:
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv
Saving sample_submission.csv to sample_submission.csv


In [None]:
ls

[0m[01;34msample_data[0m/  sample_submission.csv  test.csv  train.csv


In [None]:
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

In [None]:
train.shape, test.shape

((58592, 44), (39063, 43))

In [None]:
train.dtypes

policy_id                            object
policy_tenure                       float64
age_of_car                          float64
age_of_policyholder                 float64
area_cluster                         object
population_density                    int64
make                                  int64
segment                              object
model                                object
fuel_type                            object
max_torque                           object
max_power                            object
engine_type                          object
airbags                               int64
is_esc                               object
is_adjustable_steering               object
is_tpms                              object
is_parking_sensors                   object
is_parking_camera                    object
rear_brakes_type                     object
displacement                          int64
cylinder                              int64
transmission_type               

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Expanding "max_torque" feature
train["torque"] = train["max_torque"].str.split("Nm@|rpm", expand=True)[0].astype(float)
train["rpm"]    = train["max_torque"].str.split("Nm@|rpm", expand=True)[1].astype(float)

test["torque"] = test["max_torque"].str.split("Nm@|rpm", expand=True)[0].astype(float)
test["rpm"]    = test["max_torque"].str.split("Nm@|rpm", expand=True)[1].astype(float)

# Expanding "max_power" feature
train["power"] = train["max_power"].str.split("bhp@|rpm", expand=True)[0].astype(float)
train["rpm_2"] = train["max_power"].str.split("bhp@|rpm", expand=True)[1].astype(float)

test["power"] = test["max_power"].str.split("bhp@|rpm", expand=True)[0].astype(float)
test["rpm_2"] = test["max_power"].str.split("bhp@|rpm", expand=True)[1].astype(float)

# New Features
train["age_complex"] = train["policy_tenure"]*train["age_of_car"]*train["age_of_policyholder"]
train["area"]        = train["length"]*train["width"]
train["volume"]      = train["length"]*train["width"]*train["height"]

test["age_complex"] = test["policy_tenure"]*test["age_of_car"]*test["age_of_policyholder"]
test["area"]        = test["length"]*test["width"]
test["volume"]      = test["length"]*test["width"]*test["height"]

In [None]:
train.head(2)

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,max_power,engine_type,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,rear_brakes_type,displacement,cylinder,transmission_type,gear_box,steering_type,turning_radius,length,width,height,gross_weight,is_front_fog_lights,is_rear_window_wiper,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim,torque,rpm,power,rpm_2,age_complex,area,volume
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0,0,60.0,3500.0,40.36,6000.0,0.016617,5219175,7698283125
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0,0,60.0,3500.0,40.36,6000.0,0.005045,5219175,7698283125


In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

lb_columns = ["area_cluster", "segment", "model", "fuel_type", 
              "max_torque", "max_power", "engine_type", "is_esc",
              "is_adjustable_steering", "is_tpms", "is_parking_sensors",
              "is_parking_camera", "rear_brakes_type", "transmission_type",
              "steering_type", "is_front_fog_lights", "is_rear_window_wiper",
              "is_rear_window_washer", "is_rear_window_defogger", "is_brake_assist",
              "is_power_door_locks", "is_central_locking", "is_power_steering",
              "is_driver_seat_height_adjustable", "is_day_night_rear_view_mirror",
              "is_ecw", "is_speed_alert"]

for col in lb_columns:
  lb = LabelEncoder() 
  train[col] = lb.fit_transform(train[col])
  test[col]  = lb.transform(test[col])

In [None]:
train.shape, test.shape

((58592, 51), (39063, 50))

In [None]:
train.head(1)

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,max_power,engine_type,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,rear_brakes_type,displacement,cylinder,transmission_type,gear_box,steering_type,turning_radius,length,width,height,gross_weight,is_front_fog_lights,is_rear_window_wiper,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim,torque,rpm,power,rpm_2,age_complex,area,volume
0,ID00001,0.515874,0.05,0.644231,0,4990,1,0,0,0,5,2,6,2,0,0,0,1,0,1,796,3,1,5,2,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0,60.0,3500.0,40.36,6000.0,0.016617,5219175,7698283125


In [None]:
#One hot encoding
# This can be done through the pipeline but I preferred to follow the tutorial
ohe_columns = ['area_cluster','make','segment','model','fuel_type',
               'max_torque','max_power', 'engine_type','steering_type']

train = pd.get_dummies(train, columns = ohe_columns, drop_first=True)
test  = pd.get_dummies(test,  columns = ohe_columns, drop_first=True)

In [None]:
train.shape, test.shape

((58592, 112), (39063, 111))

In [None]:
# Highly imbalanced
train["is_claim"].value_counts(normalize=True)

0    0.936032
1    0.063968
Name: is_claim, dtype: float64

In [None]:
# Definig the features and target
X_train = train.drop(["is_claim", "policy_id"], axis =1)
y_train = train["is_claim"]

X_test  = test.drop(["policy_id"], axis =1)

In [None]:
# Scaling The Features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_scaled = y_train.copy()

X_test_scaled  = scaler.transform(X_test)

In [None]:
# SMOTEENN
from imblearn.combine import SMOTEENN
smt = SMOTEENN(sampling_strategy='all')
X_smt, y_smt = smt.fit_resample(X_train_scaled, y_train_scaled)

y_smt.value_counts(normalize=True)*100

1    55.280072
0    44.719928
Name: is_claim, dtype: float64

In [None]:
# All Classification Models After SMOTEEN
classifiers = [GaussianNB(), MultinomialNB(),
               SGDClassifier(), KNeighborsClassifier(), DecisionTreeClassifier(),
               RandomForestClassifier(), GradientBoostingClassifier(),
               LGBMClassifier(), XGBClassifier()]

score = []
for clf in classifiers:
  print("Solving Model:", str(clf)[:-2], end = ' ')
  start  = time.time()
  model  = clf
  model.fit(X_smt, y_smt)

  val_f1 = np.mean(cross_val_score(model, X_smt, y_smt, 
                                   cv=5, scoring="f1"))
  end    = time.time()
  print("Execution Time:",int(end - start), " sec")
  score.append([val_f1,str(clf)[:-2]])
  
score = sorted(score,reverse=True)
for i in range(len(score)):
  print("%26s %15s %6.3f"  %(score[i][1],"Validation F1:",score[i][0]))

Solving Model: GaussianNB Execution Time: 1  sec
Solving Model: MultinomialNB Execution Time: 0  sec
Solving Model: SGDClassifier Execution Time: 12  sec
Solving Model: KNeighborsClassifier Execution Time: 122  sec
Solving Model: DecisionTreeClassifier Execution Time: 14  sec
Solving Model: RandomForestClassifier Execution Time: 96  sec
Solving Model: GradientBoostingClassifier Execution Time: 289  sec
Solving Model: LGBMClassifier Execution Time: 14  sec
Solving Model: XGBClassifier Execution Time: 98  sec
    RandomForestClassifier  Validation F1:  0.966
      KNeighborsClassifier  Validation F1:  0.954
    DecisionTreeClassifier  Validation F1:  0.931
            LGBMClassifier  Validation F1:  0.929
             XGBClassifier  Validation F1:  0.865
GradientBoostingClassifier  Validation F1:  0.865
             SGDClassifier  Validation F1:  0.693
                GaussianNB  Validation F1:  0.672
             MultinomialNB  Validation F1:  0.618


In [None]:
# Final Model
clf = RandomForestClassifier()
clf.fit(X_smt, y_smt)

submission = pd.read_csv('sample_submission.csv')
final_predictions = clf.predict(X_test_scaled)

submission['is_claim'] = final_predictions
submission.to_csv('my_submission.csv', index=False)

In [None]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Random Forest Random Search
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

rs_params = {'max_depth':list(np.arange(10, 100, step=10)) + [None],
              'n_estimators':np.arange(10, 500, step=50),
              'max_features':randint(1,7),
              'criterion':['gini','entropy'],
              'min_samples_leaf':randint(1,4),
              'min_samples_split':np.arange(2, 10, step=2)
          }
rs_cv = RandomizedSearchCV(RandomForestClassifier(), rs_params,
                           cv= 5)

# Train on training data or SMOTEEN
rs_cv.fit(X_smt, y_smt)

# Print the best parameters
print(rs_cv.best_params_)

{'criterion': 'gini', 'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 210}


In [None]:
# Final Model

submission = pd.read_csv('sample_submission.csv')
final_predictions = rs_cv.predict(X_test_scaled)

submission['is_claim'] = final_predictions
submission.to_csv('my_submission.csv', index=False)

In [None]:
# Chi square test for feature selection
from sklearn.feature_selection import chi2

# Label Encoding Needed for chi2 module to work properly
lb_df    = train.copy()   # Copy is important

X = lb_df.drop(['policy_id','is_claim'],axis=1)
y = lb_df['is_claim']

chi_scores = chi2(X,y)
p_val      = pd.Series(chi_scores[1],index = X.columns)

# Dependent features according to Chi-Squared Test
# we are using 95% confidence interval for this test
print("Important fetures are:")
print("-----------------------------------")
imp_features   = []
unimp_features = []
for col in X.columns:
  if p_val.loc[col] <= 0.05:
    #print(col, "p value:", round(p_val.loc[col],3))
    imp_features.append(col)
  else:
    unimp_features.append(col)

Important fetures are:
-----------------------------------


In [None]:
imp_features

['policy_tenure',
 'area_cluster',
 'population_density',
 'model',
 'max_torque',
 'max_power',
 'is_adjustable_steering',
 'displacement',
 'steering_type',
 'length',
 'width',
 'gross_weight',
 'torque',
 'rpm',
 'power',
 'area',
 'volume']

In [None]:
unimp_features

['age_of_car',
 'age_of_policyholder',
 'make',
 'segment',
 'fuel_type',
 'engine_type',
 'airbags',
 'is_esc',
 'is_tpms',
 'is_parking_sensors',
 'is_parking_camera',
 'rear_brakes_type',
 'cylinder',
 'transmission_type',
 'gear_box',
 'turning_radius',
 'height',
 'is_front_fog_lights',
 'is_rear_window_wiper',
 'is_rear_window_washer',
 'is_rear_window_defogger',
 'is_brake_assist',
 'is_power_door_locks',
 'is_central_locking',
 'is_power_steering',
 'is_driver_seat_height_adjustable',
 'is_day_night_rear_view_mirror',
 'is_ecw',
 'is_speed_alert',
 'ncap_rating',
 'rpm_2',
 'age_complex']

In [None]:
train = train.drop(unimp_features, axis =1)
test  = test.drop(unimp_features, axis =1)

In [None]:
train.head(2)

Unnamed: 0,policy_id,policy_tenure,area_cluster,population_density,model,max_torque,max_power,is_adjustable_steering,displacement,steering_type,length,width,gross_weight,is_claim,torque,rpm,power,area,volume
0,ID00001,0.515874,0,4990,0,5,2,0,796,2,3445,1515,1185,0,60.0,3500.0,40.36,5219175,7698283125
1,ID00002,0.672619,11,27003,0,5,2,0,796,2,3445,1515,1185,0,60.0,3500.0,40.36,5219175,7698283125


In [None]:
train.head(2)

Unnamed: 0,policy_id,policy_tenure,population_density,is_adjustable_steering,displacement,length,width,gross_weight,is_claim,torque,rpm,power,area,volume,area_cluster_1,area_cluster_2,area_cluster_3,area_cluster_4,area_cluster_5,area_cluster_6,area_cluster_7,area_cluster_8,area_cluster_9,area_cluster_10,area_cluster_11,area_cluster_12,area_cluster_13,area_cluster_14,area_cluster_15,area_cluster_16,area_cluster_17,area_cluster_18,area_cluster_19,area_cluster_20,area_cluster_21,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9,model_10,max_torque_1,max_torque_2,max_torque_3,max_torque_4,max_torque_5,max_torque_6,max_torque_7,max_torque_8,max_power_1,max_power_2,max_power_3,max_power_4,max_power_5,max_power_6,max_power_7,max_power_8,steering_type_1,steering_type_2
0,ID00001,0.515874,4990,0,796,3445,1515,1185,0,60.0,3500.0,40.36,5219175,7698283125,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
1,ID00002,0.672619,27003,0,796,3445,1515,1185,0,60.0,3500.0,40.36,5219175,7698283125,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1


In [None]:
train.shape, test.shape

((58592, 63), (39063, 62))

In [None]:
# Rechecking the dtypes
# train.dtypes, test.dtypes

In [None]:
# Manually Checking if df staructure is correct or not
# train.drop("is_claim", axis=1, inplace=True)
# train_cols = list(train.columns)
# test_cols  = list(test.columns)
# cols = pd.DataFrame([train_cols,test_cols])
# cols

In [None]:
# All Classification Models Without SMOTEENN
classifiers = [GaussianNB(), MultinomialNB(),
               SGDClassifier(), KNeighborsClassifier(), DecisionTreeClassifier(),
               RandomForestClassifier(), GradientBoostingClassifier(),
               LGBMClassifier(), XGBClassifier()]

score = []
for clf in classifiers:
  print("Solving Model:", str(clf)[:-2], end = ' ')
  start  = time.time()
  model  = clf
  model.fit(X_train_scaled, y_train)
  val_f1 = np.mean(cross_val_score(model, X_train_scaled, y_train, 
                                  cv=5, scoring="f1"))
  end    = time.time()
  print("Execution Time:",int(end - start), " sec")
  score.append([val_f1,str(clf)[:-2]])
  
score = sorted(score,reverse=True)
for i in range(len(score)):
  print("%26s %15s %6.3f"  %(score[i][1],"Validation F1:",score[i][0]))

In [None]:
# Final Model Without SMOTEENN
clf_1 = RandomForestClassifier()
clf_1.fit(X_train_scaled, y_train)

sub_no_sm         = pd.read_csv('sample_submission.csv')
final_pred_no_sm  = clf_1.predict(X_test_scaled)

sub_no_sm['is_claim'] = final_pred_no_sm
sub_no_sm.to_csv('my_submission_no_sm.csv', index=False)

1    55.34037
0    44.65963
Name: is_claim, dtype: float64

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)