In [None]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Read the raw training and test tables from CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Keep the row identifiers aside (test_id is reused when the submission files
# are written), then remove the `id` column from both frames in place.
train_id = df_train['id'].values
test_id = df_test['id'].values
for frame in (df_train, df_test):
    frame.drop(columns=['id'], inplace=True)

In [None]:
# Integer codes for the three text-valued columns.
# Note the direction of the Vehicle_Damage coding: 'Yes' -> 0, 'No' -> 1.
mp = {'Male':0, 'Female':1}
mp_2 = {'Yes':0, 'No':1}
mp_3 = {'< 1 Year': 0, '1-2 Year':1, '> 2 Years':2}

# Apply each mapping to the same column of both frames. Values that are not
# keys of the mapping come out of Series.map as NaN, so re-running this cell
# on already-encoded columns would blank them out.
column_codes = {'Gender': mp, 'Vehicle_Damage': mp_2, 'Vehicle_Age': mp_3}
for column, codes in column_codes.items():
    for frame in (df_train, df_test):
        frame[column] = frame[column].map(codes)

In [None]:
# Age_premium: ratio of the customer's age to the annual premium.
df_train['Age_premium'] = df_train['Age'] / df_train['Annual_Premium']
df_test['Age_premium'] = df_test['Age'] / df_test['Annual_Premium']

# Prev_insured_vehicle_damage: cross of Previously_Insured and Vehicle_Damage.
# The "<previously_insured>_<vehicle_damage>" labels are built first and then
# replaced by small integer codes. Leaving them as strings would turn
# df[feats].values into an object array and leave the string -> number
# conversion to whatever library consumes it; LightGBM expects categorical
# features to be integer-coded.
combo_train = df_train['Previously_Insured'].astype(str) + "_" + df_train['Vehicle_Damage'].astype(str)
combo_test = df_test['Previously_Insured'].astype(str) + "_" + df_test['Vehicle_Damage'].astype(str)

# One code table built from the labels of BOTH frames, in sorted label order,
# so the same label always receives the same code in train and test.
combo_levels = sorted(set(combo_train) | set(combo_test))
combo_codes = {label: code for code, label in enumerate(combo_levels)}

df_train['Prev_insured_vehicle_damage'] = combo_train.map(combo_codes)
df_test['Prev_insured_vehicle_damage'] = combo_test.map(combo_codes)

In [None]:
# Age_limit: 1 when Age is below 18 or above 65, otherwise 0.
for frame in (df_train, df_test):
    ages = frame['Age']
    outside_band = (ages < 18) | (ages > 65)
    frame['Age_limit'] = outside_band.astype('int64')

In [None]:
# Policy_premium: mean Annual_Premium per Policy_Sales_Channel.
# The mean is taken over the train and test rows pooled together and is then
# attached to both frames through a channel -> mean lookup.

# Pool the channel column of both frames (lengths are printed before/after).
channels = df_train['Policy_Sales_Channel'].values
print(len(channels))
channels = np.concatenate([channels, df_test['Policy_Sales_Channel'].values])
print(len(channels))

# Pool the premium column the same way so rows stay aligned with `channels`.
premiums = df_train['Annual_Premium'].values
print(len(premiums))
premiums = np.concatenate([premiums, df_test['Annual_Premium'].values])
print(len(premiums))

pooled = pd.DataFrame({'Policy_Sales_Channel': channels, 'Annual_Premium': premiums})

channel_mean = pooled.groupby('Policy_Sales_Channel')['Annual_Premium'].mean()
channel_keys = channel_mean.index.values
print(channel_keys)
channel_values = channel_mean.values
print(channel_values)

# Plain dict lookup: channel value -> pooled mean premium.
channel_to_mean = dict(zip(channel_keys, channel_values))

df_train['Policy_premium'] = df_train['Policy_Sales_Channel'].map(channel_to_mean)
df_test['Policy_premium'] = df_test['Policy_Sales_Channel'].map(channel_to_mean)

In [None]:
# Experience: mean of the (already integer-encoded) Vehicle_Damage column per
# Age value. The mean is taken over the train and test rows pooled together
# and is then attached to both frames through an age -> mean lookup.

# Pool the Age column of both frames (lengths are printed before/after).
ages_all = df_train['Age'].values
print(len(ages_all))
ages_all = np.concatenate([ages_all, df_test['Age'].values])
print(len(ages_all))

# Pool Vehicle_Damage the same way so rows stay aligned with `ages_all`.
damage_all = df_train['Vehicle_Damage'].values
print(len(damage_all))
damage_all = np.concatenate([damage_all, df_test['Vehicle_Damage'].values])
print(len(damage_all))

pooled = pd.DataFrame({'Age': ages_all, 'Vehicle_Damage': damage_all})

damage_by_age = pooled.groupby('Age')['Vehicle_Damage'].mean()
age_keys = damage_by_age.index.values
print(age_keys)
age_values = damage_by_age.values
print(age_values)

# Plain dict lookup: age -> pooled mean of Vehicle_Damage.
age_to_damage = dict(zip(age_keys, age_values))

df_train['Experience'] = df_train['Age'].map(age_to_damage)
df_test['Experience'] = df_test['Age'].map(age_to_damage)

In [None]:
# Columns fed to the models; the order matters because categorical features
# are later referenced by their position in this list (feats.index(...)).
# NOTE(review): 'Total_premium' is not created by any cell shown in this
# notebook - confirm it exists in the CSVs or in a cell that was removed.
feats = [
    'Total_premium',
    'Experience',
    'Policy_premium',
    'Age_limit',
    'Prev_insured_vehicle_damage',
    'Age_premium',
    'Previously_Insured',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
    'Age',
    'Vehicle_Age',
    'Region_Code',
    'Gender',
]

In [None]:
# Feature matrices (columns ordered as in `feats`) and the target vector,
# all as NumPy arrays.
X_train = df_train[feats].values
X_test = df_test[feats].values
Y = df_train["Response"].values

In [None]:
# 5-fold cross-validated ROC AUC of a default LGBMClassifier.
# Each fold trains on the training indices only and is scored on the held-out
# indices; the per-fold scores and their average are printed.

# Positions (within `feats`) of the columns LightGBM should treat as categorical.
categorical_idx = [
    feats.index(name)
    for name in ("Age_limit", "Prev_insured_vehicle_damage", "Gender", "Region_Code", "Policy_Sales_Channel")
]

kfold, scores = KFold(n_splits=5, shuffle=True, random_state=0), list()
for training, testing in kfold.split(X_train):
    x_train, x_test = X_train[training], X_train[testing]
    y_train, y_test = Y[training], Y[testing]

    model = LGBMClassifier(random_state=22)
    # Fit on the training fold only. Fitting on the full X_train / Y would let
    # the model see the validation rows and report an inflated score.
    model.fit(x_train, y_train, categorical_feature=categorical_idx)
    preds = np.array(model.predict_proba(x_test))
    # Last column of predict_proba is the probability of the last class.
    score = roc_auc_score(y_test, preds[:,-1])
    scores.append(score)
    print(score)
print("Average: ", sum(scores)/len(scores))

In [None]:
# Horizontal bar chart of the (at most 30) largest feature importances of the
# most recently fitted `model`, labelled with the names from `feats`.
feat_imp = pd.Series(data=model.feature_importances_, index=pd.Index(feats))
top_importances = feat_imp.nlargest(30)
top_importances.plot.barh(figsize=(8, 10))

In [None]:
# Fold-averaged LightGBM test predictions.
# For each of `splits` CV folds a model is trained on the training part,
# scored (ROC AUC) on the held-out part, and used to predict the test set.
# The per-fold test probabilities are averaged and written to submit_lgbm.csv.
# Despite its name, `oof_preds` holds the averaged *test-set* probabilities,
# not out-of-fold predictions for the training rows.

splits = 7

# Positions (within `feats`) of the columns LightGBM should treat as categorical.
categorical_idx = [
    feats.index(name)
    for name in ("Age_limit", "Prev_insured_vehicle_damage", "Gender", "Region_Code", "Policy_Sales_Channel")
]

# random_state only has an effect together with shuffle=True (recent
# scikit-learn versions reject a random_state without it). The seed is 1
# because the previous `random_state=True` is the integer 1 in Python.
kfold, scores = KFold(n_splits=splits, shuffle=True, random_state=1), list()

fold_test_preds = []  # one predict_proba(X_test) array per fold
for train2, test2 in kfold.split(X_train, Y):
    x_train, x_test = X_train[train2], X_train[test2]
    y_train, y_test = Y[train2], Y[test2]

    model = LGBMClassifier(random_state=22)
    model.fit(x_train, y_train, categorical_feature=categorical_idx)

    # Validation score for this fold (last column = probability of last class).
    preds = np.array(model.predict_proba(x_test))
    score = roc_auc_score(y_test, preds[:,-1])
    scores.append(score)
    print(score)

    fold_test_preds.append(model.predict_proba(X_test))

# Element-wise mean of the per-fold test probabilities.
oof_preds = np.mean(fold_test_preds, axis=0)

print("Average: ", np.mean(scores))

df_submit = pd.DataFrame({'id':test_id,'Response': oof_preds[:,-1]})
df_submit.to_csv("submit_lgbm.csv", index=False)

In [None]:
# Train a single LightGBM model on every training row and write its test-set
# probabilities (last predict_proba column) as a submission file.
# NOTE(review): the output is named "submit_cat.csv" although the model is an
# LGBMClassifier - confirm the file name is intended.
categorical_columns = ("Age_limit", "Prev_insured_vehicle_damage", "Gender", "Region_Code", "Policy_Sales_Channel")
categorical_positions = [feats.index(column) for column in categorical_columns]

model = LGBMClassifier(random_state=22)
model.fit(X_train, Y, categorical_feature=categorical_positions)

preds = np.array(model.predict_proba(X_test))
last_class_proba = preds[:,-1]

df_submit = pd.DataFrame({'id':test_id,'Response': last_class_proba})
df_submit.to_csv("submit_cat.csv", index=False)