In [5]:
pip install catboost

Collecting catboost
  Downloading https://files.pythonhosted.org/packages/90/86/c3dcb600b4f9e7584ed90ea9d30a717fb5c0111574675f442c3e7bc19535/catboost-0.24.1-cp36-none-manylinux1_x86_64.whl (66.1MB)
     |████████████████████████████████| 66.1MB 45kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.1


In [13]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [55]:
# Read the training and test tables from CSV files in the working directory
# (train first, then test).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [15]:
# Columns whose values are converted to strings and given a "D_" prefix.
prefixed_cols = ['Driving_License', 'Region_Code', 'Policy_Sales_Channel']

# Lookup tables that replace text labels with integer codes.
mp_2 = {'Yes': 0, 'No': 1}
mp_3 = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}

# Apply the identical encoding to the train and test frames, in place.
for frame in (df_train, df_test):
    for col in prefixed_cols:
        frame[col] = "D_" + frame[col].astype(str)
    # Labels that are absent from a lookup table come out of .map() as NaN.
    frame['Vehicle_Damage'] = frame['Vehicle_Damage'].map(mp_2)
    frame['Vehicle_Age'] = frame['Vehicle_Age'].map(mp_3)

In [56]:
# Detach the id column from each frame in place, keeping its values as arrays
# (test_id is used later to build the submission file).
test_id = df_test.pop('id').values
train_id = df_train.pop('id').values

In [58]:
# Derived features, added in place to both frames so they keep the same columns.
for frame in (df_train, df_test):
    # Ratio of Age to Annual_Premium.
    frame['Age_premium'] = frame['Age'] / frame['Annual_Premium']

    # String key of the form "<Previously_Insured>_<Vehicle_Damage>".
    insured_txt = frame['Previously_Insured'].astype(str)
    damage_txt = frame['Vehicle_Damage'].astype(str)
    frame['Prev_insured_vehicle_damage'] = insured_txt + "_" + damage_txt

In [59]:
# Age limit flag: 1 when Age is below 18 or above 65, otherwise 0.
# Uses vectorized comparisons instead of a Python-level loop over every row.
# A missing Age compares False on both sides and is therefore flagged 0, the
# same result the element-wise conditional produced.
for frame in (df_train, df_test):
    outside_limits = (frame['Age'] < 18) | (frame['Age'] > 65)
    frame['Age_limit'] = outside_limits.astype('int64')

In [69]:
# Columns fed to the model, in the order they appear in the feature matrix.
# Positions in this list are looked up later with feats.index(...).
feats = [
    'Age_limit',
    'Prev_insured_vehicle_damage',
    'Age_premium',
    'Previously_Insured',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
    'Age',
    'Vehicle_Age',
    'Region_Code',
    'Gender',
]

In [70]:
# NumPy arrays of the selected feature columns (train and test) and of the
# "Response" target column.
X_train = df_train[feats].values
X_test = df_test[feats].values
Y = df_train["Response"].values

In [None]:
# 5-fold cross-validation of CatBoost, scored by ROC AUC on each held-out fold.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []

# Positions of the categorical columns inside the feature matrix. They are the
# same for every fold, so they are resolved once before the loop.
cat_feature_idx = [
    feats.index(name)
    for name in ("Age_limit", "Prev_insured_vehicle_damage", "Gender",
                 "Region_Code", "Policy_Sales_Channel")
]

for training, testing in kfold.split(X_train):
    x_train, x_test = X_train[training], X_train[testing]
    y_train, y_test = Y[training], Y[testing]

    model = CatBoostClassifier(random_state=27, verbose=500, task_type='GPU')
    # Fit on this fold's training rows only. Fitting on the full X_train / Y
    # lets the model see the held-out rows and inflates the reported AUC.
    model.fit(x_train, y_train, cat_features=cat_feature_idx)
    preds = np.array(model.predict_proba(x_test))

    # The last predict_proba column is used as the score for ROC AUC.
    score = roc_auc_score(y_test, preds[:, -1])
    scores.append(score)
    print(score)
print("Average: ", sum(scores)/len(scores))

In [None]:
# Horizontal bar chart of the current model's feature importances, labelled by
# feature column name (at most the 30 largest are shown).
feat_imp = pd.Series(model.feature_importances_, index=df_train[feats].columns)
top_importances = feat_imp.nlargest(30)
top_importances.plot.barh(figsize=(8, 10))

In [None]:

# Train on the complete training set, then write test-set scores to CSV.
cat_feature_names = ("Age_limit", "Prev_insured_vehicle_damage", "Gender",
                     "Region_Code", "Policy_Sales_Channel")
cat_feature_idx = [feats.index(name) for name in cat_feature_names]

model = CatBoostClassifier(random_state=27, verbose=500)
model.fit(X_train, Y, cat_features=cat_feature_idx)

# The last predict_proba column is written out as the "Response" score.
preds = np.array(model.predict_proba(X_test))
df_submit = pd.DataFrame({'id': test_id, 'Response': preds[:, -1]})
df_submit.to_csv("submit_cat.csv", index=False)