In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the labelled training set and the unlabelled test set from the local data folder.
train, test = (pd.read_csv(path) for path in ('data/train.csv', 'data/test.csv'))

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Class balance of the target column.
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare positional Series is no longer read as the x variable.
sns.countplot(data=train, x='Response');

## Gender

In [None]:
# Number of rows per Gender value.
# The column is passed by keyword (recent seaborn releases accept only `data`
# positionally); the trailing `;` hides the Axes repr in the cell output.
sns.countplot(data=train, x='Gender');

In [None]:
# Long-format table with one row per (Gender, Response) pair and the number of
# non-null ids in that group, stored in a column named 'count'.
df = (
    train
    .groupby(['Gender', 'Response'])['id']
    .count()
    .rename('count')
    .reset_index()
)
df

In [None]:
# One bar panel per Response value, showing the per-Gender counts computed in `df`.
sns.catplot(
    data=df,
    kind="bar",
    x="Gender",
    y="count",
    col="Response",
);

## Age

In [None]:
# Distribution of Age: histogram normalised to a density with a KDE overlay.
# `histplot(kde=True, stat='density')` is the documented replacement for the
# deprecated `sns.distplot`.
sns.histplot(data=train, x='Age', kde=True, stat='density');

In [None]:
# Summary statistics for the Age column.
train['Age'].describe()

In [None]:
# Scatter of Annual_Premium against Age, with the marginal distributions on the axes.
sns.jointplot(
    data=train,
    x='Age',
    y='Annual_Premium',
    kind='scatter',
);

In [None]:
# Summary statistics for the Annual_Premium column.
train['Annual_Premium'].describe()

In [None]:
# Distribution of Annual_Premium: histogram normalised to a density with a KDE overlay.
# `histplot(kde=True, stat='density')` is the documented replacement for the
# deprecated `sns.distplot`.
sns.histplot(data=train, x='Annual_Premium', kde=True, stat='density');

In [None]:
# Number of rows per Previously_Insured value.
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare positional Series is no longer read as the x variable.
sns.countplot(data=train, x='Previously_Insured');

In [None]:
# Number of rows per Vehicle_Age category.
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a bare positional Series is no longer read as the x variable.
sns.countplot(data=train, x='Vehicle_Age');

In [None]:
# Response counts broken down by Driving_License, drawn as grouped bars.
pd.crosstab(index=train['Driving_License'], columns=train['Response']).plot.bar();

In [None]:
# Response counts broken down by Vehicle_Damage, drawn as grouped bars.
pd.crosstab(index=train['Vehicle_Damage'], columns=train['Response']).plot.bar();

## Working with features

In [None]:
# Look at the frame once more before the feature transformations below.
train.head()

In [None]:
# Remove the 'id' column from the training frame before encoding and modelling.
train = train.drop(columns=['id'])

In [None]:
# Encode Gender as an integer flag: Male -> 0, Female -> 1.
# Any other value maps to NaN, which makes the integer cast raise.
train = train.assign(
    Gender=train['Gender'].map({'Male': 0, 'Female': 1}).astype(int)
)

In [None]:
# One-hot encode the remaining non-numeric columns (dropping the first level of
# each), rename the Vehicle_Age dummies so their names contain no '<', '>' or
# spaces, and store the three dummy columns as integers.
train = (
    pd.get_dummies(train, drop_first=True)
    .rename(columns={
        "Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year",
        "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years",
    })
    .astype({
        'Vehicle_Age_lt_1_Year': 'int',
        'Vehicle_Age_gt_2_Years': 'int',
        'Vehicle_Damage_Yes': 'int',
    })
)

In [None]:
# Check the frame after the Gender mapping and one-hot encoding.
train.head()

In [None]:
# Separate the label column from the features: `target` keeps Response,
# `train` keeps every other column.
target = train.loc[:, 'Response']
train = train.drop(columns=['Response'])

In [None]:
# Independent copy of the feature frame, used for the hold-out split.
x = train.copy(deep=True)

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    x,
    target,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Fit the scaler on the training split only, then apply the fitted
# transformation to both the training and the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [None]:
# Logistic-regression baseline on the scaled training split.
logit = LogisticRegression(random_state=17).fit(X_train_scaled, y_train)
# Keep column 1 of predict_proba (the second class) as the ranking score for ROC AUC.
prediction = logit.predict_proba(X_valid_scaled)[:, 1]

In [None]:
# ROC AUC of the current `prediction` scores on the validation split.
roc_auc_score(y_true=y_valid, y_score=prediction)

In [None]:
# Random-forest baseline, evaluated on the same hold-out split.
# random_state fixes the forest's randomness so the reported AUC is reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=17)
rfc.fit(X_train_scaled, y_train)
# Score the *scaled* validation matrix: the forest was fitted on scaled
# features, so its split thresholds do not apply to the raw X_valid values.
prediction = rfc.predict_proba(X_valid_scaled)[:, 1]
roc_auc_score(y_valid, prediction)

In [None]:
# Preview the first rows of the raw test frame.
test.head()

In [None]:
# Apply the same encoding steps to the test frame that were applied to `train`:
# drop 'id', map Gender to 0/1, one-hot encode, and rename the Vehicle_Age dummies.
test = test.drop('id', axis=1)
test['Gender'] = test['Gender'].map({'Male': 0, 'Female': 1}).astype(int)
test = pd.get_dummies(test, drop_first=True)
test = test.rename(columns={"Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year", "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years"})
# The test frame is one-hot encoded independently of the training frame, so
# force it onto the training feature columns (`x`) in the same order. A dummy
# column missing from test is added as all zeros; any column the model was
# not trained on is dropped.
test = test.reindex(columns=x.columns, fill_value=0)
test = test.astype({
    'Vehicle_Age_lt_1_Year': 'int',
    'Vehicle_Age_gt_2_Years': 'int',
    'Vehicle_Damage_Yes': 'int',
})

In [None]:
# Refit the scaler on the full training feature frame for the final model.
X_train_scaled = scaler.fit_transform(train)
# Transform the test features with the statistics learned from the training
# data. Refitting on test would scale the two sets differently, so the model
# would see test features that are inconsistent with what it was trained on.
X_test_scaled = scaler.transform(test)

In [None]:
# Randomized hyper-parameter search for the random forest on the full training set.
rfs = RandomForestClassifier(random_state=17)
param_grid = {
    'n_estimators': [100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True],
    'max_depth': [15, 20, 25, 30],
    # 'auto' is no longer an accepted value in current scikit-learn; for
    # classifiers it was an alias of 'sqrt', so dropping it loses no candidate.
    # NOTE(review): the integer 10 assumes at least 10 feature columns — confirm.
    'max_features': ['sqrt', 10],
    'min_samples_leaf': [2, 3],
    'min_samples_split': [2, 3],
}
# random_state makes the sampled parameter combinations repeatable between runs.
# NOTE(review): no `scoring` is passed, so the search ranks candidates with the
# estimator's default score, while the hold-out checks above use ROC AUC —
# consider scoring='roc_auc' if that is the metric of interest.
clf_rfs = RandomizedSearchCV(
    rfs,
    param_distributions=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,
    random_state=17,
)
best_clf_rfs = clf_rfs.fit(X_train_scaled, target)

In [None]:
# Parameter combination selected by the randomized search.
best_clf_rfs.best_params_

In [None]:
# Cross-validated score of the selected parameter combination.
best_clf_rfs.best_score_

In [None]:
# Predict class labels for the scaled test features and store them as integers.
predictions = best_clf_rfs.predict(X_test_scaled)
predictions = predictions.astype(int)

In [None]:
# Re-read the raw test file to recover the 'id' column (it was dropped from
# `test` during preprocessing), pair each id with its prediction, and write
# the submission file without the index.
output_test = pd.read_csv('data/test.csv')
output = output_test[['id']].assign(Response=predictions)
output.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
output.head()