In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
train_raw=pd.read_csv('train.csv')
test_raw=pd.read_csv('test.csv')

In [None]:
train_raw.head()

In [None]:
test_raw.head()

In [None]:
# Work on an independent copy: a plain assignment would only alias test_raw,
# so the in-place edits made to test_raw1 later on would also alter the raw frame.
test_raw1 = test_raw.copy()

In [None]:
###test_raw1=pd.merge(test_raw, sample_sub[["id", "Response"]], on="id", how="left")

In [None]:
test_raw1.head()

In [None]:
train_raw.info()

In [None]:
test_raw1.info()

In [None]:
train_raw.isna().sum()

In [None]:
test_raw1.isna().sum()

In [None]:
train_raw.isnull().sum()

In [None]:
test_raw1.isnull().sum()

The dataset is already clean: neither the train nor the test set has missing values.


In [None]:
test_raw1.nunique()

In [None]:
train_raw.describe(include=['object'])

In [None]:
train_raw.describe()

# Visualization

In [None]:
# Correlation heatmap of the numeric columns only.
# Gender, Vehicle_Age and Vehicle_Damage are still strings at this point
# (they are encoded later), and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0; older versions silently skipped them, so
# selecting the numeric columns explicitly gives the same picture everywhere.
plt.figure(figsize=(16, 5))
sns.heatmap(train_raw.select_dtypes(include='number').corr(), cmap='coolwarm', annot=True);

In [None]:
# Copy of the training data without the identifier column, used for the
# masked correlation heatmap. errors='ignore' keeps the cell re-runnable
# after 'id' has already been removed from train_raw further down.
corr_data = train_raw.drop(columns=['id'], errors='ignore')

In [None]:
# Lower-triangle correlation heatmap.
# The correlation matrix is computed once, on numeric columns only (the string
# columns would make corr() raise in pandas >= 2.0), and reused for both the
# mask and the plot.
plt.figure(figsize=(19, 17))
corr_matrix = corr_data.select_dtypes(include='number').corr()
# np.triu keeps the upper triangle (including the diagonal); non-zero cells
# of the mask are hidden by seaborn.
matrix = np.triu(corr_matrix)
sns.heatmap(corr_matrix, annot=True, linewidth=.8, mask=matrix, cmap="rocket");

In [None]:
fig = px.histogram(train_raw, x="Response", color="Gender", marginal="box", # can be `box`, `violin`,'rug'
                         hover_data=train_raw.columns,title='Gender & Response Correlation')
fig.update_layout(bargap=0.1)
fig.show()

Gender does not play much of a role in the Response.

In [None]:
print(train_raw['Gender'].value_counts())

In [None]:
fig = px.histogram(train_raw, x="Age", color="Response", marginal="box", # can be `box`, `violin`,'rug'
                         hover_data=train_raw.columns,title='Age & Response Correlation')
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
sns.catplot(x="Response",y="Age", data=train_raw, kind='box')
plt.title("Age and Response Correlation", size=20, y=1.0);

Customers older than 35 appear slightly more interested than those younger than 35.

In [None]:
fig=px.histogram(train_raw,
                x='Response',
                marginal='box',
                color='Driving_License',
                color_discrete_sequence=['green', 'blue'],
                title='Driving License & Response Correlation'
                )
fig.update_layout(bargap=0.05)
fig.show()

Driving license also does not show much of a relationship with the Response.

In [None]:
sns.catplot(x="Response", y="Region_Code", data=train_raw, kind="box")
plt.title("Region code & Response Correlation", size=20, y=1.0);

There is little correlation between region and response.

In [None]:
fig=px.histogram(train_raw,
                x='Response',
                marginal='box',
                color='Previously_Insured',
                color_discrete_sequence=['green', 'blue'],
                title='Previously_Insured & Response Correlation'
                )
fig.update_layout(bargap=0.1)
fig.show()

In [None]:
fig=px.histogram(train_raw,
                x='Response',
                marginal='box',
                color='Vehicle_Damage',
                color_discrete_sequence=['green', 'blue'],
                title='Vehicle_Damage and Response Correlation'
                )
fig.update_layout(bargap=0.1)
fig.show()

People who have had vehicle damage show a higher response.

In [None]:
sns.countplot(data=train_raw,x='Vehicle_Age',hue='Vehicle_Damage');

Vehicles aged 1-2 years have the highest count of vehicle damage.

# Data Preprocessing

In [None]:
# Drop the unused identifier column.
# Re-binding the result (instead of inplace=True) leaves any other reference
# to the same frame untouched, and errors="ignore" makes the cell safe to
# re-run once "id" is already gone.
train_raw = train_raw.drop(columns=["id"], errors="ignore")
test_raw1 = test_raw1.drop(columns=["id"], errors="ignore")

Encode the categorical columns as numeric values

In [None]:
# Encode Gender as 1 (Male) / 0 (Female).
# The explicit astype(int) replaces the silent object -> int downcast that
# Series.replace used to perform (deprecated in recent pandas), and fails
# loudly if a value outside the mapping is ever present.
train_raw['Gender'] = train_raw['Gender'].replace({'Male': 1, 'Female': 0}).astype(int)
train_raw.head()

In [None]:
# Encode Vehicle_Damage (Yes/No) and Vehicle_Age (ordinal 1..3) as integers.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `train_raw[col]` is a chained in-place update, which pandas warns about
# and which does not modify the frame at all under Copy-on-Write.
train_raw['Vehicle_Damage'] = train_raw['Vehicle_Damage'].replace({'Yes': 1, 'No': 0}).astype(int)
train_raw['Vehicle_Age'] = (
    train_raw['Vehicle_Age']
    .replace({'< 1 Year': 1, '1-2 Year': 2, '> 2 Years': 3})
    .astype(int)
)
train_raw.head()

In [None]:
# Independent copy of the encoded training data: the scaling step below
# overwrites columns of train_df, and an alias would overwrite train_raw too.
train_df = train_raw.copy()

In [None]:
plt.figure(figsize=(16,9))
sns.heatmap(train_df.corr(), annot=True)

In [None]:
# Features whose absolute correlation with the target is at least 0.2.
hig_corr = train_df.corr()
hig_corr_features = hig_corr.index[hig_corr["Response"].abs().ge(0.2)]
hig_corr_features

In [None]:
# Same Gender encoding as the training set (Male -> 1, Female -> 0), with an
# explicit integer cast instead of relying on replace's deprecated downcast.
test_raw1['Gender'] = test_raw1['Gender'].replace({'Male': 1, 'Female': 0}).astype(int)

In [None]:
# Same Vehicle_Damage / Vehicle_Age encoding as the training set.
# Assign the result back: replace(..., inplace=True) on `test_raw1[col]` is a
# chained in-place update and leaves the frame unchanged under Copy-on-Write.
test_raw1['Vehicle_Damage'] = test_raw1['Vehicle_Damage'].replace({'Yes': 1, 'No': 0}).astype(int)
test_raw1['Vehicle_Age'] = (
    test_raw1['Vehicle_Age']
    .replace({'< 1 Year': 1, '1-2 Year': 2, '> 2 Years': 3})
    .astype(int)
)

In [None]:
# Independent copy of the encoded test data, so the scaling step below does
# not overwrite test_raw1 through a shared reference.
test_df = test_raw1.copy()
test_df.head()

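# Scaling Numeric Data

Min-max scaling maps every numeric feature into the range $[\min, \max]$, which is $[0, 1]$ for the default `MinMaxScaler`. The minimum and maximum of $X$ are taken per column:

$$X_{std} = \frac{X - X_{min}}{X_{max} - X_{min}}$$

$$X_{scaled} = X_{std} \cdot (\max - \min) + \min$$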

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
numeric_cols=['Age','Region_Code','Annual_Premium','Policy_Sales_Channel','Vintage']

In [None]:
scaler=MinMaxScaler()

In [None]:
scaler.fit(train_df[numeric_cols])

In [None]:
# Apply the scaler (fitted on the training data) to the numeric columns of
# both frames, so train and test share the same min/max mapping.
# NOTE(review): the scaled values overwrite the originals, so running this
# cell a second time rescales already-scaled data; re-create train_df /
# test_df before re-running it.
train_df[numeric_cols]=scaler.transform(train_df[numeric_cols])
test_df[numeric_cols]=scaler.transform(test_df[numeric_cols])

In [None]:
train_df.describe().loc[['min', 'max']]

In [None]:
test_df.describe().loc[['min', 'max']]

In [None]:
train_df.head()

In [None]:
test_df.head()

# Training, Validation and Test Data

In [None]:
# Name the target first, then treat every other column as a model input.
# Selecting by name (rather than slicing off the last column) gives the same
# list when 'Response' is last and stays correct if the column order changes.
target_col = 'Response'
input_cols = [col for col in train_df.columns if col != target_col]

In [None]:
input_cols

In [None]:
inputs=train_df[input_cols]
targets=train_df[target_col]

Split data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_inputs, val_inputs, train_targets,val_targets=train_test_split(inputs, targets, test_size=0.2, random_state=42)

In [None]:
val_inputs.head()

In [None]:
len(val_inputs)

In [None]:
len(val_targets)

In [None]:
len(train_inputs)

Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

In [None]:
model=LogisticRegression(solver='liblinear')

In [None]:
model.fit(train_inputs, train_targets)

In [None]:
%%time
train_preds=model.predict(train_inputs)
train_probs=model.predict_proba(train_inputs)
accuracy_score(train_targets, train_preds)

In [None]:
from sklearn.metrics import confusion_matrix
def predict_and_plot(inputs, targets, name='', estimator=None):
    """Predict with a fitted classifier, print its accuracy and plot a confusion matrix.

    Parameters
    ----------
    inputs
        Feature matrix handed to the classifier's ``predict``.
    targets
        True labels corresponding to ``inputs``.
    name : str, optional
        Prefix for the plot title (e.g. ``'Train'`` or ``'Validation'``).
    estimator : optional
        Fitted classifier to evaluate. When omitted, the notebook-level
        ``model`` is used, which is the original behaviour of this helper.

    Returns
    -------
    The predictions returned by ``predict`` for ``inputs``.
    """
    # Resolve the global lazily so the default still follows whatever `model`
    # is bound to at call time.
    clf = model if estimator is None else estimator
    preds = clf.predict(inputs)
    accuracy = accuracy_score(targets, preds)
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    # normalize='true' normalises over the true labels (rows of the matrix).
    cf = confusion_matrix(targets, preds, normalize='true')
    plt.figure()
    sns.heatmap(cf, annot=True)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    plt.title('{} Confusion Matrix'.format(name));
    return preds

In [None]:
%%time
train_preds=predict_and_plot(train_inputs, train_targets,'Train')

In [None]:
%%time
val_preds=predict_and_plot(val_inputs, val_targets,'Validation')

Logistic Regression Model
Training Accuracy - 87.79%
Validation Accuracy - 87.50%

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model_2=RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
model_2.fit(train_inputs, train_targets)

In [None]:
%%time
model_2.score(train_inputs, train_targets)

In [None]:
from sklearn.metrics import confusion_matrix
def predict_and_plot_2(inputs, targets,name=''):
    """Evaluate the notebook-level ``model_2`` on a dataset.

    Prints the accuracy as a percentage, draws the confusion matrix
    (normalised with ``normalize='true'``) as a heatmap titled
    ``'<name> Confusion Matrix'``, and returns the predictions.
    """
    predictions = model_2.predict(inputs)
    pct_correct = accuracy_score(targets, predictions) * 100
    print("Accuracy: {:.2f}%".format(pct_correct))

    normalised_cm = confusion_matrix(targets, predictions, normalize='true')

    plt.figure()
    sns.heatmap(normalised_cm, annot=True)
    plt.title('{} Confusion Matrix'.format(name))
    plt.ylabel('Target')
    plt.xlabel('Prediction')
    return predictions

In [None]:
%%time
val_preds_2 = predict_and_plot_2(val_inputs, val_targets, 'Validation')

Random forest without hyperparameter tuning
(Training Accuracy  99.98%
Validation Accuracy  86.50%)

Random forest without hyperparameter tuning seems to overfit, given its training accuracy of 99.98%.

Hyperparameter Tuning of Random Forest

In [None]:
model_2.feature_importances_

In [None]:
# Feature importances of model_2, most important first.
importance_df = (
    pd.DataFrame(
        {
            'feature': train_inputs.columns,
            'importance': model_2.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)

In [None]:
importance_df

n_estimators hyperparameter

#10

In [None]:
model_2_1_1 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 10)

In [None]:
model_2_1_1.fit(train_inputs,train_targets)

In [None]:
%%time
model_2_1_1.score(train_inputs, train_targets)

In [None]:
%%time
model_2_1_1.score(val_inputs, val_targets)

#20

In [None]:
%%time
model_2_1_2 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 20)
model_2_1_2.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_1_2.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_1_2.score(val_inputs, val_targets)) 

#100

In [None]:
%%time
model_2_1_3 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 100)
model_2_1_3.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_1_3.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_1_3.score(val_inputs, val_targets))

In [None]:
#200

In [None]:
%%time
model_2_1_4 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 200)
model_2_1_4.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_1_4.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_1_4.score(val_inputs, val_targets))

In [None]:
#150

%%time
model_2_1_5 = RandomForestClassifier(random_state=42,n_jobs=-1,n_estimators = 150)
model_2_1_5.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_1_5.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_1_5.score(val_inputs, val_targets))

n = 100 got the highest validation accuracy

max_features hyperparameter test

In [None]:
%%time
model_2_2_1 = RandomForestClassifier(random_state=42,n_jobs=-1,max_features = 2)
model_2_2_1.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_2_1.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_2_1.score(val_inputs, val_targets)) 

In [None]:
%%time
model_2_2_2 = RandomForestClassifier(random_state=42,n_jobs=-1,max_features = 5)
model_2_2_2.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_2_2.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_2_2.score(val_inputs, val_targets))  

In [None]:
%%time
model_2_2_3 = RandomForestClassifier(random_state=42,n_jobs=-1,max_features = 10)
model_2_2_3.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_2_3.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_2_3.score(val_inputs, val_targets))  

In [None]:
%%time
model_2_2_4 = RandomForestClassifier(random_state=42,n_jobs=-1,max_features = 6)
model_2_2_4.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_2_4.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_2_4.score(val_inputs, val_targets)) 

In [None]:
%%time
model_2_2_5 = RandomForestClassifier(random_state=42,n_jobs=-1,max_features = 7)
model_2_2_5.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_2_5.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_2_5.score(val_inputs, val_targets))

max_features = 6 is the best one

#Using n=100  feature=6

In [None]:
model_2_all = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=100, max_features=6)

In [None]:
model_2_all.fit(train_inputs,train_targets)
print("Training Accuracy = ",model_2_all.score(train_inputs, train_targets))
print("Validation Accuracy = ",model_2_all.score(val_inputs, val_targets)) 

Compared with the model without hyperparameter tuning, the result is not much different.

In [None]:
##Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
model_3=DecisionTreeClassifier(random_state=42)
model_3.fit(train_inputs, train_targets)

In [None]:
%%time
train_preds_3=model_3.predict(train_inputs)
train_probs_3=model_3.predict_proba(train_inputs)
accuracy_score(train_targets, train_preds_3)

In [None]:
def predict_and_plot_3(inputs, targets, name=''):
    """Evaluate the notebook-level ``model_3`` on a dataset.

    Prints the accuracy as a percentage, shows the confusion matrix
    (normalised with ``normalize='true'``) as a heatmap titled
    ``'<name> Confusion Matrix'``, and returns the predictions.
    """
    predicted = model_3.predict(inputs)
    score_pct = accuracy_score(targets, predicted) * 100
    print("Accuracy: {:.2f}%".format(score_pct))

    matrix_norm = confusion_matrix(targets, predicted, normalize='true')

    plt.figure()
    sns.heatmap(matrix_norm, annot=True)
    plt.ylabel('Target')
    plt.xlabel('Prediction')
    plt.title('{} Confusion Matrix'.format(name))
    return predicted

In [None]:
%%time
val_preds = predict_and_plot_2(val_inputs, val_targets, 'Validation')

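## Decision tree without hyperparameter tuning
Training Accuracy - 99.98%
Validation Accuracy - 86.50%

Note: these figures are identical to the random forest results reported earlier; re-run the validation cell to confirm that they come from the decision tree.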

In [None]:
from sklearn.tree import plot_tree, export_text

In [None]:
plt.figure(figsize=(30,10))
plot_tree(model_3, feature_names=train_inputs.columns, max_depth=3, filled=True);

In [None]:
model_3_text=export_text(model_3, feature_names=list(train_inputs.columns))
print(model_3_text[:3000])

In [None]:
model_3.feature_importances_

In [None]:
# Feature importances of model_3, most important first.
importance_df = (
    pd.DataFrame(
        {
            'feature': train_inputs.columns,
            'importance': model_3.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)

In [None]:
importance_df

In [None]:
# Hyperparameter tuning of the decision tree

***max_leaf_nodes***

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params_grid = {'max_leaf_nodes':[5,10,15,20,25]}

In [None]:
grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),cv=5,param_grid=params_grid)

In [None]:
grid.fit(train_inputs,train_targets)

In [None]:
grid.best_params_

In [None]:
model_3_1=grid.best_estimator_


In [None]:
model_3_1.score(train_inputs, train_targets)

In [None]:
model_3_1.score(val_inputs, val_targets)

***max_depth***

In [None]:
params_grid = {'max_depth':[10,20,30,40,50]}

In [None]:
grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),cv=5,param_grid=params_grid)

In [None]:
grid.fit(train_inputs,train_targets)

In [None]:
grid.best_params_

In [None]:
model_3_2=grid.best_estimator_

In [None]:
model_3_2.score(train_inputs, train_targets)

In [None]:
model_3_2.score(val_inputs, val_targets)

***criterion and splitter hyperparameter***

In [None]:
params_grid = {'criterion':['gini','entropy'],
               'splitter':['best','random'] }

In [None]:
grid = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),cv=5,param_grid=params_grid)

In [None]:
grid.fit(train_inputs,train_targets)

In [None]:
grid.best_params_

In [None]:
model_3_3=grid.best_estimator_

In [None]:
model_3_3.score(train_inputs, train_targets)

In [None]:
model_3_3.score(val_inputs, val_targets)

***#combine all the best parameter for decisiontree***

In [None]:
model_3_all = DecisionTreeClassifier(random_state=42,
                                       criterion='entropy',
                                       splitter='best',
                                       max_leaf_nodes=5,
                                       max_depth=10)

In [None]:
model_3_all.fit(train_inputs, train_targets)

In [None]:
model_3_all.score(train_inputs, train_targets)

In [None]:
model_3_all.score(val_inputs, val_targets)

## Decision Tree with hyperparameter tuning:
Training Accuracy - 87.804%
Validation Accuracy - 87.506%

The training accuracy of the decision tree decreased after hyperparameter tuning, but its validation accuracy improved and is the highest of all the models.