In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import sklearn
import matplotlib.pyplot as plt
from sklearn import metrics
import scikitplot as skplt
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report 

In [None]:
# Read the Kaggle training and test CSVs into two DataFrames.
data, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/health-insurance-cross-sell-prediction/train.csv",
        "../input/health-insurance-cross-sell-prediction/test.csv",
    )
)

Check for missing values in the test and training sets.

In [None]:
# Per-column count of missing values in the test set.
test_null_counts = test.isna().sum()
print(test_null_counts)

In [None]:
# Per-column count of missing values in the training set.
train_null_counts = data.isna().sum()
print(train_null_counts)

In [None]:
# Frequency of each Driving_License value in the training set.
license_counts = data['Driving_License'].value_counts()
print(license_counts)

Only a very small number of people do not have a driving licence, so this feature is almost constant and we drop it.

In [None]:
# Remove the Driving_License column from both frames.
# Note: this cell is not idempotent; re-running it raises KeyError because
# the column is already gone.
data = data.drop(columns='Driving_License')
test = test.drop(columns='Driving_License')

In [None]:
# Frequency of each Previously_Insured value in the training set.
data.loc[:, 'Previously_Insured'].value_counts()

In [None]:
# Contingency table: Response (rows) against Previously_Insured (columns).
pd.crosstab(index=data['Response'], columns=data['Previously_Insured'])

In [None]:
# Replace the Gender and Vehicle_Damage columns with integer codes.
# The single encoder is refit for each column, so after this cell `le`
# holds the classes of Vehicle_Damage (the last column processed).
le = LabelEncoder()
for col in ("Gender", "Vehicle_Damage"):
    data[col] = le.fit_transform(data[col])
data.head()

We will scale the columns Age, Annual_Premium and Vintage, as their ranges vary widely.

In [None]:
# Columns that are standardised (zero mean, unit variance).
num = ['Age', 'Vintage']

# Fit both scalers on the TRAINING data only.
ss = StandardScaler()
data[num] = ss.fit_transform(data[num])
mm = MinMaxScaler()
data[['Annual_Premium']] = mm.fit_transform(data[['Annual_Premium']])

# Apply the already-fitted scalers to the test data. Calling fit_transform
# here would re-estimate the mean/std and min/max from the test set, so the
# same raw value would map to different scaled values in train and test and
# the models would be scored on features they were not trained on.
# Test values of Annual_Premium outside the training range will fall
# outside [0, 1]; that is expected.
test[num] = ss.transform(test[num])
test[['Annual_Premium']] = mm.transform(test[['Annual_Premium']])

In [None]:
# Preview the first five rows after scaling.
data.head(n=5)

Convert Vehicle_Age into one-hot encoded (dummy) features.

In [None]:
# One indicator column per Vehicle_Age category, named "Vehicle_Age_<value>",
# appended to the right of the training frame.
ohe = pd.get_dummies(data.loc[:, 'Vehicle_Age'], prefix='Vehicle_Age')
data = pd.concat(objs=[data, ohe], axis='columns')

In [None]:
# Drop the row identifier and the raw Vehicle_Age column (now represented by
# its indicator columns), then preview the result.
cols_to_remove = ['id', 'Vehicle_Age']
data = data.drop(columns=cols_to_remove)
data.head()

In [None]:
# Separate the target column from the features: `y` is the Response series,
# `data` keeps every other column.
y = data.loc[:, 'Response']
data = data.drop(columns='Response')

In [None]:
# Mirror the training-set preparation on the test frame.

# One-hot encode Vehicle_Age and append the indicator columns.
ohe1 = pd.get_dummies(test.loc[:, 'Vehicle_Age'], prefix='Vehicle_Age')
test = pd.concat(objs=[test, ohe1], axis='columns')

# Keep the ids for the submission file before dropping the column.
id = test['id']
test = test.drop(columns=['id', 'Vehicle_Age'])

# NOTE(review): the encoder is refit on the test values here, so the integer
# codes only match the training codes if both sets contain the same
# categories - confirm, or reuse encoders fitted on the training data.
for col in ("Gender", "Vehicle_Damage"):
    test[col] = le.fit_transform(test[col])
test.head()

In [None]:
# Preview the final training feature frame.
data.head(n=5)

Split the data into training and validation sets.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(data, y, train_size=0.8, random_state=5)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
def plot_ROC(fpr, tpr, m_name):
    """Plot a ROC curve, with its area under the curve in the legend.

    Args:
        fpr: False-positive rates (x coordinates of the curve).
        tpr: True-positive rates (y coordinates of the curve).
        m_name: Model name inserted into the figure title.

    The figure is shown immediately; nothing is returned.
    """
    line_width = 2
    font_size = 16
    area_under_curve = sklearn.metrics.auc(fpr, tpr)

    plt.figure(figsize=(6, 6))

    # The model's ROC curve.
    plt.plot(
        fpr,
        tpr,
        color='darkorange',
        lw=line_width,
        label='ROC curve (area = %0.2f)' % area_under_curve,
        alpha=0.5,
    )
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--', alpha=0.5)

    # Axis limits, ticks and grid.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xticks(fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.grid(True)

    # Labels, title and legend.
    plt.xlabel('False Positive Rate', fontsize=font_size)
    plt.ylabel('True Positive Rate', fontsize=font_size)
    plt.title('Receiver operating characteristic for %s' % m_name, fontsize=20)
    plt.legend(loc="lower right", fontsize=font_size)

    plt.show()

**RANDOM FOREST CLASSIFIER**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that Restart & Run All reproduces the same model and
# scores; 5 matches the seed used for the train/validation split.
rf = RandomForestClassifier(random_state=5)
rf.fit(X=X_train, y=y_train)

# Mean accuracy on the validation set.
acc = rf.score(X_valid, y_valid)
print("Accuracy of Random_Forest: ",acc)

In [None]:
# Class probabilities and hard class predictions on the validation set.
rf_preds = rf.predict_proba(X_valid)
rf_class = rf.predict(X_valid)

# Column 1 holds the probability of the second class in rf.classes_.
rf_positive = rf_preds[:, 1]
rf_score = roc_auc_score(y_valid, rf_positive, average='weighted')

# ROC curve for the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_valid, rf_positive)
plot_ROC(fpr, tpr, 'rf')

print('ROC AUC score for rf model: %.4f' % rf_score)
print('F1 score: %0.4f' % f1_score(y_valid, rf_class))
skplt.metrics.plot_confusion_matrix(y_valid, rf_class, figsize=(8, 8))

**GRADIENT BOOST CLASSIFIER**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# random_state fixes the seed handed to each boosting stage's tree so the
# fitted model and its scores are reproducible; 5 matches the seed used for
# the train/validation split.
gb = GradientBoostingClassifier(n_estimators=134, learning_rate=0.2, random_state=5)
gb.fit(X_train, y_train)

# Mean accuracy on the validation set.
accuracy2 = gb.score(X_valid, y_valid)
print("Accuracy of Gradient Boost", accuracy2)

In [None]:
# Class probabilities on the validation set; column 1 is the probability of
# the second class in gb.classes_.
gb_preds = gb.predict_proba(X_valid)
gb_positive = gb_preds[:, 1]

gb_score = roc_auc_score(y_valid, gb_positive, average='weighted')
print(gb_score)

# ROC curve for the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_valid, gb_positive)
plot_ROC(fpr, tpr, 'gb')

In [None]:
# Hard class predictions, summary scores and confusion matrix for the
# gradient boosting model on the validation set.
gb_class = gb.predict(X_valid)
gb_f1 = f1_score(y_valid, gb_class)
print('ROC AUC score for gb model: %.4f' % gb_score)
print('F1 score: %0.4f' % gb_f1)
skplt.metrics.plot_confusion_matrix(y_valid, gb_class, figsize=(8, 8))

**LGBM CLASSIFIER**

In [None]:
from lightgbm import LGBMClassifier

# Only parameters that LGBMClassifier actually recognises are passed.
# Deliberately NOT passed, because they are not constructor parameters and
# would merely be forwarded to LightGBM as unknown keys and ignored:
#   gamma            - an XGBoost name; LightGBM's counterpart is min_split_gain
#   eval_metric      - an argument of fit(), only meaningful with an eval_set
#   is_higher_better - not a LightGBM parameter
#   plot             - not a LightGBM parameter
LGB_model = LGBMClassifier(
    random_state=5,
    max_depth=8,
    n_estimators=300,
    reg_lambda=1.2,
    reg_alpha=1.2,
    min_child_weight=1,
    verbose=1,
    learning_rate=0.15,
    colsample_bytree=0.5,
)
LGB_model.fit(X_train, y_train)

# Mean accuracy on the validation set.
accuracy3 = LGB_model.score(X_valid, y_valid)
print("Accuracy of lgb: ", accuracy3)

In [None]:
# Class probabilities and hard class predictions on the validation set.
LGB_preds = LGB_model.predict_proba(X_valid)
LGB_class = LGB_model.predict(X_valid)

# Column 1 holds the probability of the second class in LGB_model.classes_.
LGB_positive = LGB_preds[:, 1]
LGB_score = roc_auc_score(y_valid, LGB_positive, average='weighted')

# ROC curve for the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_valid, LGB_positive)
plot_ROC(fpr, tpr, 'LGBM')

In [None]:
# Summary scores and confusion matrix for the LGBM model on the validation set.
LGB_f1 = f1_score(y_valid, LGB_class)
print('ROC AUC score for LGBM model: %.4f' % LGB_score)
print('F1 score: %0.4f' % LGB_f1)
skplt.metrics.plot_confusion_matrix(y_valid, LGB_class, figsize=(8, 8))

The ROC AUC score of the LGBMClassifier is 0.8569, which is the best of the three models, so we will take the final predictions from the LGBMClassifier.

In [None]:
# Train and test were one-hot encoded independently, so select the test
# columns in exactly the order the model was fitted on. A column that is
# missing from the test frame raises KeyError here instead of silently
# feeding misaligned features to the model.
test_features = test[X_train.columns]

# Probability of the positive class (column 1) for every test row, taken
# with a single vectorised slice.
predictions = LGB_model.predict_proba(test_features)[:, 1]

# Pair each test id with its predicted probability and write the submission.
submission = pd.DataFrame(data={'id': id, 'Response': predictions})
submission.to_csv('vehicle_insurance_lgb.csv', index=False)
submission.head()