# Introduction

In [None]:
import warnings
warnings.filterwarnings('ignore')

import re
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib


from sklearn import model_selection
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (inches) and grid style for seaborn plots.
sns.set(rc={'figure.figsize':(20.7,8.27)})
sns.set_style("whitegrid")
# NOTE(review): color_palette() only returns a palette and the result is
# discarded here, so this line does not change the active palette;
# sns.set_palette("dark") may have been intended — confirm.
sns.color_palette("dark")
# Apply matplotlib's "fivethirtyeight" style sheet.
plt.style.use("fivethirtyeight")

# Load and Describe Data

## Load Data

In [None]:
# Location of the training CSV, relative to the notebook's working directory.
url = '../input/health-insurance-cross-sell-prediction/train.csv'
df = pd.read_csv(url) 
# Show the loaded frame as the cell output.
df

## Data Description

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
df.info()

## Check Null and Missing Values

In [None]:
# Count nulls per column and lay them out as a (feature, missing_value) table.
null_counts = df.isnull().sum()
data_missing_value = pd.DataFrame({
    'feature': null_counts.index,
    'missing_value': null_counts.values,
})

# Share of rows that are missing, as a percentage rounded to two decimals.
data_missing_value['percentage'] = (
    (data_missing_value['missing_value'] / len(df)) * 100
).round(2)

# Worst columns first, renumbered, and only those that actually have gaps.
data_missing_value = (
    data_missing_value
    .sort_values('percentage', ascending=False)
    .reset_index(drop=True)
    .loc[lambda frame: frame['percentage'] > 0]
)
data_missing_value

## Numerical Data

In [None]:
# dtype names that count as numerical for this overview.
numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Select the numerical columns once and report their names and shape.
data_num = df.select_dtypes(include=numerics)
display(data_num.columns)
print(data_num.shape)
data_num.head(3)

## Non Numerical Data

In [None]:
# Select the object-typed (non-numerical) columns once and report their
# names and shape.
data_cat = df.select_dtypes(include=['object'])
display(data_cat.columns)
print(data_cat.shape)
data_cat.head(3)

# Data Preparation

## Transform categorical variables

### Check Cardinality

In [None]:
# Names of the categorical columns, then the number of distinct values
# (missing values included) found in each of them.
col_cat = data_cat.columns.tolist()
for i in col_cat:
    distinct_count = df[i].nunique(dropna=False)
    print(i, ' ---UNIQUE VALUE--  ', distinct_count)

## EDA

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the listed
# columns, shown as the cell output.
df[['id', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response']].describe()

### Box Plot Numerical Feature

In [None]:
# Continuous features inspected in the box plots and in the distribution
# plots of the following cell.
features = ['Age', 'Annual_Premium', 'Vintage']

plt.figure(figsize=(25, 7))
# One horizontal box plot per feature. The grid has exactly as many columns
# as there are features, so no blank slots are left beside the plots.
for position, feature in enumerate(features, start=1):
    plt.subplot(1, len(features), position)
    # Pass the data by keyword: the positional form is deprecated in newer
    # seaborn releases.
    sns.boxplot(x=df[feature], color='green', orient='h')
# Adjust the layout once, after every subplot exists.
plt.tight_layout()

### Distribution Plot

In [None]:
# Histogram plus KDE overlay for each continuous feature, laid out on a grid
# with n columns and as many rows as needed.
df_num = df[features]
k = len(df_num.columns)
n = 2
m = (k - 1) // n + 1
# squeeze=False keeps `axes` two-dimensional even when the grid has one row,
# so axes[r, c] is always valid.
fig, axes = plt.subplots(m, n, figsize=(n * 5, m * 3), squeeze=False)
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for i, (name, col) in enumerate(df_num.items()):
    r, c = i // n, i % n
    ax = axes[r, c]
    col.hist(ax=ax, color='green')
    # The KDE uses a secondary y-axis because its scale (density) differs
    # from the histogram's (counts).
    ax2 = col.plot.kde(ax=ax, secondary_y=True, title=name, color='red')
    ax2.set_ylim(0)

# Hide grid cells that did not receive a feature.
for unused_ax in axes.flat[k:]:
    unused_ax.set_visible(False)


### Bar Plot Response

In [None]:
# Count of customers per Response value, with the count written above each bar.
plt.figure(figsize=(5, 6))
ax = sns.countplot(data=df, x='Response', palette='rocket')
for p in ax.patches:
    bar_top = p.get_height()
    bar_centre = p.get_x() + p.get_width() / 2
    ax.annotate(
        f'{bar_top}',
        xy=(bar_centre, bar_top),
        xytext=(0, 3),            # nudge the label 3 points above the bar
        textcoords="offset points",
        fontsize=14,
        ha='center',
        va='bottom',
    )

### Distribution Plot Feature Age

In [None]:
# Distribution of customer age.
# NOTE(review): distplot is deprecated in recent seaborn releases
# (histplot/displot are the replacements) — confirm the installed version.
sns.distplot(df['Age']);
plt.title('Distplot Age', fontsize = 32);

### Response by Age

In [None]:
# Bucket every customer into a decade label ('20s' … '70s').
# pd.cut does this in one vectorised pass instead of a Python-level loop over
# every row. The bins are left-closed ([20, 30), [30, 40), …), so non-integer
# ages such as 29.5 land in their decade instead of falling through.
decade_edges = [20, 30, 40, 50, 60, 70, 80]
decade_labels = ['20s', '30s', '40s', '50s', '60s', '70s']
gr_age = (
    pd.cut(df['Age'], bins=decade_edges, labels=decade_labels, right=False)
    .astype(object)
    # Everything outside [20, 80) — including missing ages — goes to the
    # catch-all '80s' bucket, the same fallback the labels have always used.
    .fillna('80s')
)

df['Group_Age'] = gr_age
df.head()

In [None]:
# Responses per age group, ordered by group label, with counts above the bars.
plt.figure(figsize=(20, 5))
dfs = df.sort_values(by='Group_Age')
ax = sns.countplot(data=dfs, x='Group_Age', hue='Response', palette='rocket')
for p in ax.patches:
    bar_top = p.get_height()
    bar_centre = p.get_x() + p.get_width() / 2
    ax.annotate(
        f'{bar_top}',
        xy=(bar_centre, bar_top),
        xytext=(0, 3),            # nudge the label 3 points above the bar
        textcoords="offset points",
        fontsize=14,
        ha='center',
        va='bottom',
    )
ax.set_title('Response by Age')
ax.set_xlabel('Age')
ax.set_ylabel('Response')

### Annual Premium

In [None]:
# Segment customers by annual premium: the bin edges are the 0th, 30th, 90th
# and 100th percentiles, giving low / medium / high spenders.
premium = df['Annual_Premium']
bins = np.percentile(premium, [0, 30, 90, 100])
df['spender'] = pd.cut(
    premium,
    bins=bins,
    labels=['low', 'medium', 'high'],
    include_lowest=True,   # keep the minimum premium inside the first bin
)
df['spender'].hist()

### Gender

In [None]:
# Responses split by gender, with the count written above each bar.
plt.figure(figsize=(15, 5))
ax = sns.countplot(data=df, x='Gender', hue='Response', palette='rocket')
ax.set_title('Response Based on Gender', fontsize=15)
ax.set_xlabel('Gender', fontsize=14)
ax.set_ylabel('')
plt.xticks(fontsize=15)
plt.tight_layout()
for p in ax.patches:
    bar_top = p.get_height()
    bar_centre = p.get_x() + p.get_width() / 2
    ax.annotate(
        f'{bar_top}',
        xy=(bar_centre, bar_top),
        xytext=(0, 3),            # nudge the label 3 points above the bar
        textcoords="offset points",
        fontsize=14,
        ha='center',
        va='bottom',
    )

### Response Based on Vehicle Age

In [None]:
# Responses split by vehicle age.
plt.figure(figsize=(15, 8))
ax = sns.countplot(data=df, x='Vehicle_Age', hue='Response', palette='rocket')

# Axis decoration: title, x label, no y label, larger tick labels.
ax.set_title('Response Based on Vehicle Age', fontsize=15)
ax.set_xlabel('Vehicle Age', fontsize=14)
ax.set_ylabel('')
plt.xticks(fontsize=15)
plt.tight_layout()

### Region Code

In [None]:
# Cross-tabulate responses per region and flatten the result into a plain
# frame: index = region code, columns = the two response classes (0 and 1).
ct = pd.crosstab(df['Region_Code'], df['Response']).reset_index()
ct = pd.DataFrame(data = ct.iloc[:,1:].values,index = ct.iloc[:,0:1].values.flatten(), columns = [0,1])
ct['count'] = ct[0] + ct[1]
# Keep the (up to) ten regions with the most customers.
ct2 = ct.sort_values('count', ascending=False).head(10)
ax = ct2[[0,1]].plot(kind='bar', rot=0, cmap='Accent')

# Label every bar with its share of the region's customers.
# The bars are walked in the order the annotation has always relied on: all
# regions for response 0, then all regions for response 1. The modulo maps a
# bar back to its region total and works for any number of regions, not
# only exactly ten.
group_totals = ct2['count'].values
for bar_index, p in enumerate(ax.patches):
    num = group_totals[bar_index % len(group_totals)]
    ax.annotate('{0:.1f}%'.format(p.get_height()*100/num),
                xy=(p.get_x() + p.get_width() / 2, p.get_height()),
                xytext=(0, 3),fontsize=11,
                textcoords="offset points",
                ha='center', va='bottom')
ax.set_title('Response Based on Region Code', fontsize=30)
ax.set_xlabel('Region Code', fontsize=20)
ax.set_ylabel('')

### Response Based on Vehicle Damage

In [None]:
# Responses split by whether the vehicle was damaged before.
plt.figure(figsize=(15, 8))
ax = sns.countplot(data=df, x='Vehicle_Damage', hue='Response', palette='rocket')

# Axis decoration: title, x label, no y label, larger tick labels.
ax.set_title('Response Based on Vehicle Damage', fontsize=25)
ax.set_xlabel('Vehicle Damage', fontsize=20)
ax.set_ylabel('')
plt.xticks(fontsize=15)
plt.tight_layout()

### Response Based on Driving License Feature

In [None]:
# Responses split by the Driving_License flag.
plt.figure(figsize=(15, 8))
ax = sns.countplot(data=df, x='Driving_License', hue='Response', palette='rocket')

# Axis decoration: title, x label, no y label, larger tick labels.
ax.set_title('Response Based on Driving License Feature', fontsize=25)
ax.set_xlabel('Driving License', fontsize=20)
ax.set_ylabel('')
plt.xticks(fontsize=15)
plt.tight_layout()

### Previously Insured

In [None]:
def percentage_hue(plot, feature, Number_of_categories, hue_categories):
    """Annotate each bar of a hue-split count plot with its share of its x-category.

    Parameters
    ----------
    plot : matplotlib Axes
        Axes whose ``patches`` hold the bars. The bar for x-category ``i`` and
        hue level ``j`` is read from ``patches[j * Number_of_categories + i]``.
    feature : pandas.Series
        The plotted x column. Kept for backward compatibility; the totals are
        now derived from the bars themselves, so it is no longer read.
    Number_of_categories : int
        Number of distinct x-categories in the plot.
    hue_categories : int
        Number of hue levels in the plot.

    The denominator for a category is the sum of that category's bar heights.
    It used to come from ``feature.value_counts()``, which is ordered by
    frequency rather than by the plot's category order and could therefore
    pair a bar with another category's total.

    Calls ``plt.show()`` once all labels are placed.
    """
    bars = list(plot.patches)
    for i in range(Number_of_categories):
        group = [bars[j * Number_of_categories + i] for j in range(hue_categories)]
        heights = [bar.get_height() for bar in group]
        # h == h is False only for NaN, so a bar with a NaN height (if any)
        # is left out of the total.
        total = sum(h for h in heights if h == h)
        if not total:
            # Nothing was counted for this category; there is no share to show.
            continue
        for bar, height in zip(group, heights):
            percentage = '{:.1f}%'.format(100 * height / total)
            # Place the label roughly centred, just above the bar.
            x = bar.get_x() + bar.get_width() / 2 - 0.1
            y = bar.get_y() + height
            plot.annotate(percentage, (x, y), size = 12, va='bottom')
    plt.show()

In [None]:
# Two panels side by side: raw counts of Previously_Insured on the left,
# the same split by Response (annotated with percentages) on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 8))

ax1 = sns.countplot(data=df, x='Previously_Insured', palette='rocket', ax=ax[0])
for p in ax1.patches:
    bar_top = p.get_height()
    bar_centre = p.get_x() + p.get_width() / 2
    ax1.annotate(
        f'{bar_top}',
        xy=(bar_centre, bar_top),
        xytext=(0, 3),            # nudge the label 3 points above the bar
        textcoords="offset points",
        fontsize=14,
        ha='center',
        va='bottom',
    )

ax2 = sns.countplot(data=df, x='Previously_Insured', hue='Response',
                    palette='rocket', ax=ax[1])
# Two x-categories, two hue levels.
percentage_hue(ax2, df['Previously_Insured'], 2, 2)

### Vintage

In [None]:
# Distribution of the Vintage column.
# NOTE(review): distplot is deprecated in recent seaborn releases
# (histplot/displot are the replacements) — confirm the installed version.
plt.figure(figsize=(12,7))
sns.distplot(df['Vintage'])

### Policy Sales Channels

In [None]:
# Bar chart of the ten most frequent policy sales channels.
plt.figure(figsize=(18, 7))
df['Policy_Sales_Channel'].value_counts().head(10).plot(kind='bar')

plt.title('Top 10 Policy Sales Channels', fontsize=15)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Cross-tabulate responses per sales channel and flatten the result into a
# plain frame: index = channel code, columns = the two response classes.
ct = pd.crosstab(df['Policy_Sales_Channel'], df['Response']).reset_index()
ct = pd.DataFrame(data = ct.iloc[:,1:].values,index = ct.iloc[:,0:1].values.flatten(), columns = [0,1])
ct['count'] = ct[0] + ct[1]
# Keep the (up to) ten channels with the most customers.
ct2 = ct.sort_values('count', ascending=False).head(10)
ax = ct2[[0,1]].plot(kind='bar', rot=0, cmap='Accent')

# Label every bar with its share of the channel's customers.
# The bars are walked in the order the annotation has always relied on: all
# channels for response 0, then all channels for response 1. The modulo maps
# a bar back to its channel total and works for any number of channels, not
# only exactly ten.
group_totals = ct2['count'].values
for bar_index, p in enumerate(ax.patches):
    num = group_totals[bar_index % len(group_totals)]
    ax.annotate('{0:.1f}%'.format(p.get_height()*100/num),
                xy=(p.get_x() + p.get_width() / 2, p.get_height()),
                xytext=(0, 3),fontsize=11,
                textcoords="offset points",
                ha='center', va='bottom')
ax.set_title('Response Based on Policy Sales Channel', fontsize=30)
ax.set_xlabel('Policy Sales Channel', fontsize=20)
ax.set_ylabel('')

## Feature Engineering

In [None]:
# Re-read the training CSV. This replaces `df`, which had the Group_Age and
# spender columns added during the EDA cells, with a fresh copy of the file.
url = '../input/health-insurance-cross-sell-prediction/train.csv'
df = pd.read_csv(url) 
df

In [None]:
# Work on an independent copy so that the feature-engineering columns added
# to df_feat do not also appear on (and silently change) df.
df_feat = df.copy()

### Grouping Feature "Age"

In [None]:
# Encode age as a decade index: 1 for [20, 30), 2 for [30, 40), … 6 for
# [70, 80). pd.cut does this in one vectorised pass instead of a Python-level
# loop over every row, and the left-closed bins also place non-integer ages
# (e.g. 29.5) in their decade.
age_edges = [20, 30, 40, 50, 60, 70, 80]
gr_age = (
    pd.cut(df_feat['Age'], bins=age_edges, labels=False, right=False)
    .add(1)          # bin codes start at 0, segments start at 1
    # Everything outside [20, 80) — including missing ages — goes to the
    # catch-all segment 7, the same fallback the encoding has always used.
    .fillna(7)
    .astype(int)
)

df_feat['Group_Age'] = gr_age
df_feat.head()

### One Hot Encoding

In [None]:
# One-hot encode the three categorical columns in a single call. The first
# level of each column is dropped, and the dummy columns are appended in the
# order the source columns are listed here.
df_feat = pd.get_dummies(
    df_feat,
    columns=['Gender', 'Vehicle_Damage', 'Vehicle_Age'],
    drop_first=True,
)

In [None]:
# Show the first ten rows after encoding.
df_feat.head(10)

### Drop ID and Age

In [None]:
# Remove the identifier and the raw age (Group_Age is kept in its place).
df_feat = df_feat.drop(columns=['id', 'Age'])

In [None]:
# Show the first rows after dropping id and Age.
df_feat.head()

### Split

In [None]:
# Feature matrix (every column except the target) and the target itself.
y = df_feat['Response']
x = df_feat.drop(columns=['Response'])

### Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used for the training features and, later in the notebook,
# for the test file.
scaler = StandardScaler()

In [None]:
# Fit the scaler on every row of df_feat (target column excluded) and
# transform those rows; the train/test split happens in a later cell.
df_scaled = scaler.fit_transform(df_feat.drop('Response', axis=1))

In [None]:
# X: scaled feature array produced by the scaler; y: the Response column.
X = df_scaled
y = df_feat['Response']

### Train Test Split

In [None]:
# Split the unscaled features first (70 % train / 30 % test, fixed seed for
# reproducibility) …
X_train, X_test, y_train, y_test = train_test_split(
    df_feat.drop('Response', axis=1), y, test_size=0.3, random_state=101
)
# … then fit the scaler on the training rows only and apply it to both
# splits, so no statistics from the held-out rows leak into the model inputs.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


## Modeling

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score,confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report, plot_confusion_matrix,precision_score

In [None]:
def plot_ROC(fpr, tpr, m_name):
    """Plot a ROC curve with its AUC in the legend and show the figure.

    fpr, tpr -- x and y coordinates of the curve; also passed to ``auc``.
    m_name   -- model name interpolated into the figure title.
    """
    line_width = 2
    label_size = 16
    area_under_curve = auc(fpr, tpr)

    plt.figure(figsize=(6, 6))

    # The model's curve, labelled with its area.
    plt.plot(fpr, tpr,
             color='darkorange', lw=line_width, alpha=0.5,
             label='ROC curve (area = %0.2f)' % area_under_curve)
    # Diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1],
             color='navy', lw=line_width, linestyle='--', alpha=0.5)

    # Axis ranges, ticks and grid.
    plt.xlim([-0.1, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xticks(fontsize=label_size)
    plt.yticks(fontsize=label_size)
    plt.grid(True)

    # Labels, title and legend.
    plt.xlabel('False Positive Rate', fontsize=label_size)
    plt.ylabel('True Positive Rate', fontsize=label_size)
    plt.title('Receiver operating characteristic for %s'%m_name, fontsize=20)
    plt.legend(loc="lower right", fontsize=label_size)
    plt.show()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the training split.
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

In [None]:
# Predicted class labels for the held-out split.
LogPred = logmodel.predict(X_test)

In [None]:
# Probability column at index 1, i.e. the second class in logmodel.classes_.
LogPredProb = logmodel.predict_proba(X_test)[:,1]

In [None]:
# ROC curve of the logistic-regression probabilities on the held-out split.
fpr, tpr, thresholds = roc_curve(y_test, LogPredProb)
plot_ROC(fpr, tpr, m_name='Logistic Regression')

In [None]:
# Confusion matrix of the logistic regression on the held-out split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it on newer versions.
plot_confusion_matrix(logmodel, X_test, y_test, cmap=plt.cm.Blues)
# Turn the style sheet's grid off so it does not cross the matrix cells.
plt.grid(False)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the logistic-regression predictions.
print(classification_report(y_test,LogPred))

#### CV Logistic Regression

In [None]:
# 10-fold cross-validated ROC-AUC of the logistic regression on the training split.
scores = cross_val_score(logmodel, X_train, y_train, scoring='roc_auc', cv=10)
print('Cross-Validation ROC_AUC Scores', scores)

In [None]:
# Wrap the fold scores in a Series and show their min / mean / max.
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with default settings.
dtree = DecisionTreeClassifier()

In [None]:
# Fit the tree on the training split.
dtree.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out split.
dtreePred = dtree.predict(X_test)

In [None]:
# Probability column at index 1, i.e. the second class in dtree.classes_.
dtreePredProb = dtree.predict_proba(X_test)[:,1]

In [None]:
# ROC curve of the decision-tree probabilities on the held-out split.
fpr, tpr, thresholds = roc_curve(y_test, dtreePredProb)
plot_ROC(fpr, tpr, m_name='Decision Tree')

In [None]:
# Confusion matrix of the decision tree on the held-out split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it on newer versions.
plot_confusion_matrix(dtree, X_test, y_test, cmap=plt.cm.Blues)
# Turn the style sheet's grid off so it does not cross the matrix cells.
plt.grid(False)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the decision-tree predictions
# (dtreePred, not the logistic-regression predictions).
print(classification_report(y_test,dtreePred))

#### CV Decision Tree

In [None]:
# 10-fold cross-validated ROC-AUC of the decision tree on the training split.
scores = cross_val_score(dtree, X_train, y_train, scoring='roc_auc',cv=10)
print('Cross-Validation ROC_AUC Scores', scores)

In [None]:
# Wrap the fold scores in a Series and show their min / mean / max.
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 300 trees; all other settings are the library defaults.
Rtree = RandomForestClassifier(n_estimators=300)

In [None]:
# Fit the forest on the training split.
Rtree.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out split.
Rtreepred = Rtree.predict(X_test)

In [None]:
# Probability column at index 1, i.e. the second class in Rtree.classes_.
RtreepredProb =  Rtree.predict_proba(X_test)[:,1]

In [None]:
# ROC curve of the random-forest probabilities on the held-out split.
fpr, tpr, thresholds = roc_curve(y_test, RtreepredProb)
plot_ROC(fpr, tpr, m_name='Random Forest')

In [None]:
# Confusion matrix of the random forest on the held-out split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it on newer versions.
plot_confusion_matrix(Rtree, X_test, y_test, cmap=plt.cm.Blues)
# Turn the style sheet's grid off so it does not cross the matrix cells.
plt.grid(False)
plt.show()

In [None]:
# Per-class precision / recall / F1 of the random-forest predictions.
print(classification_report(y_test,Rtreepred))

#### CV Random Forest

In [None]:
# 10-fold cross-validated ROC-AUC of the random forest on the training split.
scores = cross_val_score(Rtree, X_train, y_train, scoring='roc_auc', cv=10)
print('Cross-Validation ROC_AUC Scores', scores)

In [None]:
# Wrap the fold scores in a Series and show their min / mean / max.
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

### XGBoost Model

In [None]:
# Baseline XGBoost classifier with default hyperparameters, fitted on the
# training split.
xgb_model=xgb.XGBClassifier()
xgb_model.fit(X_train,y_train)
# TODO: change the hyperparameters.

In [None]:
# Probability column at index 1 for the held-out split. Note that XGBpred
# holds probabilities, not class labels.
XGBpred = xgb_model.predict_proba(X_test)[:,1]

In [None]:
# ROC curve of the baseline XGBoost probabilities on the held-out split.
fpr, tpr, thresholds = roc_curve(y_test, XGBpred)
plot_ROC(fpr, tpr, m_name='XGBoost')

In [None]:
# Confusion matrix of the XGBoost model on the held-out split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it on newer versions.
plot_confusion_matrix(xgb_model, X_test, y_test, cmap=plt.cm.Blues)
# Turn the style sheet's grid off so it does not cross the matrix cells.
plt.grid(False)
plt.show()


In [None]:
# XGBpred holds probabilities, so threshold at 0.5 to obtain class labels
# for the report.
print(classification_report(y_test,XGBpred>0.5))

#### CV XGBOOST

In [None]:
# 10-fold cross-validated ROC-AUC of the XGBoost model on the training split.
scores = cross_val_score(xgb_model, X_train, y_train,scoring='roc_auc', cv=10)
print('Cross-Validation ROC_AUC Scores', scores)

In [None]:
# Wrap the fold scores in a Series and show their min / mean / max.
scores = pd.Series(scores)
scores.min(), scores.mean(), scores.max()

# Tuning HyperParameter XGBoost

In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
# hyperopt search space for the XGBoost tuning run.
# hp.quniform entries are sampled in steps of 1 between the two bounds;
# hp.uniform entries are sampled continuously between the two bounds.
# 'n_estimators' and 'seed' are plain constants, not searched.
space={ 'max_depth': hp.quniform("max_depth", 3,18,1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 300,
        'seed': 0
      }

In [None]:
def objective(space):
    """Train one XGBoost candidate and return its loss for hyperopt.

    Parameters
    ----------
    space : dict
        One sample of hyperparameters. Must contain ``n_estimators``,
        ``max_depth``, ``gamma``, ``reg_alpha``, ``min_child_weight`` and
        ``colsample_bytree``; ``reg_lambda`` and ``seed`` are used when present.

    Returns
    -------
    dict
        ``{'loss': -ROC_AUC, 'status': STATUS_OK}``. The AUC is negated so a
        minimiser favours the candidate with the highest ROC-AUC.

    Reads the notebook-level X_train / y_train / X_test / y_test.
    NOTE(review): early stopping and the returned loss both use the test
    split, so the tuned score is optimistic — consider a separate validation
    split.
    """
    params = dict(
        n_estimators=space['n_estimators'],
        # Integer-valued parameters are cast because they arrive as floats.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        min_child_weight=int(space['min_child_weight']),
        # colsample_bytree is a fraction; it must stay a float (casting it to
        # int would truncate every sampled value below 1 to 0).
        colsample_bytree=space['colsample_bytree'],
    )
    # Forward the remaining entries of the space when the caller supplies
    # them, so they actually influence the candidate.
    if 'reg_lambda' in space:
        params['reg_lambda'] = space['reg_lambda']
    if 'seed' in space:
        params['random_state'] = space['seed']
    clf = xgb.XGBClassifier(**params)

    evaluation = [( X_train, y_train), ( X_test, y_test)]

    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)

    # predict() already returns class labels, so no thresholding is needed.
    pred = clf.predict(X_test)
    y_score = clf.predict_proba(X_test)[:,1]
    accuracy = accuracy_score(y_test, pred)
    Roc_Auc_Score = roc_auc_score(y_test, y_score)
    print ("ROC-AUC Score:",Roc_Auc_Score)
    print ("SCORE:", accuracy)
    return {'loss': -Roc_Auc_Score, 'status': STATUS_OK }

In [None]:
# hyperopt Trials object; it is passed as `trials=` to the commented-out
# fmin call in the next cell.
trials = Trials()

In [None]:
# best_hyperparams = fmin(fn = objective,
#                         space = space,
#                         algo = tpe.suggest,
#                         max_evals = 100,
#                         trials = trials)

In [None]:
# print("The best hyperparameters are : ","\n")
# print(best_hyperparams)

In [None]:
# XGBoost model with hard-coded hyperparameters, fitted on the training split.
# NOTE(review): the values presumably come from an earlier run of the
# hyperopt search (the fmin cell above is commented out) — verify.
xgb_model=xgb.XGBClassifier(n_estimators = space['n_estimators'], max_depth = 9, gamma = 1.6331807156755782, reg_lambda = 0.46569712565971155,
                            reg_alpha = 40.0, min_child_weight=2.0,colsample_bytree = 0.8255017098966712)
xgb_model.fit(X_train,y_train)

In [None]:
# Probability column at index 1 for the held-out split, from the tuned model.
XGBpred = xgb_model.predict_proba(X_test)[:,1]

In [None]:
# ROC curve of the tuned XGBoost probabilities on the held-out split.
fpr, tpr, thresholds = roc_curve(y_test, XGBpred)
plot_ROC(fpr, tpr, m_name='XGBoost')

In [None]:
# Confusion matrix of the tuned XGBoost model on the held-out split.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator replaces it on newer versions.
plot_confusion_matrix(xgb_model, X_test, y_test, cmap=plt.cm.Blues)
# Turn the style sheet's grid off so it does not cross the matrix cells.
plt.grid(False)
plt.show()

In [None]:
# XGBpred holds probabilities, so threshold at 0.5 to obtain class labels
# for the report.
print(classification_report(y_test,XGBpred>0.5))

# Test Case

In [None]:
# Load the test CSV and show it as the cell output.
test = pd.read_csv('../input/health-insurance-cross-sell-prediction/test.csv') 
test

In [None]:
# Encode age as a decade index: 1 for [20, 30), 2 for [30, 40), … 6 for
# [70, 80). pd.cut does this in one vectorised pass instead of a Python-level
# loop over every row, and the left-closed bins also place non-integer ages
# (e.g. 29.5) in their decade.
age_edges = [20, 30, 40, 50, 60, 70, 80]
gr_age = (
    pd.cut(test['Age'], bins=age_edges, labels=False, right=False)
    .add(1)          # bin codes start at 0, segments start at 1
    # Everything outside [20, 80) — including missing ages — goes to the
    # catch-all segment 7, the same fallback the encoding has always used.
    .fillna(7)
    .astype(int)
)

test['Group_Age'] = gr_age
test.head()

In [None]:
# One-hot encode the three categorical columns in a single call. The first
# level of each column is dropped, and the dummy columns are appended in the
# order the source columns are listed here.
test = pd.get_dummies(
    test,
    columns=['Gender', 'Vehicle_Damage', 'Vehicle_Age'],
    drop_first=True,
)

In [None]:
# Keep the ids aside; they are dropped from the features below and are
# needed again for the submission file.
test_id = test['id']

In [None]:
# Remove the identifier and the raw age (Group_Age is kept in its place).
test = test.drop(columns=['id', 'Age'])

In [None]:
# Apply the scaling learned from the training data. The scaler must not be
# refit here, otherwise the test features would be standardised with
# different means/variances than the ones the model was trained on.
test_scaled = scaler.transform(test)

## XGBoost Prediction Output

In [None]:
# Probability at column index 1 for every test row, collected into a list.
test_xgb_proba = list(xgb_model.predict_proba(test_scaled)[:, 1])

In [None]:
# Submission table: one row per test id with its predicted probability.
submission_xgb = pd.DataFrame(data = {'id': test_id, 'Response': test_xgb_proba})

In [None]:
# Write the submission without the index column, then preview the first rows.
submission_xgb.to_csv('submission.csv', index = False)
submission_xgb.head()