# Reference Links
* https://www.kaggle.com/yashvi/vehicle-insurance-eda-and-boosting-models
* https://www.kaggle.com/roshankumarg/rank-10-solution-cross-sell-prediction-hackathon
* https://www.kaggle.com/isaienkov/insurance-prediction-eda-and-modeling-acc-88
* https://towardsdatascience.com/boost-your-models-performance-with-these-fantastic-libraries-8dc10579b7ff
* https://stats.stackexchange.com/questions/421582/how-to-identify-a-case-of-overfitting-using-stratified-k-fold-cross-validation
* https://towardsdatascience.com/how-to-train-test-split-kfold-vs-stratifiedkfold-281767b93869
* https://www.analyticsvidhya.com/blog/2020/10/getting-started-with-feature-engineering/
* https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
* https://wizardforcel.gitbooks.io/tensorflow-examples-aymericdamien/content/

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# for visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# import holoviews as hv

# Testing
import scipy
import scipy.stats as st

# Modeling
import xgboost
from xgboost import XGBClassifier

import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn import model_selection
# Splitting Dataset
from sklearn.model_selection import train_test_split, cross_val_score

# Scoring
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import KFold

# Class Imbalance
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# from imblearn import over_sampling

%matplotlib inline

In [None]:
# Print the version of every core library used in this notebook, one per line.
for label, module in (
    ('Numpy', np),
    ('Pandas', pd),
    ('Matplotlib', matplotlib),
    ('Seaborn', sns),
    ('Scipy', scipy),
    ('Sklearn', sklearn),
    ('XGBoost', xgboost),
):
    print(label + ' Version : ' + module.__version__)

# 1. Data Description

![image.png](attachment:image.png)

# 2. Data Collection

In [None]:
# Load the training table from the Kaggle input directory and preview its first rows.
train = pd.read_csv('../input/health-insurance-cross-sell-prediction/train.csv')
train.head()

In [None]:
# Region codes of the ten regions with the largest number of ids, most frequent first.
ids_per_region = train[['id', 'Region_Code']].groupby('Region_Code', as_index=False).count()
busiest_regions = ids_per_region.sort_values('id', ascending=False).head(10)
busiest_regions.Region_Code.to_list()

In [None]:
# All rows that belong to the ten regions with the smallest number of ids.
ids_per_region = train[['id', 'Region_Code']].groupby('Region_Code', as_index=False).count()
rarest_region_codes = ids_per_region.sort_values('id').head(10).Region_Code.to_list()
train[train.Region_Code.isin(rarest_region_codes)]

In [None]:
# test = pd.read_csv('../input/health-insurance-cross-sell-prediction/test.csv')
# test.head()

Target variable: the <strong>Response</strong> column.

# 3. Data Understanding

## 3.1. General Information

In [None]:
# Dimensions of the training table as (rows, columns).
train.shape

In [None]:
# Per-column summary of the training table (dtypes and non-null counts).
train.info()

In [None]:
# Summary statistics of the training table.
train.describe()

## 3.2. Numerical Data

In [None]:
# Integer and float dtypes that are treated as numerical features.
numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Keep the numerical columns in their own frame, then show their names and the frame's shape.
data_num = train.select_dtypes(include=numerics)
display(data_num.columns)
print(data_num.shape)

## 3.3. Non-Numerical data

In [None]:
# Keep the object-dtype (text) columns in their own frame, then show their names and the frame's shape.
data_cat = train.select_dtypes(include=['object'])
display(data_cat.columns)
print(data_cat.shape)

# 4. Exploratory Data Analysis

## 4.1. Descriptive Statistics

### 4.1.1. Numerical Data

In [None]:
# Summary statistics for an explicit list of numerical columns.
described_columns = [
    'id', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response',
]
train[described_columns].describe()

### 4.1.2. Non-Numerical Data

In [None]:
# Number of rows for each Gender value.
train['Gender'].value_counts()

In [None]:
# Number of rows for each Vehicle_Age value.
train['Vehicle_Age'].value_counts()

In [None]:
# Number of rows for each Vehicle_Damage value.
train['Vehicle_Damage'].value_counts()

In [None]:
# Summary statistics of the three non-numerical columns.
train[['Gender', 'Vehicle_Age', 'Vehicle_Damage']].describe()

## 4.2. Grouping Feature

In [None]:
def _age_group(age):
    """Return the age bucket label: '20-35', '36 - 50' or '> 50'."""
    if age > 50:
        return '> 50'
    if age > 35:
        return '36 - 50'
    return '20-35'


def _vintage_group(vintage):
    """Return the vintage bucket label; 100 belongs to '100 - 200'."""
    # The original test `(x > 100) and (x < 200)` skipped exactly 100, which
    # then fell through to '200 - 300'.
    if vintage < 100:
        return '0-100'
    if vintage < 200:
        return '100 - 200'
    return '200 - 300'


def _premium_group(premium):
    """Return the annual-premium bucket label."""
    if premium > 450000:
        return '> 450K'
    if premium > 150000:
        return '150K - 450K'
    return '0 - 150K'


# Start from id/Response only; .copy() makes this an independent frame so the
# column assignments below never write into a view of `train`.
trainGroup = train.loc[:, train.columns.intersection(['id', 'Response'])].copy()
trainGroup['Age-Group'] = train['Age'].apply(_age_group)
trainGroup['Vintage-Group'] = train['Vintage'].apply(_vintage_group)
trainGroup['Annual_Premium-Group'] = train['Annual_Premium'].apply(_premium_group)

trainGroup

## 4.3. Univariate Analysis

### 4.3.1. Numerical Data

#### 4.3.1.1. Visualize Driving License

In [None]:
# Driving_License counts: overall (left panel) and split by Response (right panel).
fig, ax = plt.subplots(1, 2, figsize=(15, 8), sharey=True)

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=train['Driving_License'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=train['Driving_License'], hue=train['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of both panels.
for axis in ax:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

plt.suptitle('Response to Driving License',y=1, fontsize=24,color='dodgerblue',fontweight='bold')

fig.tight_layout()

plt.savefig('./driving-license.jpg')
plt.show();

#### 4.3.1.2. Visualize Previously Insured

In [None]:
# Previously_Insured counts: overall (left panel) and split by Response (right panel).
fig, ax = plt.subplots(1, 2, figsize=(15, 8), sharey=True)

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=train['Previously_Insured'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=train['Previously_Insured'], hue=train['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of both panels.
for axis in ax:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

plt.suptitle('Response to Previously Insured',y=1, fontsize=24,color='dodgerblue',fontweight='bold')

fig.tight_layout()

plt.savefig('./previously-insured.jpg')
plt.show();

#### 4.3.1.3. Visualize Age

In [None]:
# Age: grouped counts, grouped counts by Response, distribution and box plot.
fig, ax = plt.subplots(1, 4, figsize=(26, 8))

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=trainGroup['Age-Group'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=trainGroup['Age-Group'], hue=trainGroup['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of the two count panels.
for axis in ax[:2]:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

ax[1].legend(loc="upper left", title="Response",)

# NOTE(review): distplot is deprecated in seaborn; kept so the panel looks the same.
sns.distplot(train['Age'], ax=ax[2])
# y= draws the box vertically regardless of the seaborn version.
sns.boxplot(y=train['Age'], ax=ax[3])

plt.suptitle('Distribution Age',y=1, fontsize=24,color='dodgerblue',fontweight='bold');

fig.tight_layout()

plt.savefig('./age.jpg')
plt.show();

#### 4.3.1.4. Visualize Vintage

In [None]:
# Vintage: grouped counts, grouped counts by Response, distribution and box plot.
fig, ax = plt.subplots(1, 4, figsize=(26, 8))

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=trainGroup['Vintage-Group'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=trainGroup['Vintage-Group'], hue=trainGroup['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of the two count panels.
for axis in ax[:2]:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

# NOTE(review): distplot is deprecated in seaborn; kept so the panel looks the same.
sns.distplot(train['Vintage'], ax=ax[2])
# y= draws the box vertically regardless of the seaborn version.
sns.boxplot(y=train['Vintage'], ax=ax[3])

plt.suptitle('Distribution Vintage',y=1, fontsize=24,color='dodgerblue',fontweight='bold');

fig.tight_layout()

plt.savefig('./vintage.jpg')
plt.show();

#### 4.3.1.5. Visualize Annual Premium

In [None]:
# Annual_Premium: grouped counts, grouped counts by Response, distribution and box plot.
fig, ax = plt.subplots(1, 4, figsize=(26, 8))

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=trainGroup['Annual_Premium-Group'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=trainGroup['Annual_Premium-Group'], hue=trainGroup['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of the two count panels.
for axis in ax[:2]:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

# NOTE(review): distplot is deprecated in seaborn; kept so the panel looks the same.
sns.distplot(train['Annual_Premium'], ax=ax[2])
# y= draws the box vertically regardless of the seaborn version.
sns.boxplot(y=train['Annual_Premium'], ax=ax[3])

plt.suptitle('Distribution Annual Premium',y=1, fontsize=24,color='dodgerblue',fontweight='bold');

fig.tight_layout()

plt.savefig('./annual-premium.jpg')
plt.show();

#### 4.3.1.6. Visualize Region Code

In [None]:
# Region_Code counts for the ten regions with the most ids: overall (top) and by Response (bottom).
fig, ax = plt.subplots(2, figsize=(26, 15), sharey=True)

# Restrict the data to the ten most frequent region codes.
ids_per_region = train[['id', 'Region_Code']].groupby('Region_Code', as_index=False).count()
top_region_codes = ids_per_region.sort_values('id', ascending=False).head(10).Region_Code.to_list()
data_region_code = train[train.Region_Code.isin(top_region_codes)]

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=data_region_code['Region_Code'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=data_region_code['Region_Code'], hue=data_region_code['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of both panels.
for axis in ax:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

ax[0].set_title('Distribution Region Code',fontsize=24,color='dodgerblue',fontweight='bold')
ax[1].set_title('Response to Region Code',fontsize=24,color='dodgerblue',fontweight='bold')

fig.tight_layout();

### 4.3.2. Non-Numerical Data

#### 4.3.2.1. Visualize Gender

In [None]:
# Gender counts: overall (left panel) and split by Response (right panel).
fig, ax = plt.subplots(1, 2, figsize=(15, 8), sharey=True)

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=train['Gender'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=train['Gender'], hue=train['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of both panels.
for axis in ax:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

plt.suptitle('Response to Gender',y=1, fontsize=24,color='dodgerblue',fontweight='bold');

fig.tight_layout()

plt.savefig('./gender.jpg')
plt.show();

#### 4.3.2.2. Visualize Vehicle Age

In [None]:
# Vehicle_Age counts: overall (left panel) and split by Response (right panel).
fig, ax = plt.subplots(1, 2, figsize=(15, 8), sharey=True)

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=train['Vehicle_Age'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=train['Vehicle_Age'], hue=train['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of both panels.
for axis in ax:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

plt.suptitle('Response to Vehicle Age',y=1, fontsize=24,color='dodgerblue',fontweight='bold');

fig.tight_layout()

plt.savefig('./vehicle-age.jpg')
plt.show();

#### 4.3.2.3. Visualize Vehicle Damage

In [None]:
# Vehicle_Damage counts: overall (left panel) and split by Response (right panel).
fig, ax = plt.subplots(1, 2, figsize=(15, 8), sharey=True)

# The series is passed as x= because recent seaborn releases interpret the
# first positional argument as `data`.
sns.countplot(x=train['Vehicle_Damage'], palette=sns.color_palette("Set2"), ax=ax[0])
sns.countplot(x=train['Vehicle_Damage'], hue=train['Response'],
              palette=sns.color_palette("husl", 8), ax=ax[1])

# Write the count above every bar of both panels.
for axis in ax:
    for bar in axis.patches:
        axis.annotate(format(bar.get_height(), ',.0f'),
                      (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                      ha='center',
                      va='center',
                      xytext=(0, 10),
                      textcoords='offset points',
                      color='black',
                      # size= and fontsize= are aliases; only one may be given.
                      fontsize=15,
                      fontweight='bold')

plt.suptitle('Response to Vehicle Damage',y=1, fontsize=24,color='dodgerblue',fontweight='bold')

fig.tight_layout()

plt.savefig('./vehicle-damage.jpg')
plt.show();

## 4.4. Multivariate Analysis

In [None]:
# Pairwise correlations, rounded for display. `train` still holds text columns
# at this point (Gender, Vehicle_Age, Vehicle_Damage), so only numeric/bool
# columns are selected explicitly; pandas >= 2 raises if text columns reach corr().
corr_ = train.select_dtypes(include=[np.number, 'bool']).corr().round(3)

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(21, 10))
    ax = sns.heatmap(corr_, annot=True, cmap = "BuPu")

# The original referenced plt.tight_layout without calling it.
plt.tight_layout()
# plt.savefig('fig/matrix correlation.png');

# 5. Pre-Processing

## 5.1. Feature Engineering

### 5.1.1. Missing Value

In [None]:
# Finding Missing Value
data_missing_value = train.isnull().sum().reset_index()
data_missing_value.columns = ['feature','missing_value']
data_missing_value = data_missing_value[data_missing_value['missing_value'] > 0]

data_missing_value

### 5.1.2. Duplicate Value

In [None]:
# Number of rows that are exact duplicates of an earlier row.
train.duplicated().sum()

### 5.1.3. Scaling with StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# A single scaler object that is re-fitted for every column in the loop below.
ss = StandardScaler()

# Columns to standardise.
ss_list = [
    'Annual_Premium',
    'Vintage',
]

# NOTE(review): the scaler is fitted on the whole table, before the train/test
# split in section 5.2, so rows that later become the test set contribute to
# the scaling statistics. Consider fitting on the training split only.
# Each column is scaled in place; after the loop `ss` holds the fit of the
# last column only.
for x in ss_list :
    train[[x]] = ss.fit_transform(train[[x]])

### 5.1.4. Reformat Label

In [None]:
# Lookup tables: Gender becomes a 0/1 code, Previously_Insured becomes a 'No'/'Yes' label.
gd = dict(Male=0, Female=1)
pi = dict([(0, 'No'), (1, 'Yes')])

# Apply each lookup to its column; values missing from a lookup become NaN.
for column, lookup in (('Gender', gd), ('Previously_Insured', pi)):
    train[column] = train[column].map(lookup)
train

In [None]:
# sns.boxplot(np.log(train['Annual_Premium']), orient='v')
# train['Annual_Premium'].describe()

### 5.1.5. One Hot Encoding

In [None]:
# Columns to expand into indicator (dummy) columns.
dummy_source_cols = ['Vehicle_Damage', 'Previously_Insured', 'Vehicle_Age']

# Every level gets its own indicator column here (drop_first is not used).
train_dummies = pd.get_dummies(train[dummy_source_cols])

# Put the indicator columns next to the original columns.
train_d = pd.concat((train, train_dummies), axis=1)
train_d.head()

### 5.1.6. Rename and Casting Feature

In [None]:
# Replace the '<' / '>' characters and spaces in the Vehicle_Age indicator names.
renamed_columns = {
    "Vehicle_Age_< 1 Year": "Vehicle_Age_lt_1_Year",
    "Vehicle_Age_> 2 Years": "Vehicle_Age_gt_2_Years",
}
train_d = train_d.rename(columns=renamed_columns)

# Store these indicator columns as integers.
for indicator in ('Vehicle_Age_lt_1_Year', 'Vehicle_Age_gt_2_Years', 'Vehicle_Damage_Yes'):
    train_d[indicator] = train_d[indicator].astype('int')

### 5.1.7. Drop Feature

In [None]:
# Columns that are no longer needed: the row id, the raw categorical columns
# that were one-hot encoded, and one indicator level of Vehicle_Damage and
# Vehicle_Age.
# NOTE(review): no Previously_Insured_* indicator is dropped here, so both of
# its levels appear to remain in the feature set - confirm this is intended.
columns_to_drop = [
    'id',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Previously_Insured',
    'Vehicle_Damage_No',
    'Vehicle_Age_1-2 Year',
]
train_d = train_d.drop(columns=columns_to_drop)
train_d.head()

## 5.2. Splitting Values

In [None]:
train_pp = train_d

# Target as a NumPy array, features as a DataFrame without the target column.
y = train_pp['Response'].values
X = train_pp.drop(columns=['Response'])
print("Shape of X is {} and that of y is {}".format(X.shape, y.shape))

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

print('Shape of training set ', X_train.shape)
print('Shape of test set ', X_test.shape)

# 6. Modeling

## 6.1. Define Model

In [None]:
# Settings for the XGBoost candidate.
xgb_settings = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}

# Candidate classifiers; all but XGBoost use the library defaults.
classifications = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(**xgb_settings),
]

# Empty results table, filled by the modelling loop.
result_model = pd.DataFrame(columns=['Method', 'roc_auc_score'])
result_model

## 6.2. Modeling Process

In [None]:
# Fit every candidate on the training split and score it on the hold-out split
# with ROC AUC, using the predicted probability of the positive class.
score_rows = []
for model in classifications:
    model.fit(X_train, y_train)
    y_score = model.predict_proba(X_test)[:,1]

    # Class name of the estimator, e.g. 'LogisticRegression'.
    method = type(model).__name__

    # float() works whether the metric returns a NumPy scalar or a Python float.
    roc_auc_score_ = float(roc_auc_score(y_test, y_score))

    score_rows.append({
        'Method': method,
        'roc_auc_score': roc_auc_score_,
    })

# DataFrame.append was removed in pandas 2.0, so the rows are collected first
# and added to the results table in one step.
new_scores = pd.DataFrame(score_rows, columns=result_model.columns)
if result_model.empty:
    result_model = new_scores
else:
    result_model = pd.concat([result_model, new_scores], ignore_index=True)

result_model

## 6.3. Cross Validation

In [None]:
# NOTE: despite its name, `rf` is an XGBoost classifier.
rf = XGBClassifier(max_depth= 2, eta= 1, objective= 'binary:logistic')
# cross_val_score fits its own clones, so this fit does not influence the
# scores below; it is kept so that `rf` stays a fitted model.
rf.fit(X_train, y_train)

# shuffle=True is needed for random_state to take effect; current scikit-learn
# raises a ValueError when random_state is set while shuffle is False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=41)

# ROC AUC of each of the 10 folds, computed over the full X / y.
results_k = model_selection.cross_val_score(rf, X, y, cv=kfold, scoring='roc_auc')
results_k

## 6.4. Train, Test, Val Score

In [None]:
from sklearn.model_selection import StratifiedKFold

roc_auc_list = []      # ROC AUC on the validation part of each fold
roc_auc_holdout = []   # ROC AUC on the hold-out test split, per fold
roc_auc_train = []     # ROC AUC on the training part of each fold
folds = []
model = XGBClassifier(max_depth= 2, eta= 1, objective= 'binary:logistic')
# shuffle=True is needed for random_state to take effect; current scikit-learn
# raises a ValueError when random_state is set while shuffle is False.
kfold = KFold(n_splits=10, shuffle=True, random_state=41)
for i, (train_index, test_index) in enumerate(kfold.split(X_train)):
    # The fold indices are positions within X_train / y_train. The original
    # applied them to the full X / y, which pulled hold-out rows into the folds.
    X1_train, X1_valid = X_train.iloc[train_index], X_train.iloc[test_index]
    y1_train, y1_valid = y_train[train_index], y_train[test_index]
    model.fit(X1_train, y1_train)
    # Fit quality on the rows the model was trained on (9/10 of the training split).
    train_pred = model.predict_proba(X1_train)[:,1]
    # Validation rows of this fold (1/10 of the training split).
    pred = model.predict_proba(X1_valid)[:,1]
    # Hold-out test split, never used for fitting (20% of the data).
    pred_holdout = model.predict_proba(X_test)[:,1]

    print('Prediction length on validation set, XGBoost Classifier, fold ', i, ': ', len(pred))

    folds.append(i)
    roc_auc_list.append(roc_auc_score(y1_valid, pred))
    roc_auc_holdout.append(roc_auc_score(y_test, pred_holdout))
    roc_auc_train.append(roc_auc_score(y1_train, train_pred))

In [None]:
# Per-fold ROC AUC on the training part of each fold.
roc_auc_train # train

In [None]:
# Per-fold ROC AUC on the hold-out test split.
roc_auc_holdout # test

In [None]:
# Per-fold ROC AUC on the validation part of each fold.
roc_auc_list # val

## 6.5. Visualize Train, Test, Val Score

In [None]:
# import matplotlib as mpl
# matplotlib.rcParams.update(mpl.rcParamsDefault)

In [None]:
# Tick positions for the score axis.
rg = np.arange(0.840,0.900,0.005)

# Mean and spread of the per-fold ROC AUC for each data partition.
train_mean = np.mean(roc_auc_train)
test_mean = np.mean(roc_auc_holdout)
val_mean = np.mean(roc_auc_list)

train_std = np.std(roc_auc_train)
test_std = np.std(roc_auc_holdout)
val_std = np.std(roc_auc_list)

plt.style.use('tableau-colorblind10')

# One line per partition; the x position of each point is the fold index.
fig, ax = plt.subplots(figsize=(20,10))
ax.plot(roc_auc_train, label='Train', marker='o', linestyle='-.')
ax.plot(roc_auc_holdout, label='Test', marker='o', linestyle=':')
ax.plot(roc_auc_list, label='Val', marker='o', linestyle='--')

text_m = '''
    * Train Mean : ''' + str(format(train_mean, '.5f')) + '''
    * Test Mean : ''' + str(format(test_mean, '.5f')) + ''' 
    * Val Mean : ''' + str(format(val_mean, '.5f')) + '''     
'''

ax.text(6,0.841,text_m,horizontalalignment='left',color='black',fontsize=16,fontweight='normal')


text_s = '''
    * Train Standard Deviation : ''' + str(format(train_std, '.5f')) + '''
    * Test Standard Deviation : ''' + str(format(test_std, '.5f')) + ''' 
    * Val Standard Deviation : ''' + str(format(val_std, '.5f')) + '''     
'''

ax.text(0.5,0.841,text_s,horizontalalignment='left',color='black',fontsize=16,fontweight='normal')


# The x axis is the fold index, and the plotted value is a score, not an error.
ax.set_xlabel('Fold', fontsize=18, labelpad=20)
ax.set_ylabel('ROC_AUC Score', fontsize=18, labelpad=10)

ax.set_title('XGBoost - Train, Test, Val ROC AUC', pad=20, fontsize=30)

ax.legend()
ax.set_yticks(rg)

sns.despine()

# Apply the layout before saving so the saved image matches what is shown.
plt.tight_layout()

plt.savefig('./xgb-ttv.jpg')

plt.show();

# 7. Feature Importance

In [None]:
# Fit an XGBoost classifier with default settings on the training split.
clf = XGBClassifier()
clf.fit(X_train, y_train)

# One importance value per feature column, largest first.
feature_importances = pd.DataFrame(clf.feature_importances_,
                                   index = X.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)

# Horizontal bars; reset_index() exposes the feature names as the 'index' column.
fig, ax = plt.subplots(1,1, figsize=(10,15))
sns.barplot(x='importance', y='index', color='#800000',data=feature_importances.reset_index());

plt.title('Feature Importance', fontsize=30, pad=20)

# Apply the layout before saving so the saved image matches what is shown.
plt.tight_layout()
plt.savefig('./feature-importance.jpg')
plt.show();

# 8. Tuning Hyperparameter (One Time Running)

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'max_depth': [10,15],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [3,4,5],
#     'n_estimators': [100, 200, 300]
# }
# # Create a based model
# rf = RandomForestClassifier()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)

# # Fit the grid search to the data
# grid_search.fit(X_train,y_train)

In [None]:
# grid_search.best_estimator_

In [None]:
# rff = RandomForestClassifier(max_depth=10, max_features=2, min_samples_leaf=5,
#                        min_samples_split=3)
# rff.fit(X_train, y_train)
# y_pred = rff.predict_proba(X_test)

# print('Random Forest Classifier ', roc_auc_score_)

In [None]:
# result_model