In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# ML
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, PowerTransformer, QuantileTransformer
from xgboost import XGBClassifier
# Imbalanced libraries
from imblearn.under_sampling import TomekLinks, NeighbourhoodCleaningRule, NearMiss, RepeatedEditedNearestNeighbours, RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.metrics import classification_report,roc_curve, auc, roc_auc_score
from imblearn.pipeline import Pipeline
# Explanations and graphs
import shap
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots
!pip install shap -U

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="text-align: center; font-size:50px;"> Imbalanced assurance data</h1>

<hr>

## Table of contents

- [Exploration](#a)
- [Class Assurance](#b)
- [Explanations](#c)

## First, let's explore the data: <a id=a></a>
<hr>

In [None]:
# Training frame used by the exploration cells below.
data = pd.read_csv('/kaggle/input/imbalanced-data-practice/aug_train.csv')
# NOTE(review): `data_test` is re-assigned further down by the hold-out split of
# aug_train.csv, and nothing visible in between reads the frame loaded here.
data_test = pd.read_csv('/kaggle/input/imbalanced-data-practice/aug_test.csv')

In [None]:
# Print pandas' summary of the training frame (columns, dtypes, non-null counts).
data.info()

### Correlation plots

The following figures show the correlations between variables, computed with the Kendall and Spearman methods:

In [None]:
plt.rcParams['figure.figsize'] = (15,15)
fig = plt.figure()
fig.subplots_adjust(wspace = 0, hspace = 0.3)

# Rank correlations only make sense for numeric columns. Selecting them
# explicitly keeps the cell working on pandas versions where DataFrame.corr no
# longer drops object columns silently (it raises on string data instead).
numeric_data = data.select_dtypes(include=['number', 'bool'])

# One heatmap per correlation method, stacked vertically (subplot codes 211 / 212).
heatmap_specs = [
    (211, 'kendall', 'Correlation between variables with kendall method'),
    (212, 'spearman', 'Correlation between variables spearman method'),
]
for position, method, title in heatmap_specs:
    ax = fig.add_subplot(position)
    sns.heatmap(numeric_data.corr(method=method), annot=True, ax=ax)
    # Title the axes explicitly instead of relying on pyplot's "current axes".
    ax.set_title(title)

### Parallel categories diagram

This parallel categories plot is a quick way to understand the data. We can clearly observe the proportion of each category involved in the output response 1 (positive response).

To understand the diagram:
 - $P(\text{color (blue)} \cap \text{something (Male)})$ means the proportion of males with response 0 among all people
 - $P(\text{something (Male)} \mid \text{color (blue)})$ means the proportion of males among the people with response 0
 - $P(\text{color (blue)} \mid \text{something (Male)})$ means the proportion of response 0 among males

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import interact, interactive, fixed, interact_manual

# Terciles of the annual premium, used as bin edges for the categorical version.
lim1 = np.quantile(data.Annual_Premium,0.33)
lim2 = np.quantile(data.Annual_Premium,0.66)


def _three_bins(values, low, high, labels):
    """Label each value as below/between/above the two edges.

    parameters:
     - values: iterable of numbers.
     - low, high: numbers.
       Bin edges; a value equal to an edge goes to the lower bin.
     - labels: tuple of three strings.
       Labels for `val <= low`, `low < val <= high` and everything else.

    The middle bin is tested with `val <= high` only, so there is no gap
    between the first and second bin for non-integer values (e.g. 25.5).
    """
    below, between, above = labels
    return [below if val <= low else between if val <= high else above for val in values]


# Categorical versions of the continuous features (added to `data` in place).
data['Annual_Premium_cat'] = _three_bins(
    data['Annual_Premium'], lim1, lim2,
    ('<'+str(lim1), str(lim1)+"-"+str(lim2), '>'+str(lim2)))
data['Age_cat'] = _three_bins(data.Age, 25, 50, ('<25', '25-50', '>50'))
data['Region_cat'] = _three_bins(data.Region_Code, 15, 30, ('<15', '15-30', '>30'))

categorical_dimensions = ['Gender', 'Age_cat', 
                          'Driving_License','Vehicle_Age', 'Region_cat',
                          'Vehicle_Damage', 'Annual_Premium_cat', 'Response']

# One Parcats dimension per categorical column, in display order.
dimensions = [dict(values=data.loc[:,label], label=label) for label in categorical_dimensions]

# Ribbons are coloured by the response: 0 -> lightsteelblue, 1 -> mediumseagreen.
response = data.Response
colorscale = [[0, 'lightsteelblue'], [1, 'mediumseagreen']]


fig = go.Figure(data = [go.Parcats(dimensions=dimensions,
        line={'color': response, 'colorscale': colorscale},
        hoveron='color', hoverinfo='count+probability',
        labelfont={'size': 18, 'family': 'Times'},
        tickfont={'size': 16, 'family': 'Times'},
        arrangement='freeform')])

fig.update_layout(
        title="Parallel categories diagram",
        height=600, width=1200,
        dragmode='lasso', hovermode='closest')

fig.show()

### Violin plots

In [None]:
plt.rcParams['figure.figsize'] = (15,10)

# One violin plot per numeric feature on a 2x2 grid, split by gender within each response.
violin_features = ['Annual_Premium', 'Policy_Sales_Channel', "Age", "Region_Code"]
for position, feature in enumerate(violin_features, start=1):
    plt.subplot(2, 2, position)
    sns.violinplot(x="Response", y=feature, hue='Gender', data=data,
                   split=True, palette="Set3", bw=.2, cut=1, linewidth=1)
    sns.despine(left=True)

### Distributions in 2D

In [None]:
plt.rcParams['figure.figsize'] = (15,20)

# Every unordered pair of the four numeric features, each listed exactly once
# (the original list repeated ('Annual_Premium', 'Policy_Sales_Channel')).
columns = [('Annual_Premium', 'Policy_Sales_Channel'),
           ('Annual_Premium', "Age"),
           ('Annual_Premium', "Region_Code"),
           ('Policy_Sales_Channel', "Age"),
           ('Policy_Sales_Channel',"Region_Code"),
           ("Age", "Region_Code")]

# Loop-invariant work: the density estimates use the first 10000 rows only,
# split once by response, and share a single colormap.
sample = data[:10000]
responded_no = sample.query('Response==0')
responded_yes = sample.query('Response==1')
cmap = sns.cubehelix_palette(start=2, light=1, as_cmap=True, rot=-.3)

for idx, (x_col, y_col) in enumerate(columns):
    plt.subplot(3,3,idx+1)

    # Name x and y explicitly so seaborn draws a bivariate density; a two-column
    # frame passed through `data=` alone is treated as wide-form input.
    sns.kdeplot(
        data=responded_no, x=x_col, y=y_col, cmap=cmap, label='Response: No',
        cut=10, thresh=0, levels=15,
    )
    sns.kdeplot(
        data=responded_yes, x=x_col, y=y_col, color='b', alpha=0.5, label='Response: Yes'
    )
    # Clip the long right tail of the premium so the bulk of the density stays visible.
    if x_col == 'Annual_Premium':
        plt.xlim(0,100000)

    plt.legend()
    plt.xlabel(x_col)
    plt.ylabel(y_col)

### Conclusion:

Male and female customers have the same distributions. We can see that people around 40 years old are more likely to accept the assurance, while young people refuse it (violin plots). Annual premiums and region codes have similar distributions for positive and negative responses, with maxima for region codes around 30 and 9 and for annual premiums between 20000 and 60000 (dollars, I suppose?). The proportion of positive responses is higher with a policy sales channel around 125.

Finally, from the previous plots, some patterns define a positive response: a man or woman aged about 40 living in zone 30 or 10, owner of a vehicle aged 2 years with an annual insurance between 20000 euros and 60000 euros, who has already had an accident. Let's confirm that with an ML algorithm.

## Assurance class <a id=b></a>

In [None]:
# NOTE(review): presumably loads SHAP's JavaScript so the interactive plots
# (e.g. the force plot below) render in the notebook — confirm against the shap docs.
shap.initjs()

class Assurance:
    """
    Bundle the preprocessing, resampling, training and evaluation steps of the
    binary classification problem around a single XGBClassifier pipeline.
    """
    
    def __init__(self, X, X_test, y, col_to_drop, xgb_kws={'learning_rate':0.01,'n_estimators':500, 'max_depth':8, 'scale_pos_weight':6}):
        """
        Split features from target, build pairwise categorical features,
        ordinal-encode the categorical columns and create the model.

        parameters:
         - X, X_test: pandas.DataFrame.
           Training and test frames; both must contain the target column.
         - y: string.
           Name of the target column.
         - col_to_drop: list[string].
           Columns removed from the features in addition to the target.
         - xgb_kws: dict.
           Keyword arguments forwarded to XGBClassifier (only read, never modified).

        attributes:
         - self.data, self.y: the training inputs and outputs
         - self.data_test, self.y_test: the test inputs and outputs
         - self.cat_values, self.num_values: column names of the categorical and numerical features
         - self.encoder: OrdinalEncoder
         - self.model: pipeline wrapping the XGBClassifier
        """
        self.data = X.drop([y]+col_to_drop, axis=1)
        self.data_test = X_test.drop([y]+col_to_drop, axis=1)
        self.y = X[y]
        self.y_test = X_test[y]
        
        self.cat_values = self.data.dtypes.loc[self.data.dtypes=='object'].index
        self.num_values = self.data.dtypes.loc[self.data.dtypes.isin(['int64','float64'])].index

        # Fallback: if no int64/float64 column was matched, treat every
        # non-object column as numerical.
        if len(self.num_values) == 0:
            self.num_values = self.data.dtypes.loc[self.data.dtypes!='object'].index
        
        # Data augmentation: one concatenated feature per unordered pair of the
        # original categorical columns, added to both frames in the same order.
        base_cats = list(self.cat_values)
        for pos, col in enumerate(base_cats):
            for other_col in base_cats[pos+1:]:
                self.data[col+'_'+other_col] = self.data[col]+'_'+self.data[other_col]
                self.data_test[col+'_'+other_col] = self.data_test[col]+'_'+self.data_test[other_col]
        # From here on cat_values also contains the augmented columns.
        self.cat_values = self.data.dtypes.loc[self.data.dtypes=='object'].index
        
        # Categories are the union of train and test values so that every test
        # value has a code.
        self.encoder = OrdinalEncoder(
            categories = [pd.concat([self.data[col], self.data_test[col]],axis=0).unique() for col in self.cat_values])
        if len(self.cat_values):
            self.data[self.cat_values] = self.encoder.fit_transform(self.data[self.cat_values])
            # Fit once on the training frame and only transform the test frame.
            self.data_test[self.cat_values] = self.encoder.transform(self.data_test[self.cat_values])
        
        self.model = self.pipe(XGBClassifier(**xgb_kws, tree_method='gpu_hist', gpu_id=0, random_state=0, eval_metric='auc'))
        
    def outliers(self, col_out, n_neighbors=2, cls=0):
        """ Detect the outliers among one class and remove them from the training set.
        
        parameters:
         - col_out: list[string].
           The detection of outliers is restricted to the selected columns.
         - n_neighbors: int.
           Number of neighbors taken into account by LocalOutlierFactor.
         - cls: int(0 or 1).
           Class whose rows are screened; the rows of the other class are kept untouched.
        """
        
        other_cls = 1 if cls == 0 else 0
        in_class = self.y == cls
        in_other = self.y == other_cls
            
        detector = LocalOutlierFactor(n_neighbors=n_neighbors)
        # fit_predict flags the rows to keep with 1; honour the requested
        # columns instead of always using self.num_values.
        inliers = detector.fit_predict(self.data[col_out].loc[in_class]) == 1
        self.data = pd.concat([self.data.loc[in_class].loc[inliers], self.data.loc[in_other]], axis=0)
        self.y = pd.concat([self.y.loc[in_class].loc[inliers], self.y.loc[in_other]], axis=0)
    
    def imb_prepro(self):
        """Rebalance the training set: SMOTE over-sampling followed by Tomek links cleaning."""
        
        # SMOTE is seeded so that repeated runs give the same resampled set.
        for estimator in [SMOTE(k_neighbors=10, random_state=0), TomekLinks()]:
            self.data, self.y = estimator.fit_resample(self.data, self.y)
        
    def pipe(self, estimator):
        """ Return a pipeline whose single step, named 'model', is the estimator.
        
        parameter:
         - estimator: estimator object
           Define the estimator in the pipeline
        """
        
        pipe = Pipeline([
                         ('model', estimator)])
        return pipe
             
    def best_params(self, **kwargs):
        """ Grid-search the parameters of the pipeline and keep the best estimator.
        
        parameters:
         - params: dict, optional keyword.
           Parameter grid (keys prefixed with 'model__'); a default grid is
           used when it is not given.

        NOTE(review): the test set is passed as eval_set for early stopping, so
        it influences the selected model.
        """
        
        params_grid = {'model__n_estimators':[150,300,500],
                       'model__max_depth': [5,6,8],
                       'model__lambda': [1,2,3],
                       'model__scale_pos_weight':[3,6,9],
                       'model__learning_rate':[0.005,0.01,0.1]}
        
        params = kwargs.get('params',params_grid)
        
        # Search with the caller's grid when one is given.
        search = GridSearchCV(self.model, param_grid=params, cv=5, scoring='roc_auc')
        search.fit(self.data, self.y, model__eval_set=[(self.data_test, self.y_test)], model__early_stopping_rounds=50)
        # Report from the search object before replacing self.model: the best
        # estimator itself has no best_params_ / best_score_ attributes.
        print('Best estimator: ',search.best_estimator_)
        print('Best params: ',search.best_params_)
        print('Best score: ',search.best_score_)
        self.model = search.best_estimator_

    def train(self):
        """Fit the model on the (possibly resampled) training set."""
        
        self.model.fit(self.data, self.y)
        
    def score(self, tresh=0.5):
        """Print the classification_report for the test and training sets.

        parameter:
         - tresh: float.
           Probability of class 1 above which a row is predicted as class 1.
        """
        
        print('Test set')
        out = self.model.predict_proba(self.data_test)
        print(classification_report(self.y_test, output(out, tresh)))
        print('Training set')
        out = self.model.predict_proba(self.data)
        print(classification_report(self.y, output(out, tresh)))        
        
    def graphs_score(self, cls=0):
        """Plot the normalised confusion matrix and the ROC curve of a given class.
        
        parameter:
         - cls: int(0 or 1)
           Class used as the positive label of the plotted ROC curve.
        """
        
        # confusion_matrix expects (y_true, y_pred): rows are the true classes
        # and columns the predicted ones, matching the axis labels below.
        y_pred = self.model.predict(self.data_test)
        counts = confusion_matrix(self.y_test, y_pred, labels=[0, 1])
        conf_matrix = pd.DataFrame(counts/counts.sum(), index=[0,1], columns=[0,1])
        
        sns.heatmap(conf_matrix, annot=True)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion matrix in percentage')
        plt.show()
        
        # One ROC curve per class, each using that class's probability column.
        y_score = self.model.predict_proba(self.data_test)
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in [0,1]:
            fpr[i], tpr[i], _ = roc_curve(self.y_test, y_score[:,i], pos_label=i)
            roc_auc[i] = auc(fpr[i], tpr[i])
            
        plt.figure()
        lw = 2
        plt.plot(fpr[cls], tpr[cls], color='darkorange',
                 lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[cls])
        # Diagonal of a random classifier, for reference.
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Assurance ROC curve')
        plt.legend(loc="lower right")
        plt.show()
        
def output(y, treshold):
    """Turn class probabilities into 0/1 labels.

    parameters:
     - y: 2D array of probabilities; the second column (index 1) is compared with the threshold.
     - treshold: float.
       A row is labelled 1 when its second-column value is strictly greater than this value.

    Returns a list of 0/1 integers, one per row.
    """
    labels = []
    for positive_proba in y[:, 1]:
        labels.append(int(positive_proba > treshold))
    return labels
    

Let's apply the previous methods

In [None]:
# Reload the raw training file (the exploration cells added helper columns to `data`).
data = pd.read_csv('/kaggle/input/imbalanced-data-practice/aug_train.csv')

# Stratified 80/20 hold-out: shuffle each class separately, then cut both at 80%.
class1 = data.loc[data.Response==1].sample(frac = 1, random_state=0)
class2 = data.loc[data.Response==0].sample(frac = 1, random_state=0)
cut1 = int(len(class1)*0.8)
cut2 = int(len(class2)*0.8)
data_train = pd.concat([class1.iloc[:cut1], class2.iloc[:cut2]], axis=0)
data_test = pd.concat([class1.iloc[cut1:], class2.iloc[cut2:]], axis=0)

ob = Assurance(data_train, data_test, 'Response', ['id','Previously_Insured'])
# Remove outliers in class 0 first, then in class 1.
for target_class in (0, 1):
    ob.outliers(ob.num_values, cls=target_class)
ob.imb_prepro()
#ob.best_params() # Compute the best parameters (optional grid search)
ob.train()

The following graphs are very useful to evaluate our scores

In [None]:
# Classification reports for the test set, then the training set, at the default 0.5 threshold.
ob.score()
# Confusion matrix on the test set, then the ROC curve with class 1 as the positive label.
ob.graphs_score(cls=1)

The recall metric is really important for our task because we need to identify as many positive responses as possible. However, we also need to keep a good balance between false positives and true positives; that's why we maximize the ROC AUC metric in our best-parameter search.

### Translation

Before any explanation, we need to map the encoded category values back to their original labels.

In [None]:
# One column per categorical feature listing its categories in encoding order.
# Shorter lists are padded by repeating their last category; .ffill() replaces
# the deprecated fillna(method='ffill').
pd.DataFrame(ob.encoder.get_params()['categories'],
             index=ob.cat_values).transpose().ffill()

In [None]:
# Encode the category table to get the ordinal code matching each label.
# The frame is indexed by ob.cat_values before transposing so its columns carry
# the feature names the encoder was fitted with; .ffill() replaces the
# deprecated fillna(method='ffill').
ob.encoder.transform(
    pd.DataFrame(ob.encoder.get_params()['categories'],
                 index=ob.cat_values).transpose().ffill())

## Explanations <a id=c></a>

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(ob.model, scoring='roc_auc', random_state=1).fit(ob.data_test, ob.y_test)
# Feature names must follow the column order of the frame given to fit().
# ob.data_test keeps the original column order with the augmented columns
# appended, which differs from "categorical columns, then numerical columns".
eli5.show_weights(perm, feature_names = ob.data_test.columns.tolist())

Vehicle_age_vehicle_damage, which is a created column, is the most important variable: it has the highest impact on the score metric.

In [None]:
def pdp_plt(col):
    """Show the partial dependence plot of one feature.

    parameter:
     - col: string.
       Name of the feature; the dependence is computed with the fitted model
       of `ob` on the first 10000 rows of `ob.data_test`.
    """
    test_frame = ob.data_test
    isolated = pdp.pdp_isolate(
        model=ob.model,
        dataset=test_frame[:10000],
        model_features=test_frame.columns,
        feature=col,
    )

    pdp.pdp_plot(isolated, col, plot_lines=True)
    plt.show()
    
pdp_plt('Age')

Thanks to that function, we can plot the partial dependence for each column.

In [None]:
# Explain the fitted classifier itself (the 'model' step of the pipeline).
# NOTE(review): the test frame passed as second argument presumably serves as
# background data for the explainer — confirm against the shap docs.
explainer = shap.Explainer(ob.model['model'], ob.data_test)
# SHAP values for every row of the test frame.
shap_values = explainer(ob.data_test)

Force plot

In [None]:
# Force plot for the first 5000 test rows. Feature names are read from ob.data,
# which has the same columns as ob.data_test (both go through the same drop and
# augmentation steps in Assurance.__init__).
shap.force_plot(explainer.expected_value, shap_values[:5000,:].values, ob.data_test.iloc[:5000].values, feature_names= ob.data.columns, link='logit')

Beeswarm

In [None]:
# Beeswarm plot of the SHAP values computed on the test frame (max_display=14).
shap.plots.beeswarm(shap_values, max_display=14)

Bar plot

In [None]:
# Hierarchical clustering computed from at most the first 100000 test rows and
# their labels, then passed to the bar plot through its `clustering` argument.
clustering = shap.utils.hclust(ob.data_test[:100000], ob.y_test[:100000])
shap.plots.bar(shap_values, clustering=clustering)

Heatmap

In [None]:
# Heatmap of the SHAP values of the first 1000 test rows.
shap.plots.heatmap(shap_values[:1000])

### Conclusion:
    
Reminder: a man or woman aged about 40 living in zone 30 or 10, owner of a vehicle aged 2 years with an annual insurance between 20000 euros and 60000 euros, who has already had an accident. Let's confirm that with an ML algorithm.

Actually, we can find the same patterns with the beeswarm plot, the bar plot or the pdp plot. For example, age has a bigger impact between 30 and 50, as we can see on the pdp plot.