# Modelling on Unbalanced Data: Caravan Insurance

Imports

In [None]:
#%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,classification_report,f1_score
from lightgbm import LGBMClassifier
import itertools
import scipy.stats as ss
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))




In [None]:
RS=410 #Random State

In [None]:
data=pd.read_csv('/kaggle/input/caravan-insurance-challenge/caravan-insurance-challenge.csv')
data.head()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data.ORIGIN.value_counts()

Data Analysis

No NA values; all variables are of type int64. The data is peculiar in that every number stands for an attribute of a person. Even variables that could be continuous, such as income, have been binned.

In [None]:
data.info()

Every feature is already encoded as an integer representation,saving us the conversion work.

In [None]:
data.CARAVAN.value_counts()

Now,we are dealing with a very imbalanced dataset.

In [None]:
plt.subplots(figsize=(10,8))
sns.heatmap(data.drop(columns=['ORIGIN']).corr());

A correlation plot shows some interesting patterns in the data. There is a clear divide between the two groupings listed in the description file.

In [None]:
fig,axes=plt.subplots(1,2,figsize=(12,8))
sns.heatmap(data.drop(columns=["ORIGIN"]).iloc[:,:43].corr(),vmin=-1,vmax=1,cmap='coolwarm',ax=axes[0])
sns.heatmap(data.drop(columns=['ORIGIN']).iloc[:,43:].corr(),vmin=-1,vmax=1,cmap='coolwarm',ax=axes[1])
axes[0].set_title("Upper-left Corrplot")
axes[1].set_title("Bottom-right Corrplot")

After zooming in a bit, the bottom-right corrplot shows that each variable starting with P has a corresponding variable starting with A. This means that having both in our data will likely provide little value.

In [None]:
#Drop percentage representations
# Column names in this dataset are upper-case (ORIGIN, CARAVAN, P.../A... pairs), so the
# prefix test has to use 'P'; a lower-case 'p' matches nothing and drops no column.
pct_cols=data.columns[data.columns.str.startswith('P')]
data_np=data.drop(columns=pct_cols).copy()
# to_feather needs pyarrow, which is installed in the following cell.
data_np.to_feather('reduced_cmbd.df')

In [None]:
!pip install pyarrow

**MODELS**



4 Models will be used in total:BaggingClassifier,RandomForestClassifier,AdaBoostClassifier from sklearn and Microsoft's lightgbm

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False, cf_report=False,
                          title='Confusion matrix', ax=None, cmap=plt.cm.Blues, cbar=False):
    """Draw the confusion matrix of a set of predictions as an annotated heatmap.

    Args
    ----
        y_true : array like, ground-truth labels
        y_pred : array like, predicted labels
        classes : sequence, tick labels, one per row/column of the matrix
        normalize : boolean, if True each row is divided by its row total
        cf_report : boolean, if True also prints sklearn's classification_report
        title : str, axis title
        ax : matplotlib axis to draw on; a new figure/axis is created when None
        cmap : matplotlib colormap used for the heatmap
        cbar : boolean, if True a colorbar is attached to the axis

    Returns
    -------
        None
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that only occurs in y_pred has an all-zero row; keep it at 0
        # instead of dividing by zero and filling the plot with NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    if cf_report:
        print(classification_report(y_true, y_pred))

    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        # Use the figure that owns the axis; plt.gcf() may be a different figure.
        fig = ax.figure

    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)

    if cbar:
        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04) # "Magic" numbers (https://stackoverflow.com/a/26720422/10939610)

    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes)

    # Cell annotations: switch the text colour at half of the maximum so it
    # stays readable on both light and dark cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    # Lay out last so the axis labels set above are taken into account.
    fig.tight_layout()

In [None]:
def plot_roc(y_true, y_pred, ax=None):
    """Plot the ROC curve of a set of predictions and return its area.

    Args
    ----
        y_true : array like, ground-truth binary labels
        y_pred : array like, predicted labels or scores, passed unchanged to
                 sklearn's roc_curve and roc_auc_score
        ax : matplotlib axis to draw on; a new figure/axis is created when None

    Returns
    -------
        float : the value returned by roc_auc_score(y_true, y_pred)
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, y_pred)
    roc_score = roc_auc_score(y_true, y_pred)

    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        # Use the figure that owns the axis; plt.gcf() may be a different figure.
        fig = ax.figure

    ax.set_title("Receiver Operating Characteristic")
    ax.plot(false_positive_rate, true_positive_rate)
    ax.plot([0, 1], ls="--")           # chance diagonal
    # Grey guide lines along the left edge (x=0) and the top edge (y=1).
    # Both go through `ax`: plt.plot would draw on whatever axis is current.
    ax.plot([0, 0], [1, 0], c=".7")
    ax.plot([1, 1], c=".7")
    ax.annotate('ROC: {:.5f}'.format(roc_score), [0.75,0.05])
    ax.set_ylabel("True Positive Rate")
    ax.set_xlabel("False Positive Rate")
    fig.tight_layout()
    return roc_score

In [None]:
def feat_imps(model, X_train, plot=False, n=None):
    """ Dataframe containing each feature with its corresponding importance in the given model

    Args
    ----
        model : model, classifier that supports .feature_importances_ (RandomForest, AdaBoost, etc.)
        X_train : training data object with a .columns attribute naming the features
        plot : boolean, if True, also plots the top features in the form of a bargraph
        n : int, only applicable if plot=True, number of features to plot (None means 15)

    Returns
    -------
        pandas DataFrame : columns = feature, importance; sorted by importance, descending.
        The frame is returned whether or not a plot was drawn.
    """
    fi_df = pd.DataFrame({'feature': X_train.columns,
                          'importance': model.feature_importances_}
                         ).sort_values(by='importance', ascending=False)
    if plot:
        top_n = 15 if n is None else n
        fi_df.head(top_n).plot.bar(x='feature', y='importance')
    # Always hand the table back; previously plot=True returned None.
    return fi_df

In [None]:
def plot_cmroc(y_true, y_pred, classes=None, normalize=True, cf_report=False, y_score=None):
    """Convenience function to plot confusion matrix and ROC curve side by side.

    Args
    ----
        y_true : array like, ground-truth labels
        y_pred : array like, predicted class labels (used for the confusion matrix)
        classes : sequence of tick labels for the confusion matrix (None means [0, 1])
        normalize : boolean, forwarded to plot_confusion_matrix
        cf_report : boolean, forwarded to plot_confusion_matrix
        y_score : array like, optional scores/probabilities of the positive class.
                  When given, the ROC curve is computed from these instead of
                  from the hard labels in y_pred.

    Returns
    -------
        float : the ROC score returned by plot_roc
    """
    # None sentinel instead of a mutable list default.
    if classes is None:
        classes = [0, 1]
    roc_input = y_pred if y_score is None else y_score

    fig, axes = plt.subplots(1, 2, figsize=(9, 4))
    plot_confusion_matrix(y_true, y_pred, classes=classes, normalize=normalize,
                          cf_report=cf_report, ax=axes[0])
    roc_score = plot_roc(y_true, roc_input, ax=axes[1])
    fig.tight_layout()
    plt.show()
    return roc_score

In [None]:
# Split on the ORIGIN flag and remove that column by name, so the result does not
# depend on ORIGIN being the first column of the CSV.
train_df=data.query("ORIGIN=='train'").drop(columns='ORIGIN').copy()
test_df=data.query("ORIGIN=='test'").drop(columns='ORIGIN').copy()

The test data will be treated as a holdout test set, so we will split train_df into training and validation sets.

In [None]:
X,y=train_df.drop(columns='CARAVAN'),train_df.CARAVAN
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.20,random_state=RS)

To address the issue with imbalanced data,we will compare three approaches for each model used:
1.Random Over Sampling
2.Random Under Sampling
3.SMOTE(Synthetic Minority Over-Sampling Technique)


In [None]:
!pip install imblearn

In [None]:
from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
# One sampler per rebalancing strategy, all seeded with the shared random state.
ros=RandomOverSampler(random_state=RS)
rus=RandomUnderSampler(random_state=RS)
# n_jobs is deprecated on SMOTE in recent imbalanced-learn releases, so it is omitted here.
smt=SMOTE(random_state=RS)
# fit_resample is the supported name; the old fit_sample alias has been removed
# from imbalanced-learn.
X_under,y_under=rus.fit_resample(X_train,y_train)
X_over,y_over=ros.fit_resample(X_train,y_train)
X_smote,y_smote=smt.fit_resample(X_train,y_train)
# Summary statistics of the target under each strategy (mean shows the class balance).
pd.DataFrame([ss.describe(labels)._asdict() for labels in (y_train,y_under,y_over,y_smote)],
             index=['Unbalanced','Undersample','Oversample','SMOTE'])

Without any resampling, the mean was ~0.058 with heavy skew. Each resampling method has shifted the mean to 0.5 and eliminated the skewness, each achieving this in a different way.


In [None]:
#Define baseline models
bc=BaggingClassifier(n_estimators=53,random_state=RS,n_jobs=-1)
ada=AdaBoostClassifier(n_estimators=53,random_state=RS)
rfc=RandomForestClassifier(n_estimators=53,random_state=RS,n_jobs=-1)
lgbm=LGBMClassifier(n_estimators=53,random_state=RS,n_jobs=-1)

Unbalanced Data

Bagging

In [None]:
bc_unbal=plot_cmroc(y_val,bc.fit(X_train,y_train).predict(X_val))

Boosting(AdaBoost)


In [None]:
ada_unbal=plot_cmroc(y_val,ada.fit(X_train,y_train).predict(X_val))

Boosting(LGBM)

In [None]:
lgbm_unbal=plot_cmroc(y_val,lgbm.fit(X_train,y_train).predict(X_val))

Random Forest

In [None]:
rfc_unbal=plot_cmroc(y_val,rfc.fit(X_train,y_train).predict(X_val))

Unbalanced Evaluation

In [None]:
models=[bc,ada,rfc,lgbm]
unbal_scores=[bc_unbal,ada_unbal,rfc_unbal,lgbm_unbal]
for model,score in zip(models,unbal_scores):
    print('{:25s}:{:.5f}'.format(model.__class__.__name__,score))

Performance is poor across all models when using the unbalanced dataset. AdaBoost was no better than random guessing, and the best model was the BaggingClassifier.

# Undersampling

Bagging

In [None]:
bc_under=plot_cmroc(y_val,bc.fit(X_under,y_under).predict(X_val))

Boosting(AdaBoost)

In [None]:
ada_under=plot_cmroc(y_val,ada.fit(X_under,y_under).predict(X_val))

Boosting(LGBM)

In [None]:
lgbm_under=plot_cmroc(y_val,lgbm.fit(X_under,y_under).predict(X_val))

Random Forest

In [None]:
rfc_under=plot_cmroc(y_val,rfc.fit(X_under,y_under).predict(X_val))

Undersampling Evaluation

In [None]:
models=[bc,ada,rfc,lgbm]
under_scores=[bc_under,ada_under,rfc_under,lgbm_under]
for model,score in zip(models,under_scores):
    print('{:25s}:{:.5f}'.format(model.__class__.__name__,score))

Nearly an 18% increase in ROC score was seen across the board using the undersampling method.

# Oversampling

Bagging

In [None]:
bc_over=plot_cmroc(y_val,bc.fit(X_over,y_over).predict(X_val))

Boosting(AdaBoost)

In [None]:
ada_over=plot_cmroc(y_val,ada.fit(X_over,y_over).predict(X_val))

Boosting(LGBM)

In [None]:
lgbm_over=plot_cmroc(y_val,lgbm.fit(X_over,y_over).predict(X_val))

Random Forest

In [None]:
rfc_over=plot_cmroc(y_val,rfc.fit(X_over,y_over).predict(X_val))

# Oversampling Evaluation

In [None]:
models=[bc,ada,rfc,lgbm]
over_scores=[bc_over,ada_over,rfc_over,lgbm_over]
for model,score in zip(models,over_scores):
    print('{:25s}:{:.5f}'.format(model.__class__.__name__,score))

In contrast with the unbalanced dataset, on the oversampled data AdaBoost greatly outperformed the other models.

# SMOTE

Bagging


In [None]:
bc_smote=plot_cmroc(y_val,bc.fit(X_smote,y_smote).predict(X_val
                                                         ))

Boosting(AdaBoost)

In [None]:
ada_smote=plot_cmroc(y_val,ada.fit(X_smote,y_smote).predict(X_val))

Boosting(LGBM)

In [None]:
lgbm_smote=plot_cmroc(y_val,lgbm.fit(X_smote,y_smote).predict(X_val))

Random Forest

In [None]:
rfc_smote=plot_cmroc(y_val,rfc.fit(X_smote,y_smote).predict(X_val))

# SMOTE Evaluation

In [None]:
models=[bc,ada,rfc,lgbm]
smote_scores=[bc_smote,ada_smote,rfc_smote,lgbm_smote]
for model,score in zip(models,smote_scores):
    print('{:25s}:{:.5f}'.format(model.__class__.__name__,score))

# Tweaking the best

For all of the classifiers, random undersampling was the most successful method of rebalancing the dataset. With the exception of AdaBoost, the other methods barely outperformed random guessing.

Let's evaluate the best from each group against the holdout test data 

In [None]:
X_test,y_test=test_df.iloc[:,:-1],test_df.iloc[:,-1]

In [None]:
# Fresh, unfitted copies of the four baseline models for the holdout evaluation.
# Every model is seeded with RS (the bagging model previously was not), so the
# holdout scores are reproducible from run to run.
bc=BaggingClassifier(n_estimators=53,random_state=RS,n_jobs=-1)
ada=AdaBoostClassifier(n_estimators=53,random_state=RS)
rfc=RandomForestClassifier(n_estimators=53,random_state=RS,n_jobs=-1)
lgbm=LGBMClassifier(n_estimators=53,random_state=RS,n_jobs=-1)


In [None]:
# Refit each model on the undersampled training data and report its score on the holdout set.
# NOTE(review): roc_auc_score receives hard class predictions here, not probabilities,
# so the printed figure is the AUC of a single operating point — confirm this is intended.
models = [bc, ada, rfc, lgbm]
for model in models:
    model.fit(X_under, y_under)
    tpreds = model.predict(X_test)
    model_name = model.__class__.__name__
    holdout_auc = roc_auc_score(y_test, tpreds)
    print(f'{model_name:25s}:{holdout_auc:.5f}')

So, if this contest happened to be evaluated on Area Under ROC, the best model we could have submitted would have been the Random Forest Classifier, with a score of 0.66.

A slightly better score could likely be achieved by ensembling these models, and there are many other tweaks that could be tried.

# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for LGBMClassifier.
# 'num_leaves' is the LightGBM tree-complexity parameter; the previous key
# 'num_values' is not a LightGBM parameter, so that axis of the grid only
# multiplied the number of fits without changing the model.
param_grid={
    'learning_rate':[0.01,0.05,0.1,1],
    'n_estimators':[20,40,60,80,100],
    'num_leaves':[3,7,17,31],
    'max_bin':[4,8,16,32,64],
    'min_child_samples':[3,5,10,20,30],
}

In [None]:
# 5-fold grid search over param_grid on the undersampled data, scored by ROC AUC.
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22 and then
# removed, so supplying it raises a TypeError on current versions.
# The estimator is seeded so repeated searches give the same result.
lgbm_gs=GridSearchCV(LGBMClassifier(random_state=RS),param_grid,n_jobs=-1,scoring='roc_auc',verbose=2,cv=5)
lgbm_gs.fit(X_under,y_under)
print('Best parameters:',lgbm_gs.best_params_)

In [None]:
plot_cmroc(y_val,lgbm_gs.predict(X_val))

In [None]:
plot_cmroc(y_test,lgbm_gs.predict(X_test))

# Random Forest

In [None]:
# Search space for RandomForestClassifier: forest size plus three tree-size limits.
param_grid_rf = dict(
    n_estimators=[40, 60, 100, 128, 256],
    min_samples_leaf=[3, 7, 17, 31],
    max_leaf_nodes=[4, 8, 16, 32, 64],
    min_samples_split=[3, 5, 10, 20, 30],
)

In [None]:
# 5-fold grid search over param_grid_rf on the undersampled data, scored by ROC AUC.
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22 and then
# removed, so supplying it raises a TypeError on current versions.
# The forest is seeded so repeated searches give the same result.
rfc_gs=GridSearchCV(RandomForestClassifier(random_state=RS),param_grid_rf,n_jobs=-1,scoring='roc_auc',verbose=2,cv=5)
rfc_gs.fit(X_under,y_under)
print('Best parameters:',rfc_gs.best_params_)

In [None]:
plot_cmroc(y_val,rfc_gs.predict(X_val))

In [None]:
plot_cmroc(y_test,rfc_gs.predict(X_test))

In [None]:
# Same LightGBM grid search, but fitted on the original (unbalanced) training split.
# The `iid` argument is not passed: it was deprecated in scikit-learn 0.22 and then
# removed, so supplying it raises a TypeError on current versions.
lgbm_gs_ub=GridSearchCV(LGBMClassifier(random_state=RS),param_grid,n_jobs=-1,scoring='roc_auc',verbose=1,cv=5)
lgbm_gs_ub.fit(X_train,y_train)
print('Best parameters:',lgbm_gs_ub.best_params_)

In [None]:
plot_cmroc(y_val,lgbm_gs_ub.predict(X_val))

In [None]:
plot_cmroc(y_test,lgbm_gs_ub.predict(X_test))

# Dimensionality Reduction and Features Analysis

Dimensionality reduction with:
    
    1.Principal Component Analysis(PCA)
    2.t-SNE
    3.UMAP
As well as a few methods to feature selection:
   
    1.Stepwise Feature selection
    2.Recursive Feature elimination
    3.Feature Importance Analysis
Performed on a Logistic Regressor and Random Forest Classifier

In [None]:
import warnings
from sklearn.feature_selection import RFE,SelectKBest,chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
import scipy.stats as ss
import joblib
from mlxtend.feature_selection import SequentialFeatureSelector



In [None]:
fi_data=data.drop(columns=['ORIGIN']).copy()
X,y=fi_data.drop(columns='CARAVAN'),fi_data.CARAVAN

# Exploratory Data Analysis


# Dimensionality Reduction

In [None]:
#plotting function
def scatter_density(data,labels,sca_title='',den_title="",**kwargs):
    """Plot a class-coloured scatter plot next to per-class 2-D density contours.

    Written against the seaborn >= 0.11 keyword API (x=/y=, fill=, levels=).

    Args:
        data: 2-d array, shape (n_samples, 2); only the first two columns are plotted
        labels: array-like of class labels used to colour the scatter plot;
                rows labelled 1 and 0 feed the orange and blue densities respectively
        sca_title: str, scatter plot title
        den_title: str, density plot title
        **kwargs: keyword arguments passed to both seaborn.kdeplot calls
    Returns:
        ax: array holding the two matplotlib axes (scatter, density)
    """
    # Plain arrays so that boolean masking works regardless of the label container.
    data = np.asarray(data)
    labels = np.asarray(labels)

    fig, ax = plt.subplots(1, 2, figsize=(10, 4), sharey=True, sharex=True)
    dataneg = data[labels == 0]
    datapos = data[labels == 1]

    sns.scatterplot(x=data[:, 0], y=data[:, 1], hue=labels, ax=ax[0])

    # Positive class as contour lines, negative class as filled contours.
    sns.kdeplot(x=datapos[:, 0], y=datapos[:, 1], ax=ax[1], cmap='Oranges', **kwargs)
    # cmap (not "map") selects the colours; levels/fill replace the old
    # nlevels/shade spellings, and the default threshold already leaves the
    # lowest level unfilled, which is what shade_lowest=False asked for.
    sns.kdeplot(x=dataneg[:, 0], y=dataneg[:, 1], ax=ax[1], cmap='Blues',
                levels=30, fill=True, **kwargs)

    ax[0].set_title(sca_title)
    ax[1].set_title(den_title)
    fig.tight_layout()
    plt.show()
    return ax

# PCA(principal Component analysis)

PCA is affected by differences in magnitude, so we'll begin by scaling the data.

In [None]:
from sklearn.decomposition import PCA
Xs=pd.DataFrame(StandardScaler().fit_transform(X),columns=X.columns)
pca=PCA(random_state=RS)
Xpca=pca.fit_transform(Xs)

First, for comparison, PCA implemented without proper scaling:

In [None]:

# Deliberately fit PCA on the raw, unscaled features to show how misleading that is.
# The stray `pca=PCA(...)` line is gone: it was never used here and it replaced the
# already-fitted `pca` object with an unfitted one.
_Xpca_raw=PCA(n_components=2,random_state=RS).fit_transform(X)
scatter_density(_Xpca_raw,y,'PCA Scatter Unscaled','PCA Density UnScaled');


The density plot shows a clear separation between two groups, and even the scatter plot shows some degree of misleading grouping. Properly scaled, things will look quite a bit different.


In [None]:
from sklearn.decomposition import PCA
# Standardise every feature, then fit PCA once on the scaled data
# (the second, identical fit_transform call was redundant).
Xs=pd.DataFrame(StandardScaler().fit_transform(X),columns=X.columns)
pca=PCA(random_state=RS)
Xpca=pca.fit_transform(Xs)
# scatter_density only reads the first two columns, i.e. the first two components.
scatter_density(Xpca,y,'PCA Scaled:Scatter','PCA Scaled:Density');

Now we are dealing with an accurate representation of the data: an amorphous point mass. That is why it is so important to check that the assumptions of a model are met; otherwise it is all too easy to head down a path leading to dead ends or invalid conclusions.

In [None]:
pca.explained_variance_ratio_[:3]

About 16% of variance can be explained by these first two abstract components.

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.annotate('(64,0.993)',xy=(64,0.993),xytext=(64,0.8),fontsize='medium',arrowprops={'arrowstyle':'->','mutation_scale':15})
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.title('Explained variance')
plt.show()

# t-SNE(T-distributed Stochastic Neighbor Embedding)


In [None]:
!pip install openTSNE

In [None]:
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
tsne=TSNE(perplexity=75,learning_rate=500,n_iter=1000,metric='euclidean',negative_gradient_method='bh',n_jobs=4,callbacks=ErrorLogger(),random_state=RS)
Xembd=tsne.fit(Xs)

In [None]:
scatter_density(Xembd,y,'t-SNE scatter','t-SNE density');

We do begin to see some small clusters taking shape, although, depending on parameter choice, t-SNE has been shown to cluster spuriously. The highest-density areas overlap between positive and negative samples, and there are only a few small pockets where they have successfully separated.

In [None]:
!pip install 'umap-learn==0.3.10'

**UMAP(Uniform manifold Approximation and Projection)**

UMAP is a relatively recent development in non-linear dimensionality reduction.

In [None]:
import umap.umap_ as umap

ump=umap.UMAP(n_neighbors=30,min_dist=0.2,random_state=RS,verbose=True)
Xumap=ump.fit_transform(Xs,y)

In [None]:
scatter_density(Xumap,y,'UMAP:Scatter','UMAP:Density')

In [None]:
ump=umap.UMAP(n_neighbors=30,min_dist=0.2,random_state=RS,verbose=False)
Xumap=ump.fit_transform(Xs)

In [None]:
ump=umap.UMAP(n_neighbors=30,min_dist=0.2,random_state=RS,verbose=False)
Xumap=ump.fit_transform(Xs)
scatter_density(Xumap,y,'UMAP:Scatter','UMAP:Density');