# Bank Marketing Dataset-UCI Machine learning reporsitory

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score,roc_auc_score

In [2]:
data=pd.read_csv('bank-additional-full.csv',sep=';')

In [3]:
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

### Pre-processing 1

Label encoding of Education and One-hot encoding of categorical fields

In [5]:
class Feature_engineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        return

    def fit(self, X=data, y=None):
        df=X.copy()
        if('education' in df.columns):
            df.education=preprocessing.LabelEncoder().fit_transform(df.education)
        cols=[]
        for i in df.columns:
            if(df[i].dtype=='O'):
                cols.append(i)
        data_dummies=pd.get_dummies(df,prefix=cols,columns=cols)
        self.data_columns=data_dummies.columns
        return self
        

    
    def transform(self,X=data):
        df=X.copy()
        if('education' in df.columns):
            df.education=preprocessing.LabelEncoder().fit_transform(df.education)
        cols=[]
        for i in df.columns:
            if(df[i].dtype=='O'):
                cols.append(i)
        data_dummies=pd.get_dummies(df,prefix=cols,columns=cols)
        data_dummies=data_dummies.reindex(columns = self.data_columns, fill_value=0)
        df=df.drop(cols,axis=1)
        x = pd.concat([df, data_dummies], axis=1)
        x=x.loc[:,~x.columns.duplicated()]
        return x
    def transform_y(data):
        y_dict={
            'yes':1,
            'no':0
        }
        data=data.map(y_dict)
        return data

### Pre-processing 2


Ordinal encoding of Education and One-hot encoding of categorical fields

In [21]:
class Feature_engineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        return

    def fit(self, X=data, y=None):
        data=X.copy()
        edu_dict={
        'unknown':0,
        'illiterate':1,

        'basic.4y':2,
        'basic.6y':3,
        'basic.9y':4, 
        'high.school':5, 
           'professional.course':6,
        'university.degree':7
        }
        data['education']=data['education'].map(edu_dict)
        cols=[]
        cols=[]
        for i in data.columns:
            if(data[i].dtype=='O'):
                cols.append(i)
        data_dummies=pd.get_dummies(data,prefix=cols,columns=cols)
        self.data_columns=data_dummies.columns
        return self
        

    
    def transform(self,X=data):
        data=X.copy()
        edu_dict={
        'unknown':0,
        'basic.4y':1,
        'basic.6y':2,
        'basic.9y':3, 
        'high.school':4, 
        'illiterate':5,
           'professional.course':6,
        'university.degree':7
        }
        data['education']=data['education'].map(edu_dict)
        cols=[]
        for i in data.columns:
            if(data[i].dtype=='O'):
                cols.append(i)
        data_dummies=pd.get_dummies(data,prefix=cols,columns=cols)
        data_dummies=data_dummies.reindex(columns = self.data_columns, fill_value=0)
        data=data.drop(cols,axis=1)
        x = pd.concat([data, data_dummies], axis=1)
        x=x.loc[:,~x.columns.duplicated()]
        return x
    def transform_y(data):
        y_dict={
            'yes':1,
            'no':0
        }
        data=data.map(y_dict)
        return data

In [6]:
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score),
          'roc_auc':make_scorer(roc_auc_score)}

### Strarified K fold Cross-validation on Pipeline including  Feature Engineering and Decision Tree

In [24]:
def decision_tree_pipeline(X,y):
    cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
    pipeline = Pipeline([('Pre-processing',Feature_engineering()),('Tree',DecisionTreeClassifier(random_state=0,max_depth=5))])
    scores = cross_validate(pipeline,X,y,cv=cv,scoring=scoring,return_train_score=True,return_estimator=True)
    print("Mean training ROC AUC score: ",np.mean(scores['train_roc_auc'])," and mean testing ROC AUC score: ",np.mean(scores['test_roc_auc']))
    print("Mean training accuracy score: ",np.mean(scores['train_accuracy'])," and mean testing accuracy score: ",np.mean(scores['test_accuracy']))
    print("Mean training recall score: ",np.mean(scores['train_recall'])," and mean testing recall score: ",np.mean(scores['test_recall']))
    print("Mean training precision score: ",np.mean(scores['train_precision'])," and mean testing precision score: ",np.mean(scores['test_precision']))
    print("Mean training f1 score: ",np.mean(scores['train_f1_score'])," and mean testing f1 score: ",np.mean(scores['test_f1_score']))

In [25]:
data=pd.read_csv('bank-additional-full.csv',sep=';')
X=data.iloc[:,:-1]
y=Feature_engineering.transform_y(data.iloc[:,-1])
decision_tree_pipeline(X,y)

Mean training ROC AUC score:  0.7477277851778261  and mean testing ROC AUC score:  0.740179047828701
Mean training accuracy score:  0.9173977325278024  and mean testing accuracy score:  0.9142228866076383
Mean training recall score:  0.5287116858237548  and mean testing recall score:  0.5155172413793102
Mean training precision score:  0.6693454922049513  and mean testing precision score:  0.6519437351574204
Mean training f1 score:  0.5900704096553948  and mean testing f1 score:  0.5748945970917563


### Strarified K fold Cross-validation on pipeline including Feature Engineering and Random Forest


In [26]:
 def random_forest_pipeline(X,y):
    cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
    pipeline = Pipeline([('Pre-processing',Feature_engineering()),('forest',RandomForestClassifier(n_estimators=100,random_state=42,max_depth=5))])
    scores = cross_validate(pipeline,X,y,cv=cv,scoring=scoring,return_train_score=True,return_estimator =True)
    
    print("Mean training ROC AUC score: ",np.mean(scores['train_roc_auc'])," and mean testing ROC AUC score: ",np.mean(scores['test_roc_auc']))
    print("Mean training accuracy score: ",np.mean(scores['train_accuracy'])," and mean testing accuracy score: ",np.mean(scores['test_accuracy']))
    print("Mean training recall score: ",np.mean(scores['train_recall'])," and mean testing recall score: ",np.mean(scores['test_recall']))
    print("Mean training precision score: ",np.mean(scores['train_precision'])," and mean testing precision score: ",np.mean(scores['test_precision']))
    print("Mean training f1 score: ",np.mean(scores['train_f1_score'])," and mean testing f1 score: ",np.mean(scores['test_f1_score']))
    return scores

Initially, use all the columns to fit the model

In [27]:
data=pd.read_csv('bank-additional-full.csv',sep=';')
X=data.iloc[:,:-1]
y=Feature_engineering.transform_y(data.iloc[:,-1])
scores=random_forest_pipeline(X,y)

Mean training ROC AUC score:  0.5968306729125565  and mean testing ROC AUC score:  0.5936002924291877
Mean training accuracy score:  0.9041036781319226  and mean testing accuracy score:  0.9028601273360838
Mean training recall score:  0.20019157088122602  and mean testing recall score:  0.19439655172413792
Mean training precision score:  0.7956746953504387  and mean testing precision score:  0.7766896102472217
Mean training f1 score:  0.3198699293295462  and mean testing f1 score:  0.3105786397261269


#### Generating the feature  importance dataframe for each of the K estimators

In [28]:
feature_importances=pd.DataFrame()
for idx,estimator in enumerate(scores['estimator']):
    ft_estimator=pd.DataFrame({
        'columns order for estimator_{}'.format(idx):Feature_engineering().fit_transform(X).columns,
        'importance_estimator_{}'.format(idx):estimator.steps[1][1].feature_importances_}).sort_values(['importance_estimator_{}'.format(idx)], ascending=False)
    ft_estimator.reset_index(inplace=True,drop=True)
    feature_importances=pd.concat([feature_importances,ft_estimator],axis=1)

In [29]:
feature_importances.head()

Unnamed: 0,columns order for estimator_0,importance_estimator_0,columns order for estimator_1,importance_estimator_1,columns order for estimator_2,importance_estimator_2,columns order for estimator_3,importance_estimator_3,columns order for estimator_4,importance_estimator_4,columns order for estimator_5,importance_estimator_5,columns order for estimator_6,importance_estimator_6,columns order for estimator_7,importance_estimator_7,columns order for estimator_8,importance_estimator_8,columns order for estimator_9,importance_estimator_9
0,duration,0.269182,duration,0.261184,duration,0.263069,duration,0.275963,duration,0.241223,duration,0.258903,duration,0.266456,duration,0.258463,duration,0.27263,duration,0.265974
1,nr.employed,0.160323,nr.employed,0.157813,nr.employed,0.15288,nr.employed,0.150097,nr.employed,0.150411,nr.employed,0.157141,nr.employed,0.172784,nr.employed,0.160745,nr.employed,0.152073,nr.employed,0.155214
2,euribor3m,0.124312,euribor3m,0.132867,euribor3m,0.137112,euribor3m,0.130091,euribor3m,0.136773,euribor3m,0.125494,euribor3m,0.123476,euribor3m,0.120444,euribor3m,0.121993,euribor3m,0.117847
3,pdays,0.090368,pdays,0.092798,pdays,0.085341,pdays,0.086034,pdays,0.085837,pdays,0.101099,pdays,0.090715,pdays,0.087696,pdays,0.088454,pdays,0.089227
4,cons.conf.idx,0.064923,poutcome_success,0.067264,cons.conf.idx,0.065717,cons.conf.idx,0.066146,cons.conf.idx,0.074167,emp.var.rate,0.060419,cons.conf.idx,0.061635,poutcome_success,0.071095,cons.conf.idx,0.066484,poutcome_success,0.0686


<blockquote>Enter specific columns obtained from feature importance dataframe and train the model on them.
<p>In this case we find the columns 'pdays', 'duration', 'euribor3m', 'cons.conf.idx', 'nr.employed' as the most important features</p></blockquote>

In [30]:
data=pd.read_csv('bank-additional-full.csv',sep=';')
y=Feature_engineering.transform_y(data.iloc[:,-1])
imp_cols=['pdays','duration','euribor3m','cons.conf.idx','nr.employed'] #columns obtained from feature importances
X=data.loc[:,imp_cols]
scores=random_forest_pipeline(X,y)

Mean training ROC AUC score:  0.6898457971188521  and mean testing ROC AUC score:  0.6829390088032005
Mean training accuracy score:  0.9139231531588564  and mean testing accuracy score:  0.9114791308735114
Mean training recall score:  0.4005986590038314  and mean testing recall score:  0.38793103448275856
Mean training precision score:  0.7091834662469677  and mean testing precision score:  0.6924859828756309
Mean training f1 score:  0.5114896909042359  and mean testing f1 score:  0.4964042471037581


<center><b>Note:</b> We see an improvement in training score and testing score on using those specific columns</center>

### Stratified K-fold CV on Pipeline inclduing Feature Engineering, SMOTE , Undersampling and Random Forest


In [19]:
def random_forest_pipeline(X,y):
    from imblearn.pipeline import Pipeline

    over = SMOTE(sampling_strategy=0.2)
    under = RandomUnderSampler(sampling_strategy=0.5)
    pipeline = Pipeline([('Pre-processing',Feature_engineering()),('over', over),('under', under),('forest',RandomForestClassifier(n_estimators=50,random_state=42,max_depth=5))])
    pipeline = pipeline.fit(X,y)
    scores = cross_validate(pipeline,X,y,cv=10,scoring=scoring,return_train_score=True,return_estimator =True)
    
    print("Mean training ROC AUC score: ",np.mean(scores['train_roc_auc'])," and mean testing ROC AUC score: ",np.mean(scores['test_roc_auc']))
    print("Mean training accuracy score: ",np.mean(scores['train_accuracy'])," and mean testing accuracy score: ",np.mean(scores['test_accuracy']))
    print("Mean training recall score: ",np.mean(scores['train_recall'])," and mean testing recall score: ",np.mean(scores['test_recall']))
    print("Mean training precision score: ",np.mean(scores['train_precision'])," and mean testing precision score: ",np.mean(scores['test_precision']))
    print("Mean training f1 score: ",np.mean(scores['train_f1_score'])," and mean testing f1 AUC score: ",np.mean(scores['test_f1_score']))
    return scores

Initially, use all the columns to fit the model

In [20]:
data=pd.read_csv('bank-additional-full.csv',sep=';')

X=data.iloc[:,:-1]
y=Feature_engineering.transform_y(data.iloc[:,-1])
scores=random_forest_pipeline(X,y)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Mean training ROC AUC score:  0.7737775469342563  and mean testing ROC AUC score:  0.5694342785898108
Mean training accuracy score:  0.8994043047027918  and mean testing accuracy score:  0.7399345845270281
Mean training recall score:  0.6116139846743296  and mean testing recall score:  0.3493534482758621
Mean training precision score:  0.5584836848900292  and mean testing precision score:  0.2296286206943939
Mean training f1 score:  0.5802079050388178  and mean testing f1 AUC score:  0.16393654602435465


<u>To deal with class imbalance , we oversample the minority class and undersampling the majority class</u>

#### Generating the feature importance dataframe for each of the K estimators

In [21]:
feature_importances=pd.DataFrame()
for idx,estimator in enumerate(scores['estimator']):
    ft_estimator=pd.DataFrame({
        'columns order for estimator_{}'.format(idx):Feature_engineering().fit_transform(X).columns,
        'importance_estimator_{}'.format(idx):estimator.steps[3][1].feature_importances_}).sort_values(['importance_estimator_{}'.format(idx)], ascending=False)
    ft_estimator.reset_index(inplace=True,drop=True)
    feature_importances=pd.concat([feature_importances,ft_estimator],axis=1)

In [22]:
feature_importances.head()

Unnamed: 0,columns order for estimator_0,importance_estimator_0,columns order for estimator_1,importance_estimator_1,columns order for estimator_2,importance_estimator_2,columns order for estimator_3,importance_estimator_3,columns order for estimator_4,importance_estimator_4,columns order for estimator_5,importance_estimator_5,columns order for estimator_6,importance_estimator_6,columns order for estimator_7,importance_estimator_7,columns order for estimator_8,importance_estimator_8,columns order for estimator_9,importance_estimator_9
0,duration,0.24974,duration,0.227809,duration,0.283479,duration,0.290085,duration,0.238586,duration,0.27416,duration,0.258069,duration,0.273783,duration,0.290195,duration,0.239863
1,nr.employed,0.133652,nr.employed,0.16108,nr.employed,0.128312,nr.employed,0.157779,nr.employed,0.182015,euribor3m,0.146327,nr.employed,0.141536,emp.var.rate,0.125462,euribor3m,0.133896,nr.employed,0.153424
2,emp.var.rate,0.132328,emp.var.rate,0.130583,emp.var.rate,0.119435,euribor3m,0.108097,euribor3m,0.105906,nr.employed,0.091755,emp.var.rate,0.124653,euribor3m,0.123736,nr.employed,0.12539,euribor3m,0.147359
3,euribor3m,0.118986,euribor3m,0.103376,euribor3m,0.101825,pdays,0.089523,emp.var.rate,0.075492,pdays,0.090582,euribor3m,0.089779,nr.employed,0.103634,emp.var.rate,0.12437,emp.var.rate,0.113032
4,pdays,0.078129,cons.conf.idx,0.080187,pdays,0.07751,emp.var.rate,0.078808,pdays,0.074169,emp.var.rate,0.089587,pdays,0.077862,cons.conf.idx,0.078057,cons.price.idx,0.068022,cons.conf.idx,0.1021


<blockquote>Enter specific columns obtained from feature importance dataframe and train the model on them.
<p>In this case we find the columns 'pdays','duration','euribor3m','cons.conf.idx','nr.employed' as the most important features</p></blockquote>

In [23]:
data=pd.read_csv('bank-additional-full.csv',sep=';')
y=Feature_engineering.transform_y(data.iloc[:,-1])
imp_cols=['pdays','duration','euribor3m','cons.conf.idx','nr.employed'] #columns obtained from feature importances
X=data.loc[:,imp_cols]
scores=random_forest_pipeline(X,y)

Mean training ROC AUC score:  0.8751391897919738  and mean testing ROC AUC score:  0.5653974169191868
Mean training accuracy score:  0.8945215592997735  and mean testing accuracy score:  0.7095628993254468
Mean training recall score:  0.8501197318007663  and mean testing recall score:  0.3793103448275862
Mean training precision score:  0.5212519618042899  and mean testing precision score:  0.2606082738092446
Mean training f1 score:  0.6456533933802951  and mean testing f1 AUC score:  0.19805887718483675


<center><b>Note:</b> We see an improvement in training score and testing score on using those specific columns</center>