In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.model_selection import KFold

__Naive Bayes Classifier: without any modification of missing data__

In [2]:
# Load the bank-marketing data; nothing is done about 'unknown' entries here.
df = pd.read_csv('bank-data/bank-additional-full.csv')

# Object-dtype (text) columns, which need encoding before modelling.
bank_object_data = df.select_dtypes(include=["object"])
bank_object_data.info()

# Every remaining column is carried over exactly as loaded.
bank_non_object_data = df.select_dtypes(exclude=["object"])
print(bank_non_object_data.info())

# Integer-encode each text column; the single encoder is refit column by column.
label = LabelEncoder()
bank_object_data = bank_object_data.apply(lambda column: label.fit_transform(column))

print(bank_object_data.head())

# Encoded text columns first, followed by the untouched columns.
bank_final = pd.concat([bank_object_data, bank_non_object_data], axis="columns")
print(bank_final.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
job            41188 non-null object
marital        41188 non-null object
education      41188 non-null object
default        41188 non-null object
housing        41188 non-null object
loan           41188 non-null object
contact        41188 non-null object
month          41188 non-null object
day_of_week    41188 non-null object
poutcome       41188 non-null object
y              41188 non-null object
dtypes: object(11)
memory usage: 3.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 10 columns):
age               41188 non-null int64
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m       

__Naive Bayes Classifier: without any modification of missing data; k-fold with shuffle__

In [3]:
# Target is the encoded 'y' column; every other column is a feature.
Y = bank_final['y']
X = bank_final.drop(columns=['y'])

model = GaussianNB()

# 10 folds, rows shuffled with a fixed seed so the split is repeatable.
kfold = KFold(n_splits=10, shuffle=True, random_state=100)
results_kfold = model_selection.cross_val_score(estimator=model, X=X, y=Y, cv=kfold)

# Mean accuracy as a percentage, then mean with twice the fold standard deviation.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), 2 * results_kfold.std()))

Accuracy: 84.65%
Accuracy: 0.85 (+/- 0.01)


__Naive Bayes Classifier: without any modification of missing data; k-fold without shuffle__

In [18]:
# Target is the encoded 'y' column; every other column is a feature.
Y = bank_final['y']
X = bank_final.drop(columns=['y'])

model = GaussianNB()

# 10 folds with KFold's shuffle option left at its default (no seed is passed).
kfold = KFold(n_splits=10)
results_kfold = model_selection.cross_val_score(estimator=model, X=X, y=Y, cv=kfold)

# Mean accuracy as a percentage, then mean with twice the fold standard deviation.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), 2 * results_kfold.std()))

# Show the per-fold scores as the cell result.
results_kfold

Accuracy: 83.35%
Accuracy: 0.83 (+/- 0.34)


array([0.97353727, 0.96649672, 0.96042729, 0.93614955, 0.94391843,
       0.94561787, 0.84219471, 0.62709395, 0.64011656, 0.49902865])

__Naive Bayes Classifier: without any modification of missing data; stratified k-fold__

In [16]:
from sklearn.model_selection import StratifiedKFold

# Target is the encoded 'y' column; every other column is a feature.
Y = bank_final['y']
X = bank_final.drop(columns=['y'])

model = GaussianNB()

# 10 stratified folds, shuffled with a fixed seed.
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
results_kfold = model_selection.cross_val_score(estimator=model, X=X, y=Y, cv=skfold)

# Mean accuracy as a percentage, then mean with twice the fold standard deviation.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), 2 * results_kfold.std()))

# Show the per-fold scores as the cell result.
results_kfold

Accuracy: 83.84%
Accuracy: 0.84 (+/- 0.01)


array([0.8315125 , 0.83515416, 0.84365137, 0.83733916, 0.83855305,
       0.83224084, 0.84340859, 0.84365137, 0.83341428, 0.8455561 ])

__Naive Bayes Classifier: deleting missing data__

In [19]:
# Reload the raw file so this experiment starts from an unmodified frame.
df = pd.read_csv('bank-data/bank-additional-full.csv')
print(df.shape)
print(df.columns)

# Count the 'unknown' placeholders in every column. isin() is a membership
# test, so numeric columns are never compared against a string (the
# `df == 'unknown'` form triggers an elementwise-comparison warning on some
# pandas/NumPy versions).
print(df.isin(['unknown']).sum())

# Turn the 'unknown' placeholder into a real missing value across the whole
# frame. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace('unknown', np.nan)
print(df.isnull().sum())

# Drop every row that has at least one missing value; rebinding `df` rather
# than mutating in place keeps the step explicit.
df = df.dropna()
# summarize the number of rows and columns in the dataset
print(df.shape)
print(df.isnull().sum())

(41188, 21)
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')
age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64
age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month        

  result = method(y)


age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


In [20]:
# Object-dtype (text) columns of the current `df`.
bank_object_data = df.select_dtypes(include=["object"])
print(bank_object_data.info())

# Every remaining column is carried over unchanged.
bank_non_object_data = df.select_dtypes(exclude=["object"])
print(bank_non_object_data.info())

# Integer-encode each text column; the single encoder is refit column by column.
label = LabelEncoder()
bank_object_data = bank_object_data.apply(lambda column: label.fit_transform(column))

print(bank_object_data.head())

# Encoded text columns first, followed by the untouched columns.
bank_final = pd.concat([bank_object_data, bank_non_object_data], axis="columns")

print(bank_final.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30488 entries, 0 to 41187
Data columns (total 11 columns):
job            30488 non-null object
marital        30488 non-null object
education      30488 non-null object
default        30488 non-null object
housing        30488 non-null object
loan           30488 non-null object
contact        30488 non-null object
month          30488 non-null object
day_of_week    30488 non-null object
poutcome       30488 non-null object
y              30488 non-null object
dtypes: object(11)
memory usage: 2.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 30488 entries, 0 to 41187
Data columns (total 10 columns):
age               30488 non-null int64
duration          30488 non-null int64
campaign          30488 non-null int64
pdays             30488 non-null int64
previous          30488 non-null int64
emp.var.rate      30488 non-null float64
cons.price.idx    30488 non-null float64
cons.conf.idx     30488 non-null float64
euribor3m  

__Naive Bayes Classifier: rows with missing data deleted; k-fold with shuffle__

In [23]:
# Target is the encoded 'y' column; every other column is a feature.
Y = bank_final['y']
X = bank_final.drop(columns=['y'])

model = GaussianNB()

# 10 folds, rows shuffled with a fixed seed so the split is repeatable.
kfold = KFold(n_splits=10, shuffle=True, random_state=100)
results_kfold = model_selection.cross_val_score(estimator=model, X=X, y=Y, cv=kfold)

# Mean accuracy as a percentage, then mean with twice the fold standard deviation.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), 2 * results_kfold.std()))

# Show the per-fold scores as the cell result.
results_kfold

Accuracy: 82.58%
Accuracy: 0.83 (+/- 0.02)


array([0.83437193, 0.81600525, 0.82289275, 0.84388324, 0.82322073,
       0.82584454, 0.82322073, 0.83043621, 0.82217848, 0.81594488])

__Naive Bayes Classifier: rows with missing data deleted; k-fold without shuffle__

In [24]:
# Target is the encoded 'y' column; every other column is a feature.
Y = bank_final['y']
X = bank_final.drop(columns=['y'])

model = GaussianNB()

# 10 folds with KFold's shuffle option left at its default (no seed is passed).
kfold = KFold(n_splits=10)
results_kfold = model_selection.cross_val_score(estimator=model, X=X, y=Y, cv=kfold)

# Mean accuracy as a percentage, then mean with twice the fold standard deviation.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), 2 * results_kfold.std()))

# Show the per-fold scores as the cell result.
results_kfold

Accuracy: 82.05%
Accuracy: 0.82 (+/- 0.33)


array([0.97408987, 0.96228272, 0.95080354, 0.93538865, 0.94522794,
       0.93998032, 0.71892424, 0.63463431, 0.60498688, 0.53838583])

__Naive Bayes Classifier: rows with missing data deleted; stratified k-fold with shuffle__

In [25]:
from sklearn.model_selection import StratifiedKFold

# Target is the encoded 'y' column; every other column is a feature.
Y = bank_final['y']
X = bank_final.drop(columns=['y'])

model = GaussianNB()

# 10 stratified folds, shuffled with a fixed seed.
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
results_kfold = model_selection.cross_val_score(estimator=model, X=X, y=Y, cv=skfold)

# Mean accuracy as a percentage, then mean with twice the fold standard deviation.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), 2 * results_kfold.std()))

# Show the per-fold scores as the cell result.
results_kfold

Accuracy: 82.58%
Accuracy: 0.83 (+/- 0.01)


array([0.82551656, 0.82059692, 0.83043621, 0.82256478, 0.82650049,
       0.8278124 , 0.82551656, 0.81633322, 0.82972441, 0.83333333])

__Naive Bayes Classifier: with imputation__

In [7]:
from sklearn.impute import SimpleImputer

# Mean imputer that treats NaN as the missing-value marker.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Reload the raw file so this experiment starts from an unmodified frame.
df = pd.read_csv('bank-data/bank-additional-full.csv')
print(df.shape)
print(df.columns)

# Count the 'unknown' placeholders in every column. isin() is a membership
# test, so numeric columns are never compared against a string (the
# `df == 'unknown'` form triggers an elementwise-comparison warning on some
# pandas/NumPy versions).
print(df.isin(['unknown']).sum())

# Turn the 'unknown' placeholder into a real missing value across the whole
# frame. np.nan is used because the np.NaN alias was removed in NumPy 2.0,
# and it matches the marker the imputer above is configured with.
df = df.replace('unknown', np.nan)
print(df.isnull().sum())


(41188, 21)
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')
age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64


  result = method(y)


age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64


In [8]:
# Object-dtype (text) columns of the current `df`.
bank_object_data = df.select_dtypes(include=["object"])
print(bank_object_data.info())

# Every remaining column.
bank_non_object_data = df.select_dtypes(exclude=["object"])
print(bank_non_object_data.info())

# Run the previously created `imputer` over the non-object values.
values = bank_non_object_data.values
transformed_values = imputer.fit_transform(values)

# Total NaN count after imputation, then in the untouched non-object values.
print(np.isnan(transformed_values).sum())
print(np.isnan(bank_non_object_data.values).sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
job            40858 non-null object
marital        41108 non-null object
education      39457 non-null object
default        32591 non-null object
housing        40198 non-null object
loan           40198 non-null object
contact        41188 non-null object
month          41188 non-null object
day_of_week    41188 non-null object
poutcome       41188 non-null object
y              41188 non-null object
dtypes: object(11)
memory usage: 3.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 10 columns):
age               41188 non-null int64
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m  

In [9]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values in a pandas DataFrame, column by column.

    Columns of dtype object are imputed with the most frequent value
    in column.

    Columns of other types are imputed with mean of column.

    The constructor takes no arguments; the fill values learned by
    ``fit`` are stored in ``self.fill`` (a Series indexed by column name).
    """

    @staticmethod
    def _fill_value(column):
        """Return the fill value for a single column (see class docstring)."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() skips missing entries, so an all-missing column
            # gives an empty result; fall back to NaN (leaving the column
            # unfilled) instead of raising IndexError on index[0].
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns may contain missing values.
        y : ignored
            Accepted only so the signature matches the transformer API.

        Returns
        -------
        self
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the learned fills.

        ``fit`` must have been called first so that ``self.fill`` exists.
        """
        return X.fillna(self.fill)


# Learn the per-column fill values from the object columns, then apply them
# to that same frame (the two-step form of fit_transform).
bank_object_data = DataFrameImputer().fit(bank_object_data).transform(bank_object_data)

In [10]:
# Integer-encode each text column; the single encoder is refit column by column.
label = LabelEncoder()
bank_object_data = bank_object_data.apply(lambda column: label.fit_transform(column))

print(bank_object_data.head())

# Encoded text columns first, followed by the non-object columns.
bank_final = pd.concat([bank_object_data, bank_non_object_data], axis="columns")

print(bank_final.head())

   job  marital  education  default  housing  loan  contact  month  \
0    3        1          0        0        0     0        1      6   
1    7        1          3        0        0     0        1      6   
2    7        1          3        0        1     0        1      6   
3    0        1          1        0        0     0        1      6   
4    7        1          3        0        0     1        1      6   

   day_of_week  poutcome  y  
0            1         1  0  
1            1         1  0  
2            1         1  0  
3            1         1  0  
4            1         1  0  
   job  marital  education  default  housing  loan  contact  month  \
0    3        1          0        0        0     0        1      6   
1    7        1          3        0        0     0        1      6   
2    7        1          3        0        1     0        1      6   
3    0        1          1        0        0     0        1      6   
4    7        1          3        0        0     

In [14]:
# Target is the encoded 'y' column; every other column is a feature.
Y = bank_final['y']
X = bank_final.drop(columns=['y'])

model = GaussianNB()

# 10 folds, rows shuffled with a fixed seed so the split is repeatable.
kfold = KFold(n_splits=10, shuffle=True, random_state=100)
results_kfold = model_selection.cross_val_score(estimator=model, X=X, y=Y, cv=kfold)

# Mean accuracy as a percentage, then mean with twice the fold standard deviation.
print("Accuracy: %.2f%%" % (100.0 * results_kfold.mean()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), 2 * results_kfold.std()))

# Show the per-fold scores as the cell result.
results_kfold

Accuracy: 83.85%
Accuracy: 0.84 (+/- 0.01)


array([0.82592862, 0.83976693, 0.84340859, 0.84292304, 0.83515416,
       0.83806749, 0.83199806, 0.84316582, 0.83997086, 0.84458475])