In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.model_selection import KFold

__Naive Bayes CLassifier: w/o any modification of missing data__

In [2]:
df = pd.read_csv('bank-data/bank-additional-full.csv')

bank_object_data = df.select_dtypes(include="object")
bank_object_data.info()

bank_non_object_data = df.select_dtypes(exclude="object")
print(bank_non_object_data.info())

label = LabelEncoder()
bank_object_data = bank_object_data.apply(label.fit_transform)

print(bank_object_data.head())

bank_final = pd.concat([bank_object_data, bank_non_object_data], axis = 1)
print(bank_final.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
job            41188 non-null object
marital        41188 non-null object
education      41188 non-null object
default        41188 non-null object
housing        41188 non-null object
loan           41188 non-null object
contact        41188 non-null object
month          41188 non-null object
day_of_week    41188 non-null object
poutcome       41188 non-null object
y              41188 non-null object
dtypes: object(11)
memory usage: 3.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 10 columns):
age               41188 non-null int64
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m       

__Naive Bayes CLassifier: w/o any modification of missing data; k fold and shuffle__

In [3]:
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

kfold = KFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))

Accuracy: 84.65%
Accuracy: 0.85 (+/- 0.01)


__Naive Bayes CLassifier: w/o any modification of missing data; k fold and no shuffle__

In [18]:
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

kfold = KFold(n_splits=10)
results_kfold = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Accuracy: 83.35%
Accuracy: 0.83 (+/- 0.34)


array([0.97353727, 0.96649672, 0.96042729, 0.93614955, 0.94391843,
       0.94561787, 0.84219471, 0.62709395, 0.64011656, 0.49902865])

__Naive Bayes CLassifier: w/o any modification of missing data; stratified k fold__

In [16]:
from sklearn.model_selection import StratifiedKFold
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

skfold = StratifiedKFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_val_score(model, X, Y, cv=skfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0)) 
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Accuracy: 83.84%
Accuracy: 0.84 (+/- 0.01)


array([0.8315125 , 0.83515416, 0.84365137, 0.83733916, 0.83855305,
       0.83224084, 0.84340859, 0.84365137, 0.83341428, 0.8455561 ])

__Naive Bayes CLassifier: deleting missing data__

In [19]:
df = pd.read_csv('bank-data/bank-additional-full.csv')
print(df.shape)
print(df.columns)
print((df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']] == 'unknown').sum())

df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']] = df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']].replace('unknown', np.NaN)
print(df.isnull().sum())

df.dropna(inplace=True)
# summarize the number of rows and columns in the dataset
print(df.shape)
print(df.isnull().sum())

(41188, 21)
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')
age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64
age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month        

  result = method(y)


age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


In [20]:
bank_object_data = df.select_dtypes(include="object")
print(bank_object_data.info())

bank_non_object_data = df.select_dtypes(exclude="object")
print(bank_non_object_data.info())

label = LabelEncoder()
bank_object_data = bank_object_data.apply(label.fit_transform)

print(bank_object_data.head())

bank_final = pd.concat([bank_object_data, bank_non_object_data], axis = 1)

print(bank_final.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30488 entries, 0 to 41187
Data columns (total 11 columns):
job            30488 non-null object
marital        30488 non-null object
education      30488 non-null object
default        30488 non-null object
housing        30488 non-null object
loan           30488 non-null object
contact        30488 non-null object
month          30488 non-null object
day_of_week    30488 non-null object
poutcome       30488 non-null object
y              30488 non-null object
dtypes: object(11)
memory usage: 2.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 30488 entries, 0 to 41187
Data columns (total 10 columns):
age               30488 non-null int64
duration          30488 non-null int64
campaign          30488 non-null int64
pdays             30488 non-null int64
previous          30488 non-null int64
emp.var.rate      30488 non-null float64
cons.price.idx    30488 non-null float64
cons.conf.idx     30488 non-null float64
euribor3m  

__Naive Bayes CLassifier: w/o any modification of missing data; k fold and shuffle__

In [23]:
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

kfold = KFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0)) 
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Accuracy: 82.58%
Accuracy: 0.83 (+/- 0.02)


array([0.83437193, 0.81600525, 0.82289275, 0.84388324, 0.82322073,
       0.82584454, 0.82322073, 0.83043621, 0.82217848, 0.81594488])

__Naive Bayes CLassifier: w/o any modification of missing data; k fold and no shuffle__

In [24]:
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

kfold = KFold(n_splits=10)
results_kfold = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0)) 
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Accuracy: 82.05%
Accuracy: 0.82 (+/- 0.33)


array([0.97408987, 0.96228272, 0.95080354, 0.93538865, 0.94522794,
       0.93998032, 0.71892424, 0.63463431, 0.60498688, 0.53838583])

__Naive Bayes CLassifier: w/o any modification of missing data; stratified k fold and shuffle__

In [25]:
from sklearn.model_selection import StratifiedKFold
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

skfold = StratifiedKFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_val_score(model, X, Y, cv=skfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0)) 
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Accuracy: 82.58%
Accuracy: 0.83 (+/- 0.01)


array([0.82551656, 0.82059692, 0.83043621, 0.82256478, 0.82650049,
       0.8278124 , 0.82551656, 0.81633322, 0.82972441, 0.83333333])

__Naive Bayes CLassifier: with imputation__

In [2]:
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

df = pd.read_csv('bank-data/bank-additional-full.csv')
print(df.shape)
print(df.columns)
print((df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']] == 'unknown').sum())

df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']] = df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']].replace('unknown', np.NaN)
print(df.isnull().sum())


(41188, 21)
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')
age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64
age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month        

  result = method(y)


In [3]:
bank_object_data = df.select_dtypes(include="object")
print(bank_object_data.info())

bank_non_object_data = df.select_dtypes(exclude="object")
print(bank_non_object_data.info())

values = bank_non_object_data.values
#imputer = Imputer()
transformed_values = imputer.fit_transform(values)
# count the number of NaN values in each column
print(np.isnan(transformed_values).sum())
print(np.isnan(bank_non_object_data.values).sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 11 columns):
job            40858 non-null object
marital        41108 non-null object
education      39457 non-null object
default        32591 non-null object
housing        40198 non-null object
loan           40198 non-null object
contact        41188 non-null object
month          41188 non-null object
day_of_week    41188 non-null object
poutcome       41188 non-null object
y              41188 non-null object
dtypes: object(11)
memory usage: 3.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 10 columns):
age               41188 non-null int64
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m  

In [4]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


bank_object_data = DataFrameImputer().fit_transform(bank_object_data)

In [5]:
label = LabelEncoder()
bank_object_data = bank_object_data.apply(label.fit_transform)

print(bank_object_data.head())

bank_final = pd.concat([bank_object_data, bank_non_object_data], axis = 1)

print(bank_final.head())

   job  marital  education  default  housing  loan  contact  month  \
0    3        1          0        0        0     0        1      6   
1    7        1          3        0        0     0        1      6   
2    7        1          3        0        1     0        1      6   
3    0        1          1        0        0     0        1      6   
4    7        1          3        0        0     1        1      6   

   day_of_week  poutcome  y  
0            1         1  0  
1            1         1  0  
2            1         1  0  
3            1         1  0  
4            1         1  0  
   job  marital  education  default  housing  loan  contact  month  \
0    3        1          0        0        0     0        1      6   
1    7        1          3        0        0     0        1      6   
2    7        1          3        0        1     0        1      6   
3    0        1          1        0        0     0        1      6   
4    7        1          3        0        0     

In [6]:
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

kfold = KFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0)) 
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Accuracy: 83.85%
Accuracy: 0.84 (+/- 0.01)


array([0.82592862, 0.83976693, 0.84340859, 0.84292304, 0.83515416,
       0.83806749, 0.83199806, 0.84316582, 0.83997086, 0.84458475])

In [7]:
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

kfold = KFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_val_score(model, X, Y, scoring = 'precision', cv=kfold)
print("Precision: %.2f%%" % (results_kfold.mean()*100.0)) 
print("Precision: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Precision: 36.84%
Precision: 0.37 (+/- 0.04)


array([0.33969466, 0.34710744, 0.39164491, 0.38845144, 0.38101266,
       0.37820513, 0.36170213, 0.38263229, 0.33914209, 0.37400531])

In [8]:
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

kfold = KFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_val_score(model, X, Y, scoring = 'recall', cv=kfold)
print("Recall: %.2f%%" % (results_kfold.mean()*100.0)) 
print("Recall: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Recall: 60.68%
Recall: 0.61 (+/- 0.04)


array([0.57419355, 0.57534247, 0.6263048 , 0.62054507, 0.61303462,
       0.61844864, 0.61358811, 0.5961945 , 0.60381862, 0.62666667])

In [9]:
from sklearn.model_selection import StratifiedKFold
X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()

skfold = StratifiedKFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_val_score(model, X, Y, scoring='recall', cv=skfold)
print("Recall: %.2f%%" % (results_kfold.mean()*100.0)) 
print("Recall: %0.2f (+/- %0.2f)" % (results_kfold.mean(), results_kfold.std() * 2))
results_kfold

Recall: 60.58%
Recall: 0.61 (+/- 0.03)


array([0.60344828, 0.58405172, 0.59482759, 0.62931034, 0.60775862,
       0.61422414, 0.61206897, 0.62068966, 0.57758621, 0.61422414])

In [1]:
import matplotlib.pylab as plt
import matplotlib.patches as patches
from sklearn.metrics import roc_curve,auc
from scipy import interp

In [None]:
X_train_res = bank_final.drop(['y'], axis = 1)
y_train_res = bank_final['y']

cv = KFold(n_splits=10, random_state=100, shuffle = True)
classifier = GaussianNB()

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
plt.figure(figsize=(10,10))
i = 1
for train, test in cv.split(X_train_res, y_train_res):
    probas_ = classifier.fit(X_train_res.iloc[train], y_train_res.iloc[train]).predict_proba(X_train_res.iloc[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y_train_res[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate',fontsize=18)
plt.ylabel('True Positive Rate',fontsize=18)
plt.title('Cross-Validation ROC of Naive Bayes',fontsize=14)
plt.legend(loc="lower right", prop={'size': 10})
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold

X_train_res = bank_final.drop(['y'], axis = 1)
y_train_res = bank_final['y']

cv = StratifiedKFold(n_splits=10, random_state=100, shuffle = True)
classifier = GaussianNB()
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
plt.figure(figsize=(10,10))
i = 1
for train, test in cv.split(X_train_res, y_train_res):
    probas_ = classifier.fit(X_train_res.iloc[train], y_train_res.iloc[train]).predict_proba(X_train_res.iloc[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y_train_res[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

    i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate',fontsize=18)
plt.ylabel('True Positive Rate',fontsize=18)
plt.title('Cross-Validation ROC of Decision Tree',fontsize=14)
plt.legend(loc="lower right", prop={'size': 10})
plt.show()

In [23]:
import pandas as pd
import numpy as np
import statistics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score
from sklearn import model_selection
from sklearn.base import TransformerMixin
from sklearn.model_selection import KFold
import matplotlib.pylab as plt
import matplotlib.patches as patches
from sklearn.metrics import roc_curve,auc
from numpy import interp
from joblib import dump
import os
import joblib

In [24]:
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


def preprocess(df):
    df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']] = df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']].replace('unknown', np.NaN)
    bank_object_data = df.select_dtypes(include="object")
    bank_non_object_data = df.select_dtypes(exclude="object")
    bank_object_data = DataFrameImputer().fit_transform(bank_object_data)
    label = LabelEncoder()
    bank_object_data = bank_object_data.apply(label.fit_transform)
    bank_final = pd.concat([bank_object_data, bank_non_object_data], axis = 1)
    return(bank_final)

In [25]:
df = pd.read_csv('bank-data/bank-additional-full.csv')
bank_final = preprocess(df)

In [28]:
bank_final['y']
count = 0
for i in bank_final.index:
    if bank_final['y'][i] == 1:
        count += 1
print(count/len(bank_final))
print(1 - count/len(bank_final))
print(count, len(bank_final) - count, len(bank_final))

0.11265417111780131
0.8873458288821987
4640 36548 41188


In [4]:
import timeit

start = timeit.default_timer()
from datetime import datetime

# current date and time
begin = datetime.now()


X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = GaussianNB()
scoring = ['accuracy', 'recall', 'precision', 'f1', 'roc_auc']

kfold = KFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_validate(model, X, Y, scoring=scoring, cv=kfold)
print("Recall: %0.2f (+/- %0.2f)" % (results_kfold['test_recall'].mean(), results_kfold['test_recall'].std()))
print("Precision: %0.2f (+/- %0.2f)" % (results_kfold['test_precision'].mean(), results_kfold['test_precision'].std()))
print("F1 Score: %0.2f (+/- %0.2f)" % (results_kfold['test_f1'].mean(), results_kfold['test_f1'].std()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold['test_accuracy'].mean(), results_kfold['test_accuracy'].std()))
print("ROC_AUC: %0.2f (+/- %0.2f)" % (results_kfold['test_roc_auc'].mean(), results_kfold['test_roc_auc'].std()))

print(results_kfold)
stop = timeit.default_timer()
end = datetime.now()
print('Start Time: ', begin,
      'Stop Time: ', end,
      'Time Taken: ', stop - start)

Recall: 0.61 (+/- 0.02)
Precision: 0.37 (+/- 0.02)
F1 Score: 0.46 (+/- 0.02)
Accuracy: 0.84 (+/- 0.01)
ROC_AUC: 0.86 (+/- 0.01)
{'fit_time': array([0.07039976, 0.04965115, 0.02400637, 0.02688217, 0.02620602,
       0.02402973, 0.02407336, 0.02411222, 0.02218795, 0.02726746]), 'score_time': array([0.04971814, 0.01660681, 0.01203656, 0.00806952, 0.01198316,
       0.01201344, 0.01202869, 0.01205182, 0.01369023, 0.01286173]), 'test_accuracy': array([0.82592862, 0.83976693, 0.84340859, 0.84292304, 0.83515416,
       0.83806749, 0.83199806, 0.84316582, 0.83997086, 0.84458475]), 'test_recall': array([0.57419355, 0.57534247, 0.6263048 , 0.62054507, 0.61303462,
       0.61844864, 0.61358811, 0.5961945 , 0.60381862, 0.62666667]), 'test_precision': array([0.33969466, 0.34710744, 0.39164491, 0.38845144, 0.38101266,
       0.37820513, 0.36170213, 0.38263229, 0.33914209, 0.37400531]), 'test_f1': array([0.42685851, 0.43298969, 0.48192771, 0.47780468, 0.46994536,
       0.46937152, 0.45511811, 0.4661

In [14]:
import timeit

start = timeit.default_timer()
from datetime import datetime

# current date and time
begin = datetime.now()


X = bank_final.drop(['y'], axis = 1)
Y = bank_final['y']

model = ComplementNB()
scoring = ['accuracy', 'recall', 'precision', 'f1', 'roc_auc']

kfold = KFold(n_splits=10, random_state=100, shuffle = True)
results_kfold = model_selection.cross_validate(model, X, Y, scoring=scoring, cv=kfold)
print("Recall: %0.2f (+/- %0.2f)" % (results_kfold['test_recall'].mean(), results_kfold['test_recall'].std()))
print("Precision: %0.2f (+/- %0.2f)" % (results_kfold['test_precision'].mean(), results_kfold['test_precision'].std()))
print("F1 Score: %0.2f (+/- %0.2f)" % (results_kfold['test_f1'].mean(), results_kfold['test_f1'].std()))
print("Accuracy: %0.2f (+/- %0.2f)" % (results_kfold['test_accuracy'].mean(), results_kfold['test_accuracy'].std()))
print("ROC_AUC: %0.2f (+/- %0.2f)" % (results_kfold['test_roc_auc'].mean(), results_kfold['test_roc_auc'].std()))

print(results_kfold)
stop = timeit.default_timer()
end = datetime.now()
print('Start Time: ', begin,
      'Stop Time: ', end,
      'Time Taken: ', stop - start)

Recall: nan (+/- nan)
Precision: nan (+/- nan)
F1 Score: nan (+/- nan)
Accuracy: nan (+/- nan)
ROC_AUC: nan (+/- nan)
{'fit_time': array([0.01657534, 0.01475739, 0.01486588, 0.0147202 , 0.01314211,
       0.01012444, 0.00904322, 0.01173854, 0.00903773, 0.01012301]), 'score_time': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'test_accuracy': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), 'test_recall': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), 'test_precision': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), 'test_f1': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]), 'test_roc_auc': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])}
Start Time:  2020-04-21 07:53:37.741564 Stop Time:  2020-04-21 07:53:37.916564 Time Taken:  0.1725308000000041


ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

ValueError: Negative values in data passed to ComplementNB (input X)

