# Machine Learning Engineer Nanodegree Capstone Project

# Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2, SelectKBest, SelectPercentile, RFE, RFECV
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, OneClassSVM
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, IsolationForest
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from matplotlib import pyplot as plt
import seaborn as sns
import time
%matplotlib notebook
%matplotlib inline

# Reading Dataset

In [2]:
df = pd.read_csv('bank/bank-full.csv', sep= ';')
num_col = len(list(df.columns))
pd.set_option('display.max_columns', num_col * 3)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# One Hot Encoding the categorical Features

In [3]:
col = ['job', 'contact', 'marital','education', 'poutcome', 'month', 'day']
df = pd.get_dummies(df, columns = col)

In [4]:
labels = ['housing', 'default', 'loan', 'y']
for label in labels:
    label_encoder = LabelEncoder()
    label_encoder.fit(df[label])
    df[label] = label_encoder.transform(df[label])

In [5]:
df.head()

Unnamed: 0,age,default,balance,housing,loan,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,contact_cellular,contact_telephone,contact_unknown,...,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,58,0,2143,1,0,261,1,-1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,44,0,29,1,0,151,1,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,33,0,2,1,1,76,1,-1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,47,0,1506,1,0,92,1,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,33,0,1,0,0,198,1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,45211.0,40.936210,10.618762,18.0,33.0,39.0,48.0,95.0
default,45211.0,0.018027,0.133049,0.0,0.0,0.0,0.0,1.0
balance,45211.0,1362.272058,3044.765829,-8019.0,72.0,448.0,1428.0,102127.0
housing,45211.0,0.555838,0.496878,0.0,0.0,1.0,1.0,1.0
loan,45211.0,0.160226,0.366820,0.0,0.0,0.0,0.0,1.0
duration,45211.0,258.163080,257.527812,0.0,103.0,180.0,319.0,4918.0
campaign,45211.0,2.763841,3.098021,1.0,1.0,2.0,3.0,63.0
pdays,45211.0,40.197828,100.128746,-1.0,-1.0,-1.0,-1.0,871.0
previous,45211.0,0.580323,2.303441,0.0,0.0,0.0,0.0,275.0
y,45211.0,0.116985,0.321406,0.0,0.0,0.0,0.0,1.0


In [7]:
#df=(df-df.min())/(df.max()-df.min())
labels = df.y
scaler = MinMaxScaler()
features = pd.DataFrame(scaler.fit_transform(df.drop(['y'], axis = 1)))
#features = df.drop(['y'], axis = 1)

In [8]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,...,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77
0,0.519481,0.0,0.092259,1.0,0.0,0.05307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.337662,0.0,0.073067,1.0,0.0,0.030704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.194805,0.0,0.072822,1.0,1.0,0.015453,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.376623,0.0,0.086476,1.0,0.0,0.018707,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.194805,0.0,0.072812,0.0,0.0,0.04026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
features.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,45211.0,0.297873,0.137906,0.0,0.194805,0.272727,0.389610,1.0
1,45211.0,0.018027,0.133049,0.0,0.000000,0.000000,0.000000,1.0
2,45211.0,0.085171,0.027643,0.0,0.073457,0.076871,0.085768,1.0
3,45211.0,0.555838,0.496878,0.0,0.000000,1.000000,1.000000,1.0
4,45211.0,0.160226,0.366820,0.0,0.000000,0.000000,0.000000,1.0
5,45211.0,0.052494,0.052364,0.0,0.020943,0.036600,0.064864,1.0
6,45211.0,0.028449,0.049968,0.0,0.000000,0.016129,0.032258,1.0
7,45211.0,0.047245,0.114827,0.0,0.000000,0.000000,0.000000,1.0
8,45211.0,0.002110,0.008376,0.0,0.000000,0.000000,0.000000,1.0
9,45211.0,0.114375,0.318269,0.0,0.000000,0.000000,0.000000,1.0


In [10]:
labels.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [None]:
pd.DataFrame(features, columns = df.drop(['y'], axis = 1).columns).head()

In [20]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=1)

In [17]:
#training = pd.DataFrame(columns = ["Classifier", "Cross Validation Accuracy", "Cross Validation F1-Score", "Cross Validation AUC"])
def classifier(feature_set):
    scaler = MinMaxScaler()
    feature_set = scaler.fit_transform(feature_set)
    X_train, X_test, y_train, y_test = train_test_split(feature_set, labels, test_size=0.2, random_state=1)
    testing = pd.DataFrame(columns = ["Classifier", "Accuracy", "F1-Score", "AUC"])
    #accuracy = []
    #rng = np.random.RandomState(1)
    names = [  "Random Forest", "AdaBoost", "Logistic Regression", "DecisionTreeClassifier",
             "GaussianNB", "MultinomialNB", "MLPClassifier", "SVM-RBF"]
    #"Isolation Forest", "OneClassSVM",
    classifiers = [
        RandomForestClassifier(n_jobs = -1, random_state = 1, class_weight = "balanced"),
        AdaBoostClassifier(),
        LogisticRegression(penalty ='l2', solver ='liblinear'),
        DecisionTreeClassifier(random_state = 1, class_weight = 'balanced'),
        #IsolationForest(random_state=1),
        #OneClassSVM(kernel="rbf", gamma="auto"),
        GaussianNB(),
        MultinomialNB(),
        #KNeighborsClassifier(n_jobs = -1),
        MLPClassifier(early_stopping = True, max_iter = 100),
        #SVC(kernel = 'linear'),
        SVC(class_weight = "balanced")
        ]
    p_grid = {"GaussianNB":{},
              "Random Forest": {'n_estimators': [10, 20, 50], 'max_features': ['sqrt', 'log2'], "criterion": ["entropy", "gini"]},
              "AdaBoost": {"learning_rate": [0.1, 0.3, 1, 3], "n_estimators" : [50, 100, 500]},
              "Logistic Regression": {"max_iter": [50, 200, 500]},
              "DecisionTreeClassifier": {"max_depth": [5, 15, 25], "criterion": ["entropy", "gini"]}, 
              #"Isolation Forest": {"contamination": [0.01, 0.1, 0.5]},
              #"OneClassSVM": {"nu" : [0.01, 0.03, 0.1, 0.3]},
              "MultinomialNB": {"alpha" : [0, 1, 2]}, 
              #"KNeighborsClassifier":{"n_neighbors" : [1, 5, 11], "algorithm" : ['ball_tree', 'kd_tree']}, 
              "MLPClassifier":{"hidden_layer_sizes" : [(8, 16, 8), (32, 64, 32)], "activation": ['logistic', 'relu'], "solver" : ['sgd', 'adam']},
              #"SVM": {"gamma":[0.1,1], "C":[1, 10]},
              "SVM-RBF": {"kernel": ["poly", "rbf"], "C":[0.1, 1, 5]}          
             }

    for name, classifier in zip(names, classifiers):
        start = time.time()
        cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state = 1)
        clf = GridSearchCV(estimator = classifier, param_grid = p_grid[name], cv = cv, n_jobs = -1, scoring = "roc_auc")
        clf.fit(X_train, y_train)

        #acc = np.mean(cross_val_score(clf, X_train, y_train, scoring = "accuracy", cv = cv))
        #f1 = np.mean(cross_val_score(clf, X_train, y_train, scoring = "f1", cv = cv))
        #auc = np.mean(cross_val_score(clf, X_train, y_train, scoring = "roc_auc", cv = cv))

        #training.loc[-1] = [name, acc, f1, auc]
        #training.index = training.index + 1
        #training = training.sort_index()
        print(clf.best_score_)
        """
        if name == "OneClassSVM" or name == "Isolation Forest":
            predicted_y = clf.predict(X_test)
            y = [0]*len(predicted_y)
            for i in range(len(predicted_y)):
                if predicted_y[i] == -1.0:
                    y[i] = 1
            acc = accuracy_score(y_test, y)
            f1 = f1_score(y_test, y)
            auc = roc_auc_score(y_test, y)

            testing.loc[-1] = [name, acc, f1, auc]
            testing.index = testing.index + 1
            testing = testing.sort_index()
            print("-")
            print(name + " took " + str(time.time() - start) + " seconds.")

            continue
        """
        acc = accuracy_score(y_test, clf.predict(X_test))
        f1 = f1_score(y_test, clf.predict(X_test))
        auc = roc_auc_score(y_test, clf.predict(X_test))

        testing.loc[-1] = [name, acc, f1, auc]
        testing.index = testing.index + 1
        testing = testing.sort_index()
        print(name + " took " + str(time.time() - start) + " seconds.")
    return testing

In [46]:
testing

Unnamed: 0,Classifier,Accuracy,F1-Score,AUC
0,SVM-RBF,0.853699,0.577721,0.857263
1,MLPClassifier,0.903904,0.520155,0.706145
2,MultinomialNB,0.886321,0.37697,0.630017
3,GaussianNB,0.837443,0.435917,0.708672
4,DecisionTreeClassifier,0.814221,0.494281,0.800185
5,Logistic Regression,0.900365,0.424281,0.646648
6,AdaBoost,0.901139,0.462094,0.668595
7,Random Forest,0.902245,0.419948,0.642748


# Feature Selection

In [51]:
pvalues = chi2(features, labels)[1]

print("Features with their respective pvalues : ")
for i in range(len(pvalues)):
    print(df.columns[i], pvalues[i])
k_best_features = pd.DataFrame(SelectKBest(chi2, k=25).fit_transform(features, labels))

Selected Features with pvalues greater than threshold : 
age
default
balance
campaign
previous
y
job_blue-collar
job_entrepreneur
job_housemaid
job_retired
job_self-employed
job_student
job_technician
job_unemployed
contact_cellular
contact_unknown
marital_divorced
marital_married
marital_single
education_primary
education_tertiary
education_unknown
poutcome_failure
month_apr
month_dec
month_feb
month_jan
month_jul
month_may
month_sep
day_1
day_2
day_3
day_4
day_5
day_6
day_7
day_8
day_9
day_10
day_11
day_12
day_13
day_14
day_15
day_16
day_17
day_18
day_19
day_20
day_21
day_22
day_23
day_24
day_25
day_26
day_27
day_28
day_29
day_30


In [53]:
k_best_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,1.0,0.0,0.05307,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.030704,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.015453,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.018707,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.04026,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [58]:
testing = classifier(k_best_features)

0.870274532397
Random Forest took 13.832303762435913 seconds.
0.907855139157
AdaBoost took 61.22598385810852 seconds.
0.902515358648
Logistic Regression took 4.266242504119873 seconds.
0.875501495576
DecisionTreeClassifier took 3.917907953262329 seconds.
0.823499262593
GaussianNB took 2.317765712738037 seconds.
0.7635218991
MultinomialNB took 3.3285582065582275 seconds.
0.909938840685
MLPClassifier took 14.221762418746948 seconds.
0.911918843835
SVM-RBF took 1066.9422872066498 seconds.


In [59]:
testing

Unnamed: 0,Classifier,Accuracy,F1-Score,AUC
0,SVM-RBF,0.840761,0.551122,0.841258
1,MLPClassifier,0.898596,0.439829,0.657229
2,MultinomialNB,0.887648,0.367372,0.624149
3,GaussianNB,0.866969,0.438113,0.684424
4,DecisionTreeClassifier,0.814995,0.501638,0.80931
5,Logistic Regression,0.899259,0.411879,0.640645
6,AdaBoost,0.901802,0.459196,0.666075
7,Random Forest,0.887427,0.480612,0.696824


In [11]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 15)
pca_features = pca.fit_transform(features)

In [12]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,...,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77
0,0.519481,0.0,0.092259,1.0,0.0,0.05307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.337662,0.0,0.073067,1.0,0.0,0.030704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.194805,0.0,0.072822,1.0,1.0,0.015453,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.376623,0.0,0.086476,1.0,0.0,0.018707,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.194805,0.0,0.072812,0.0,0.0,0.04026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
pca_features[0]

array([-0.53865932,  1.47342688, -0.35651345,  0.47844058, -0.75567369,
       -0.07831359, -0.10157215, -0.11878161, -0.21054658,  0.05404749,
       -0.01818127, -0.13697415, -0.02002092, -0.00955932, -0.0155576 ])

In [18]:
testing = classifier(pca_features)

0.721059313601
Random Forest took 43.03697919845581 seconds.
0.735936735324
AdaBoost took 191.69104862213135 seconds.
0.714872310969
Logistic Regression took 4.420344591140747 seconds.
0.721721997649
DecisionTreeClassifier took 7.4167845249176025 seconds.
0.720553100813
GaussianNB took 2.4842073917388916 seconds.
0.709700650626
MultinomialNB took 3.3328890800476074 seconds.


  'precision', 'predicted', average, warn_for)


0.67776572991
MLPClassifier took 11.646566390991211 seconds.


  'precision', 'predicted', average, warn_for)


0.743611310205
SVM-RBF took 870.3438429832458 seconds.


In [19]:
testing

Unnamed: 0,Classifier,Accuracy,F1-Score,AUC
0,SVM-RBF,0.748977,0.376374,0.707024
1,MLPClassifier,0.883888,0.0,0.5
2,MultinomialNB,0.883888,0.0,0.5
3,GaussianNB,0.877917,0.108239,0.524336
4,DecisionTreeClassifier,0.782926,0.371035,0.682383
5,Logistic Regression,0.883778,0.0,0.499937
6,AdaBoost,0.8861,0.078712,0.519451
7,Random Forest,0.875484,0.270725,0.581696
