## Import Libraries

In [1]:
from IPython.display import Image
from os import system
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
# from sklearn.feature_extraction.text import CountVectorizer  #DT does not take strings as input for the model fit step....
#import pydotplus as pydot

KeyboardInterrupt: 

In [2]:
import pandas as pd

In [3]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)

# Load the dataset from the notebook's working directory and preview it.
bankdata = pd.read_csv("bank_full.csv")
bankdata.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'bank_full.csv'

In [None]:
# Size of the loaded dataset as a (rows, columns) tuple.
bankdata.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
bankdata.describe()

In [None]:
# Column dtypes and non-null counts. Columns reported as 'object' hold strings
# and have to be encoded as numbers before they can be used to fit a model.
bankdata.info()

Let's convert the columns with an 'object' datatype into categorical variables.

In [None]:
# Switch every string ('object') column to pandas' categorical dtype.
# This only changes the dtype; the labels are still strings until they are
# mapped to numbers / one-hot encoded further down.
for feature, col_dtype in bankdata.dtypes.items():
    if col_dtype == 'object':
        bankdata[feature] = pd.Categorical(bankdata[feature])
bankdata.head(10)

In [None]:
# Frequency of each label in every categorical column (including the target).
categorical_columns = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "poutcome", "Target",
]
for column in categorical_columns:
    print(bankdata[column].value_counts())

In [None]:
# Label -> integer codes, keyed by column name (passed to DataFrame.replace).
# NOTE(review): 'default' and 'housing' encode no/yes as 1/2 while 'loan' and
# 'Target' use 0/1, and 'poutcome' places "unknown" between "failure" and
# "other" -- confirm these codes are intentional.
replaceStruct = dict(
    education={"primary": 0, "secondary": 1, "tertiary": 2, "unknown": -1},
    poutcome={"failure": -1, "unknown": 1, "other": 2, "success": 3},
    loan={"no": 0, "yes": 1},
    contact={"unknown": 0, "telephone": 1, "cellular": 2},
    default={"no": 1, "yes": 2},
    housing={"no": 1, "yes": 2},
    Target={"no": 0, "yes": 1},
)

# Columns that are one-hot encoded instead of being mapped to integer codes.
oneHotCols = ["marital", "job", "month"]

In [None]:
# Map labels to their integer codes, then one-hot encode the remaining
# categorical columns, in a single pass.
bankdata = pd.get_dummies(
    bankdata.replace(replaceStruct),
    columns=oneHotCols,
)
bankdata.head(10)

In [None]:
# Re-inspect the column dtypes now that the label mapping and one-hot encoding have run.
bankdata.info()

## Split Data

In [None]:
# Separate the predictors from the label without mutating `bankdata`.
# Plain column selection is used instead of DataFrame.pop, because pop deletes
# "Target" from the frame in place: running this cell a second time then fails
# with a KeyError in the drop() call, and `bankdata` no longer matches what the
# earlier cells displayed.
X = bankdata.drop("Target", axis=1)
y = bankdata["Target"]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

## Build Decision Tree Model

We will build our model using the `DecisionTreeClassifier` class, with the default 'gini' criterion to choose splits. The other options include 'entropy'.

In [None]:
# Fit an unconstrained (no depth limit) Gini tree; fit() returns the estimator itself.
deciTree = DecisionTreeClassifier(
    criterion='gini',
    random_state=1,
).fit(X_train, y_train)

## Scoring our Decision Tree

In [None]:
# Mean accuracy on the training split first, then on the held-out test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(deciTree.score(features, labels))

## Visualizing the Decision Tree

In [None]:
# `deciTree` was grown without a depth limit, so drawing every node is slow and
# the resulting figure is unreadable. Only the top levels are rendered, on a
# larger canvas, with column names on the split conditions.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    deciTree,
    max_depth=3,
    feature_names=list(X_train.columns),
    filled=True,
    fontsize=9,
)
# plt.show() keeps the list of annotation objects out of the cell output.
plt.show()

`tree.export_graphviz` outputs a description of the tree in Graphviz DOT format, a plain-text format for describing graph structures. You can render it by either:

1. pasting the DOT text into http://webgraphviz.com/, or
2. generating an image file with the `dot` command (this only works if Graphviz is installed on your machine).


## Reducing over fitting (Regularization)

In [None]:
# Same Gini tree, but capped at depth 6 to limit over-fitting.
dTreeR = DecisionTreeClassifier(criterion='gini', max_depth=6, random_state=1)
dTreeR_model = dTreeR.fit(X_train, y_train)

# Mean accuracy on the training split, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(dTreeR.score(features, labels))

In [None]:
import graphviz
# out_file=None makes export_graphviz return the DOT description as a string
# instead of writing it to a file.
dot_data = tree.export_graphviz(dTreeR,out_file=None)
# Wrap the DOT text so the notebook can display the depth-limited tree inline.
graph = graphviz.Source(dot_data)
graph

In [None]:
# Feature importances of the depth-limited tree. The importance of a feature is
# the (normalized) total reduction of the split criterion brought by that
# feature, also known as the Gini importance.
importance_table = pd.DataFrame(
    {"Imp": dTreeR.feature_importances_},
    index=X_train.columns,
)
print(importance_table)

In [None]:
# Test accuracy and confusion matrix of the depth-limited tree.
print(dTreeR.score(X_test, y_test))
y_predict = dTreeR.predict(X_test)

# Rows are the true classes, columns the predicted classes (0 = "No", 1 = "Yes").
class_names = ["No", "Yes"]
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')


#                             Ensemble Learning - Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bag 20 copies of the unconstrained decision tree.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the first positional argument works on both old and new
# releases.
bgcl = BaggingClassifier(deciTree, n_estimators=20, random_state=1)
#bgcl = BaggingClassifier(n_estimators=50,random_state=1)

bgcl = bgcl.fit(X_train, y_train)


In [None]:
# Test accuracy and confusion matrix of the bagging ensemble.
y_predict = bgcl.predict(X_test)
print(bgcl.score(X_test, y_test))

# Rows are the true classes, columns the predicted classes (0 = "No", 1 = "Yes").
class_names = ["No", "Yes"]
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')

# Ensemble Learning - AdaBoosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 10 boosting rounds and the library's default base learner.
#abcl = AdaBoostClassifier( n_estimators=50,random_state=1)
abcl = AdaBoostClassifier(
    n_estimators=10,
    random_state=1,
).fit(X_train, y_train)


In [None]:
# Test accuracy and confusion matrix of the AdaBoost ensemble.
y_predict = abcl.predict(X_test)
print(abcl.score(X_test, y_test))

# Rows are the true classes, columns the predicted classes (0 = "No", 1 = "Yes").
class_names = ["No", "Yes"]
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')



#                     Ensemble Learning - GradientBoost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 50 boosting stages; fit() returns the estimator itself.
gbcl = GradientBoostingClassifier(
    n_estimators=50,
    random_state=1,
).fit(X_train, y_train)

In [None]:
# Test accuracy and confusion matrix of the gradient-boosting ensemble.
y_predict = gbcl.predict(X_test)
print(gbcl.score(X_test, y_test))

# Rows are the true classes, columns the predicted classes (0 = "No", 1 = "Yes").
class_names = ["No", "Yes"]
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')

# Ensemble RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees; each split considers 12 candidate features.
rfcl = RandomForestClassifier(
    n_estimators=50,
    random_state=1,
    max_features=12,
).fit(X_train, y_train)


In [None]:
# Test accuracy and confusion matrix of the random forest.
y_predict = rfcl.predict(X_test)
print(rfcl.score(X_test, y_test))

# Rows are the true classes, columns the predicted classes (0 = "No", 1 = "Yes").
class_names = ["No", "Yes"]
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')

## Naive Bayes

In [None]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)

# Start again from the raw file: the frame used above has already been encoded.
bankdata = pd.read_csv("bank_full.csv")
bankdata.head(10)

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The freshly loaded frame still contains string columns, and since pandas 2.0
# DataFrame.corr() raises on them instead of silently dropping them, so the
# numeric/boolean columns are selected explicitly (works on older pandas too).
bankdata.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# However we want to see correlation in graphical representation so below is function for that
def plot_corr(df, size=11):
    """Draw the pairwise correlation matrix of ``df`` as a colour-coded grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Only its numeric and boolean columns are correlated;
        all other columns (e.g. strings) are ignored.
    size : int or float, default 11
        Width and height of the square figure, in inches.

    Returns
    -------
    None
        The matrix is drawn on a newly created matplotlib figure.
    """
    # Select numeric/bool columns explicitly: since pandas 2.0, DataFrame.corr()
    # raises on string columns instead of silently dropping them.
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    # Label both axes with the column names, one tick per column.
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
    

In [None]:
# Draw the correlation matrix of the freshly loaded dataset.
plot_corr(bankdata)

In [None]:
# Switch every string ('object') column to pandas' categorical dtype.
# This only changes the dtype; the labels stay strings until they are mapped
# to numbers / one-hot encoded in the following cells.
string_columns = [name for name in bankdata.columns if bankdata[name].dtype == 'object']
for feature in string_columns:
    bankdata[feature] = pd.Categorical(bankdata[feature])
bankdata.head(10)

In [None]:
# Label -> integer codes, keyed by column name (passed to DataFrame.replace).
replaceStruct = dict(
    education={"primary": 0, "secondary": 1, "tertiary": 2, "unknown": -1},
    poutcome={"failure": -1, "unknown": 1, "other": 2, "success": 3},
    loan={"no": 0, "yes": 1},
    contact={"unknown": 0, "telephone": 1, "cellular": 2},
    default={"no": 1, "yes": 2},
    housing={"no": 1, "yes": 2},
    Target={"no": 0, "yes": 1},
)

# Columns that are one-hot encoded instead of being mapped to integer codes.
oneHotCols = ["marital", "job", "month"]


In [None]:
# Map labels to their integer codes, then one-hot encode the remaining
# categorical columns, in a single pass.
bankdata = pd.get_dummies(
    bankdata.replace(replaceStruct),
    columns=oneHotCols,
)
bankdata.head(10)

In the correlation plot above, yellow represents the maximum correlation and blue represents the minimum correlation.
We can see that none of the variables is strongly correlated with any other variable.

In [None]:
from sklearn.model_selection import train_test_split

# Predictors: every column except the label.
X = bankdata.drop('Target', axis=1)
# Label: 1 = "yes", 0 = "no" (see the "Target" entry of replaceStruct).
Y = bankdata['Target']

# 70/30 train/test split; the seed value itself is arbitrary, it only makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

x_train.head()

In [None]:
# Class balance of the full dataset: count and percentage of each Target value.
total_rows = len(bankdata.index)
true_rows = len(bankdata.loc[bankdata['Target'] == 1])
false_rows = len(bankdata.loc[bankdata['Target'] == 0])

print("Original Target True Values    : {0} ({1:0.2f}%)".format(true_rows, (true_rows/total_rows) * 100))
print("Original Target False Values   : {0} ({1:0.2f}%)".format(false_rows, (false_rows/total_rows) * 100))

In [None]:
from sklearn.naive_bayes import GaussianNB # using Gaussian algorithm from Naive Bayes

# Create the model.
pl_model = GaussianNB()

# Series.ravel() is deprecated since pandas 2.2 (a Series is already 1-D);
# to_numpy() yields the same flat array of labels without the warning.
pl_model.fit(x_train, y_train.to_numpy())

In [None]:
from sklearn import metrics

# Accuracy of the Naive Bayes model on the rows it was trained on.
pl_train_predict = pl_model.predict(x_train)
train_accuracy = metrics.accuracy_score(y_train, pl_train_predict)

print("Model Accuracy: {0:.4f}".format(train_accuracy))
print()

In [None]:
from sklearn import metrics

# Accuracy of the Naive Bayes model on the held-out test rows.
pl_test_predict = pl_model.predict(x_test)
test_accuracy = metrics.accuracy_score(y_test, pl_test_predict)

print("Model Accuracy: {0:.4f}".format(test_accuracy))
print()

### Let's check the confusion matrix and classification report

In [None]:
# Confusion matrix of the Naive Bayes model on the test split.
# labels=[1, 0] puts the positive class first: rows are the true classes,
# columns the predicted classes.
print("Confusion Matrix")
cm = metrics.confusion_matrix(y_test, pl_test_predict, labels=[1, 0])
print(cm)

df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
# fmt='g' prints the cell counts as plain integers; seaborn's default format
# ('.2g') would show large counts in scientific notation. This matches the
# other confusion-matrix heatmaps in the notebook.
sns.heatmap(df_cm, annot=True, fmt='g')

In [None]:
# Per-class precision, recall and F1 on the test split, positive class listed first.
print("Classification Report")
nb_report = metrics.classification_report(y_test, pl_test_predict, labels=[1, 0])
print(nb_report)

In [None]:
# Imports for the ROC analysis below; `pl` is the pylab plotting interface.
import pylab as pl
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc
# Seeded NumPy random generator.
# NOTE(review): neither `random_state` nor `shuffle` is referenced by the
# visible cells -- confirm whether they are still needed.
random_state = np.random.RandomState(0)

In [None]:
# Fit a fresh Gaussian Naive Bayes model and score the test rows.
classifier2 = GaussianNB()
classifier2.fit(x_train, y_train)
probas1_ = classifier2.predict_proba(x_test)

# ROC curve and area under it for the Naive Bayes model, using the predicted
# probability of the positive class (second column of predict_proba).
positive_scores = probas1_[:, 1]
fpr1, tpr1, thresholds1 = roc_curve(y_test, positive_scores)
roc_auc1 = auc(fpr1, tpr1)

print("Area under the ROC curve : %f" % roc_auc1)
print('Gini Coeffiecient=',2*(roc_auc1)-1)

In [None]:
# Plot the ROC curve of the Gaussian Naive Bayes classifier.
pl.clf()
# The curve was produced by GaussianNB, so the legend names that model (it
# previously said "logistic", which does not match the fitted classifier).
pl.plot(fpr1, tpr1, label='ROC curve for Gaussian Naive Bayes (area = %0.2f)' % roc_auc1)

# Dashed diagonal: the ROC curve of a classifier that guesses at random.
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic')
pl.legend(loc="lower right")
pl.show()

Although the prediction results from the Decision Tree, Bagging, and Random Forest models are similar, the Random Forest has the highest prediction accuracy.