In [None]:
# Author       : Shefali Mangal
# E-Mail       : shefalimangal24@gmail.com
# Contact      : +91-9165001002
# Designation  : Software Developer
# Decision Tree for Financial Loan EMI default detection
# Data         : bank.csv

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import datasets
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
%matplotlib inline

In [None]:
# Read the bank dataset from the Kaggle input directory and preview the first rows
bank = pd.read_csv(filepath_or_buffer="/kaggle/input/bankbalanced/bank.csv")
bank.head(n=5)

In [None]:
# Select the rows holding at least one null and count their non-null values per column.
# All-zero counts mean that no row has a missing value (the author found none).
rows_with_nulls = bank.loc[bank.isna().any(axis="columns")]
rows_with_nulls.count()

In [None]:
# Descriptive statistics of the raw dataset
bank.describe()

In [None]:
# Box plot of customer age.
# The column is passed explicitly as `x`: seaborn deprecated positional data arguments,
# and newer releases interpret the first positional argument as `data` instead.
a = sns.boxplot(x=bank["age"])

In [None]:
# Distribution of age.
# `sns.distplot` is deprecated; `histplot` with a density-normalised histogram and a KDE
# overlay draws the equivalent figure (cut=3 mirrors distplot's KDE extent).
sns.histplot(bank["age"], bins=200, kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# Box plot of call duration.
# The column is passed explicitly as `x`: seaborn deprecated positional data arguments,
# and newer releases interpret the first positional argument as `data` instead.
a = sns.boxplot(x=bank["duration"])

In [None]:
# Distribution of call duration.
# `sns.distplot` is deprecated; `histplot` with a density-normalised histogram and a KDE
# overlay draws the equivalent figure (cut=3 mirrors distplot's KDE extent).
sns.histplot(bank["duration"], bins=200, kde=True, stat="density", kde_kws={"cut": 3})

# Convert categorical data

In [None]:
# Work on a copy so the raw `bank` frame is not modified by the transformations below
bank_data = bank.copy()
# Preview a few rows rather than printing the entire frame
bank_data.head()

# EDA

In [None]:
# Explore People who made a deposit Vs Job category

In [None]:
# List every column name of the raw dataset, one per line
for column_name in bank.columns.tolist():
    print(column_name)

In [None]:
# Number of customers who made a deposit, per job category
jobs = [
    'management', 'blue-collar', 'technician', 'admin.',
    'services', 'retired', 'self-employed', 'student',
    'unemployed', 'entrepreneur', 'housemaid', 'unknown',
]

made_deposit = bank_data.deposit == "yes"
for j in jobs:
    depositors_in_job = bank_data[made_deposit & (bank_data.job == j)]
    print("{:15} : {:5}".format(j, len(depositors_in_job)))

In [None]:
# Different types of job categories and their counts

In [None]:
# Frequency of each job category
bank_data["job"].value_counts()

In [None]:
# Combine similar jobs into categories

In [None]:
# Collapse the detailed job titles into broader groups.
# The admin label is 'admin.' (with the trailing dot, as in the job list used earlier);
# the previous 'amin.' spelling matched nothing, so admin jobs were never merged
# into 'white-collar'.
bank_data['job'] = bank_data['job'].replace(['management', 'admin.'], 'white-collar')
bank_data['job'] = bank_data['job'].replace(['services', 'housemaid'], 'pink-collar')
bank_data['job'] = bank_data['job'].replace(['retired', 'student', 'unemployed', 'unknown'], 'other')

In [None]:
# Frequency of each job category after grouping
bank_data["job"].value_counts()

-------poutcome--------------

In [None]:
# Show the outcome of the previous campaign for every customer
print(bank_data["poutcome"])

In [None]:
# Frequency of each previous-campaign outcome
bank_data["poutcome"].value_counts()

In [None]:
# Merge 'other' into 'unknown', since 'other' does not really match either 'success' or 'failure'

In [None]:
# Relabel the 'other' outcome as 'unknown'
bank_data['poutcome'] = bank_data['poutcome'].replace('other', 'unknown')

In [None]:
# Frequency of each previous-campaign outcome after merging 'other' into 'unknown'
bank_data["poutcome"].value_counts()

-----------contact---------------------------

In [None]:
# Drop 'contact', as every participant has been contacted. 

In [None]:
# Remove the 'contact' column in place
bank_data.drop(columns=['contact'], inplace=True)

----------------default------------------

In [None]:
# values for "default" : yes/no

In [None]:
# Encode the yes/no 'default' flag as 1/0 in a new column, then drop the original.
# (The bare `bank_data.default` expression that used to open this cell displayed
# nothing, because it was not the cell's last expression; it has been removed.)
bank_data['default_new'] = bank_data['default'].map({'yes': 1, 'no': 0})
bank_data.drop('default', axis=1, inplace=True)
print(bank_data.default_new)

------housing-----------

In [None]:
# The "housing" column holds yes/no values
bank_data["housing"]

In [None]:
# Encode the yes/no 'housing' flag as 1/0 in a new column, then drop the original.
# (The bare `bank_data.housing` expression that used to open this cell displayed
# nothing, because it was not the cell's last expression; it has been removed.)
bank_data['housing_new'] = bank_data['housing'].map({'yes': 1, 'no': 0})
bank_data.drop('housing', axis=1, inplace=True)
print(bank_data.housing_new)

---------------loan-------------

In [None]:
# The "loan" column holds yes/no values
bank_data["loan"]

In [None]:
# Encode the yes/no 'loan' flag as 1/0 in a new column, then drop the original
yes_no_to_int = {'yes': 1, 'no': 0}
bank_data['loan_new'] = bank_data['loan'].map(yes_no_to_int)
bank_data.drop(columns=['loan'], inplace=True)
print(bank_data['loan_new'])

-----------------month,day-------------

In [None]:
# day  : last contact day of the month
# month: last contact month of year
# Drop 'month' and 'day' as they don't have any intrinsic meaning

In [None]:
# Remove the last-contact 'month' and 'day' columns in place
bank_data.drop(columns=['month', 'day'], inplace=True)

-------------deposit---------------

In [None]:
# The "deposit" column holds yes/no values
bank_data["deposit"]

In [None]:
# Encode the yes/no 'deposit' target as 1/0 in a new column, then drop the original
deposit_codes = {'yes': 1, 'no': 0}
bank_data['deposit_new'] = bank_data['deposit'].map(deposit_codes)
bank_data.drop(columns=['deposit'], inplace=True)
bank_data['deposit_new']

-------------------pdays-------------

In [None]:
# pdays: number of days that passed by after the client was last contacted from a previous campaign
#       -1 means client was not previously contacted

In [None]:
# Days since the client was last contacted in a previous campaign
bank_data["pdays"]

In [None]:
# Rows with pdays == -1 are customers without previous contact
never_contacted = bank_data[bank_data['pdays'] == -1]
print('Customers that have not been contacted before: ', len(never_contacted))
print('Maximum value on pdays: ', bank_data['pdays'].max())

In [None]:
# Map pdays=-1 to a large value (10000 is used) to indicate that it is so far in the past that it has no effect

In [None]:
# Swap the -1 sentinel ("never contacted") for the large placeholder value 10000
bank_data['pdays'] = bank_data['pdays'].replace(-1, 10000)

In [None]:
# Create a new column: recent_pdays, the reciprocal of pdays.
# The former np.where(...) call had identical "true" and "false" branches, so it was
# just a plain division; a pdays value of 0 yields inf exactly as it did before.
bank_data['recent_pdays'] = 1 / bank_data['pdays']

# Drop pdays now that recent_pdays replaces it
bank_data.drop('pdays', axis=1, inplace=True)

In [None]:
# Inspect the last rows of the transformed frame
bank_data.tail()

--------------convert to dummy values------------------

In [None]:
# Convert categorical variables to dummies

In [None]:
# One-hot encode the remaining categorical columns and preview the result
categorical_columns = ['job', 'marital', 'education', 'poutcome']
bank_dummies = pd.get_dummies(bank_data, columns=categorical_columns)
bank_dummies.head()

In [None]:
# One-hot encode the categorical columns, naming each dummy column with an explicit prefix
bank_dummies = pd.get_dummies(
    bank_data,
    columns=['job', 'marital', 'education', 'poutcome'],
    prefix=['job', 'marital', 'education', 'poutcome'],
)
bank_dummies.head()

In [None]:
# (rows, columns) of the encoded frame
bank_dummies.shape

In [None]:
# Descriptive statistics of the encoded frame
bank_dummies.describe()

# Observation on whole populations

In [None]:
## Scatterplot showing age and balance


In [None]:
# Scatter plot of account balance against age
bank_dummies.plot.scatter(x='age', y='balance')


# Across all ages, majority of people have savings of less than 20000.

In [None]:
# Histogram of call duration.
# The former `x='poutcome_success'` argument was dropped: for kind='hist' pandas only
# uses it to set the index, so it never affected the plot and merely suggested a
# relationship that was not being drawn.
bank_dummies.plot(kind='hist', y='duration')

# Analysis on people who sign up for a term deposit

In [None]:
# People who signed up for a term deposit

In [None]:
# Summary statistics for customers who signed up for a term deposit.
# The mask is built from bank_dummies itself, so the filter and the filtered frame
# always share the same index (previously the mask came from bank_data).
bank_dummies[bank_dummies.deposit_new == 1].describe()

In [None]:
# People who signed up for a term deposit while having a personal loan (loan_new) and a housing loan (housing_new)

In [None]:
# Count customers who signed up for a term deposit while holding both a personal loan
# and a housing loan. Every flag is compared explicitly so the row mask is built from
# booleans only, instead of AND-ing a boolean Series with raw 0/1 integer columns.
signed_up = bank_dummies.deposit_new == 1
has_personal_loan = bank_dummies.loan_new == 1
has_housing_loan = bank_dummies.housing_new == 1
len(bank_dummies[signed_up & has_personal_loan & has_housing_loan])


In [None]:
# People who signed up for a term deposit and have a credit default

In [None]:
# Count customers who signed up for a term deposit and have a credit default
signed_up_mask = bank_dummies['deposit_new'] == 1
in_default_mask = bank_dummies['default_new'] == 1
len(bank_dummies.loc[signed_up_mask & in_default_mask])



In [None]:
# Bar chart of job vs. deposit

In [None]:
# Mean of deposit_new for each job group
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bank_data, x='job', y='deposit_new', ax=ax)

In [None]:
# Bar chart of "previous outcome" Vs "call duration"

In [None]:
# Mean call duration for each previous-campaign outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bank_data, x='poutcome', y='duration', ax=ax)


# Classification

In [None]:
#make a copy

In [None]:
# Take an actual copy: plain assignment only creates a second name for the same
# object, so any later change to bankc1 would also change bank_dummies.
bankc1 = bank_dummies.copy()

In [None]:
#the correlation matrix

In [None]:
# Pairwise correlation between all columns of the modelling frame
cr = bankc1.corr()
display(cr) if False else None
cr

In [None]:
#Heatmap

In [None]:
# Heatmap of the correlation matrix with a diverging palette centred on zero
plt.figure(figsize=(10, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
tick_labels = cr.columns.values
sns.heatmap(
    cr,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .82},
)

In [None]:
# Extract the deposit_new column (the dependent variable)

In [None]:
# Correlation of every feature with the target (the target's own entry is removed),
# listed from the most positive to the most negative
target_correlations = cr['deposit_new'].drop('deposit_new')
corr_deposite = target_correlations.to_frame()
corr_deposite.sort_values(by='deposit_new', ascending=False)

# Build the Model

In [None]:
# Train-Test split: 20% test data

In [None]:
# Separate the predictors from the target column.
# `axis` is passed by keyword: the positional form drop(label, 1) is rejected by
# pandas 2.x, where every argument after `labels` is keyword-only.
data_drop_deposit = bankc1.drop('deposit_new', axis=1)
label = bankc1.deposit_new
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
data_train, data_test, label_train, label_test = train_test_split(
    data_drop_deposit, label, test_size=0.2, random_state=50
)

In [None]:
# Decision tree with depth = 2

In [None]:
# Tree limited to depth 2: fit on the training split, then score both splits
dt2 = tree.DecisionTreeClassifier(max_depth=2, random_state=1)
dt2.fit(data_train, label_train)
dt2_score_train = dt2.score(data_train, label_train)
dt2_score_test = dt2.score(data_test, label_test)
print('Training Score: ', dt2_score_train)
print('Testing Score: ', dt2_score_test)

In [None]:
# Decision tree with depth = 3

In [None]:
# Tree limited to depth 3: fit on the training split, then score both splits
dt3 = tree.DecisionTreeClassifier(max_depth=3, random_state=1)
dt3.fit(data_train, label_train)
dt3_score_train = dt3.score(data_train, label_train)
dt3_score_test = dt3.score(data_test, label_test)
print('Training Score: ', dt3_score_train)
print('Testing Score: ', dt3_score_test)

In [None]:
# Decision tree with depth = 4


In [None]:
# Tree limited to depth 4: fit on the training split, then score both splits
dt4 = tree.DecisionTreeClassifier(max_depth=4, random_state=1)
dt4.fit(data_train, label_train)
dt4_score_train = dt4.score(data_train, label_train)
dt4_score_test = dt4.score(data_test, label_test)
print("Training score: ", dt4_score_train)
print("Testing score: ", dt4_score_test)

In [None]:
# Decision tree with depth = 6

In [None]:
# Tree limited to depth 6: fit on the training split, then score both splits
dt6 = tree.DecisionTreeClassifier(max_depth=6, random_state=1)
dt6.fit(data_train, label_train)
dt6_score_train = dt6.score(data_train, label_train)
dt6_score_test = dt6.score(data_test, label_test)
print("Training score: ", dt6_score_train)
print("Testing score: ", dt6_score_test)

In [None]:
# Decision tree: To the full depth

In [None]:

# Unrestricted tree (no max_depth): fit on the training split, then score both splits.
# random_state is fixed, as for the depth-limited trees, so that the scores reported
# in the comparison table are reproducible from run to run.
dt1 = tree.DecisionTreeClassifier(random_state=1)
dt1.fit(data_train, label_train)
dt1_score_train = dt1.score(data_train, label_train)
print("Training score: ", dt1_score_train)
dt1_score_test = dt1.score(data_test, label_test)
print("Testing score: ", dt1_score_test)

# Compare Training and Testing scores for various tree depths used

In [None]:
# Side-by-side table of training and testing accuracy for each tree depth
header_format = '{:10} {:20} {:20}'
row_format = '{:1} {:>25} {:>20}'

print(header_format.format('depth', 'Training Score', 'Testing Score'))
print(header_format.format('------', '-------------', '--------------'))

depth_scores = [
    (2, dt2_score_train, dt2_score_test),
    (3, dt3_score_train, dt3_score_test),
    (4, dt4_score_train, dt4_score_test),
    (6, dt6_score_train, dt6_score_test),
]
for depth, train_score, test_score in depth_scores:
    print(row_format.format(depth, train_score, test_score))

# The "max" label is wider than a single digit, hence the narrower second column
print('{:1} {:>23} {:>20}'.format("max", dt1_score_train, dt1_score_test))


In [None]:
# Let's generate the decision tree for depth = 2
# Create a feature vector

In [None]:
# Feature names for the tree export: every column except the target 'deposit_new',
# so the list lines up with the columns the classifiers were trained on
# (previously the target column was included as well).
features = [column for column in bankc1.columns if column != 'deposit_new']

# Uncomment below to generate the digraph Tree.
#tree.export_graphviz(dt2, out_file='tree_depth_2.dot', feature_names=features)

In [None]:
# Class labels seen by the depth-2 tree during fitting.
# Two classes: 0 = not signed up,  1 = signed up
dt2.classes_

In [None]:
# Names of the predictor columns (target excluded), in column order
features = list(data_drop_deposit.columns)

features

In [None]:
# Investigate most important features with depth =2

In [None]:
# Refit the depth-2 tree and list the importance of each feature
dt2 = tree.DecisionTreeClassifier(random_state=1, max_depth=2)

# Fit the decision tree classifier
dt2.fit(data_train, label_train)

fi = dt2.feature_importances_

# Pair each feature name with its importance directly instead of indexing both
# sequences by position (the unused length variable was removed).
for feature_name, importance in zip(features, fi):
    print('{:.<20} {:3}'.format(feature_name, importance))

# Predictions

In [None]:
# According to the feature importance results, the most important feature is "duration"
# Let's calculate statistics on duration

In [None]:
# Basic statistics of the call duration feature
duration = data_drop_deposit['duration']
print("Mean duration   : ", duration.mean())
print("Maximun duration: ", duration.max())
print("Minimum duration: ", duration.min())

In [None]:
# Display a single feature row, selected by position (row 985).
# NOTE(review): presumably picked as an example with poutcome_success = 1; the position
# is hard-coded, so confirm it still points at such a row if the data changes.
data_drop_deposit.iloc[985]

In [None]:
# Hard class predictions of the depth-2 tree on the test set, and their accuracy
preds = dt2.predict(data_test)
accuracy = metrics.accuracy_score(label_test, preds)
print(f"\nAccuracy score: \n{accuracy}")

# Predicted probability of class 1 (second column of predict_proba), and the AUC
probs = dt2.predict_proba(data_test)[:, 1]
auc = metrics.roc_auc_score(label_test, probs)
print(f"\nArea Under Curve: \n{auc}")