Bank Marketing Data : A Decision Tree Approach to predict if a customer will subscribe to a particular bank scheme

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import datasets
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))


In [None]:
# Read the bank marketing CSV into a DataFrame and preview its first rows
bank = pd.read_csv('../input/bank.csv')
bank.head()

## Summary of data

### Categorical Variables :
**[1] job      :** admin,technician, services, management, retired, blue-collar, unemployed, entrepreneur,
               housemaid, unknown, self-employed, student
<br>**[2] marital  :** married, single, divorced
<br>**[3] education:** secondary, tertiary, primary, unknown
<br>**[4] default  :** yes, no
<br>**[5] housing  :** yes, no
<br>**[6] loan     :** yes, no 
<br>**[7] deposit  :** yes, no **(Dependent Variable)**
<br>**[8] contact  :** unknown, cellular, telephone
<br>**[9] month    :** jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec
<br>**[10] poutcome:** unknown, other, failure, success

### Numerical Variables:
**[1] age**
<br>**[2] balance**
<br>**[3] day**
<br>**[4] duration**
<br>**[5] campaign**
<br>**[6] pdays**
<br>**[7] previous**

## Null

In [None]:
# Per-column count of missing values (the author's run found none)
bank.isna().sum()


In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns
bank.describe()

### Age

In [None]:
# Boxplot for 'age' (shows the spread of customer ages and any outliers)
g = sns.boxplot(x=bank["age"])

In [None]:
# Distribution of Age, drawn with 100 bins
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; consider
# sns.histplot(bank.age, bins=100, kde=True, stat="density") when upgrading.
sns.distplot(bank.age, bins=100)

### Duration

In [None]:
# Boxplot for 'duration' (shows the spread of call durations and any outliers)
g = sns.boxplot(x=bank["duration"])

In [None]:
# Distribution of call duration, drawn with 100 bins
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; consider
# sns.histplot(bank.duration, bins=100, kde=True, stat="density") when upgrading.
sns.distplot(bank.duration, bins=100)

In [None]:
# Independent snapshot of the frame, taken before `bank` is recoded in place below
bank_data = bank.copy()

In [None]:
# For every job category, print how many customers made a deposit
job = bank['job'].unique().tolist()
depositors = bank[bank['deposit'] == 'yes']

for job_name in job:
    n_depositors = len(depositors[depositors['job'] == job_name])
    print("{:15} : {:5}".format(job_name, n_depositors))

In [None]:
# Different types of job categories and their counts
# (read from bank_data, the untouched copy of the loaded frame)
bank_data.job.value_counts()

In [None]:
# Combine similar jobs into broader categories
job_groups = {
    'management': 'white-collar',
    'admin.': 'white-collar',
    'housemaid': 'pink-collar',
    'services': 'pink-collar',
    'retired': 'other',
    'student': 'other',
    'unemployed': 'other',
    'unknown': 'other',
}
bank['job'] = bank['job'].replace(job_groups)

In [None]:
# New value counts of 'job' after the categories were merged
bank.job.value_counts()

In [None]:
# Counts of each previous-campaign outcome
bank.poutcome.value_counts()

In [None]:
# Fold the 'other' outcome into 'unknown', then re-check the counts
bank['poutcome'] = bank['poutcome'].replace('other','unknown')
bank.poutcome.value_counts()

In [None]:
# Remove the 'contact' column: per the author, everyone had been contacted,
# so it is not kept as a feature
bank.drop(columns='contact', inplace=True)

In [None]:
# Encode "default" (yes/no) as 1/0 in default_cat, then remove the text column
bank['default'].value_counts()  # result is discarded: not the cell's last expression
yes_no_codes = {'yes': 1, 'no': 0}
bank['default_cat'] = bank['default'].map(yes_no_codes)
bank.drop(columns='default', inplace=True)

In [None]:
# Encode 'housing' (yes/no) as 1/0 in housing_cat, then remove the text column
yes_no_codes = {'yes': 1, 'no': 0}
bank['housing_cat'] = bank['housing'].map(yes_no_codes)
bank.drop(columns='housing', inplace=True)

In [None]:
# Encode 'loan' (yes/no) as 1/0 in loan_cat, then remove the text column
yes_no_codes = {'yes': 1, 'no': 0}
bank['loan_cat'] = bank['loan'].map(yes_no_codes)
bank.drop(columns='loan', inplace=True)

In [None]:
# day  : last contact day of the month
# month: last contact month of year
# Both are removed; the author considers them to carry no intrinsic meaning
bank.drop(columns=['day', 'month'], inplace=True)

In [None]:
# Encode the target 'deposit' (yes/no) as 1/0 in deposit_cat, then remove the text column
yes_no_codes = {'yes': 1, 'no': 0}
bank['deposit_cat'] = bank['deposit'].map(yes_no_codes)
bank.drop(columns='deposit', inplace=True)

In [None]:
# Preview the first two rows after the yes/no columns were recoded
bank.head(2)

In [None]:
# pdays: number of days since the client was last contacted in a previous campaign
# A value of -1 means the client was not contacted previously
never_contacted = bank[bank['pdays'] == -1]
print("Number of Customers who were not contacted as part of any previous campaign:  ",len(never_contacted))
largest_pdays = bank['pdays'].max()
print("Maximum values on pdays: ",largest_pdays)

In [None]:
# Swap the -1 "never contacted" marker for 10000, a value large enough that
# the reciprocal 1/pdays computed later is close to zero
bank['pdays'] = bank['pdays'].replace(-1, 10000)

In [None]:
# Keep an independent snapshot of the frame at this stage. A plain
# `bank1 = bank` would only create a second name for the same object, so the
# in-place edits that follow would silently change bank1 as well.
bank1 = bank.copy()

In [None]:
# Create recent_pdays as the reciprocal of pdays: recent contacts (small pdays)
# get large values, and the 10000 "never contacted" sentinel maps to ~0.
# The previous np.where(cond, 1/pdays, 1/pdays) had identical branches, so it
# is equivalent to this plain element-wise division.
# NOTE(review): a pdays value of 0 would yield inf here - confirm none occur.
bank['recent_pdays'] = 1 / bank['pdays']
bank.drop('pdays',axis=1,inplace = True)

In [None]:
# Inspect the last rows, including the new recent_pdays column
bank.tail()

In [None]:
# One-hot encode the remaining text columns; each dummy column is prefixed
# with the name of the column it came from
categorical_cols = ['job','marital','education','poutcome']
bank_with_dummies = pd.get_dummies(
    data=bank,
    columns=categorical_cols,
    prefix=categorical_cols,
)

In [None]:
# Preview the one-hot encoded frame
bank_with_dummies.head(3)

In [None]:
# (rows, columns) of the encoded frame
bank_with_dummies.shape

In [None]:
# Summary statistics of the encoded frame
bank_with_dummies.describe()

In [None]:
# Observations on the whole population
# Scatter plot of account balance (y) against age (x)
sns.scatterplot(data = bank_with_dummies,x ='age',y = 'balance')

In [None]:
# poutcome vs duration
# NOTE(review): with kind='hist' this presumably draws only the distribution of
# 'duration'; confirm whether x='poutcome_success' has any effect on the plot.
bank_with_dummies.plot(x='poutcome_success',y='duration',kind='hist')

## Analysis of people who signed up for a term deposit

In [None]:
# Summary statistics for the people who signed up for a term deposit.
# The boolean mask is built from bank_with_dummies itself (not from `bank`),
# so the filter cannot break if the two frames' indexes ever diverge.
bank_with_dummies[bank_with_dummies.deposit_cat == 1].describe()

In [None]:
# Number of people who signed up for a term deposit while holding both
# a personal loan and a housing loan
signed_up = bank_with_dummies['deposit_cat'] == 1
has_personal_loan = bank_with_dummies['loan_cat'] == 1
has_housing_loan = bank_with_dummies['housing_cat'] == 1
len(bank_with_dummies[signed_up & has_personal_loan & has_housing_loan])

In [None]:
# Number of people who signed up for a term deposit and have a credit default
signed_up = bank_with_dummies['deposit_cat'] == 1
in_default = bank_with_dummies['default_cat'] == 1
len(bank_with_dummies[signed_up & in_default])

In [None]:
# Number of people with white-collar jobs who opted for a term deposit
signed_up = bank_with_dummies['deposit_cat'] == 1
white_collar = bank_with_dummies['job_white-collar'] == 1
len(bank_with_dummies[signed_up & white_collar])

In [None]:
# Number of people with white-collar jobs who did NOT opt for a term deposit
len(bank_with_dummies[(bank_with_dummies['deposit_cat'] == 0)&(bank_with_dummies['job_white-collar'] == 1)])

In [None]:
# Row counts per merged job category
bank['job'].value_counts()

In [None]:
# Bar chart: mean of deposit_cat (the deposit rate) for technicians (1)
# versus all other jobs (0)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bank_with_dummies, x='job_technician', y='deposit_cat', ax=ax)

In [None]:
# Bar chart: mean of deposit_cat (the deposit rate) for the merged 'other'
# job group (1) versus all remaining jobs (0)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bank_with_dummies, x='job_other', y='deposit_cat', ax=ax)

In [None]:
# Bar chart: mean of deposit_cat (the deposit rate) for white-collar jobs (1)
# versus all other jobs (0)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bank_with_dummies, x='job_white-collar', y='deposit_cat', ax=ax)

In [None]:
# Bar chart: mean of deposit_cat (the deposit rate) for every job category
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bank, x='job', y='deposit_cat', ax=ax)

In [None]:
# TODO: find out how to use seaborn to show the data for the FALSE conditions as well
# Bar chart: mean call duration for each previous-campaign outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bank, x='poutcome', y='duration', ax=ax)

## Classification

In [None]:
# Make a real copy for the classification work. A plain assignment would only
# alias bank_with_dummies, so later changes to one would affect the other.
bankcl = bank_with_dummies.copy()

In [None]:
# The correlation matrix: pairwise correlations between all columns
corr = bankcl.corr()
corr

In [None]:
# Heatmap of the correlation matrix on a diverging palette centred at 0
plt.figure(figsize=(10, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# TODO: take a deeper look at diverging_palette
tick_labels = corr.columns.values
sns.heatmap(
    corr,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .82},
)
plt.title('Heatmap of Correlation Matrix')

In [None]:
# Pull out the deposit_cat column (the dependent variable) of the correlation
# matrix, without its self-correlation entry, as a one-column frame
corr_deposit = corr['deposit_cat'].drop('deposit_cat').to_frame()

In [None]:
# Rank features from most positively to most negatively correlated with deposit_cat
corr_deposit.sort_values(by = 'deposit_cat',ascending = False)

## Build the Data Model

In [None]:
# Train-Test split: 20% test data
# Features are every column except the target. `columns=` is used because
# passing the axis positionally (drop('deposit_cat', 1)) was deprecated and is
# rejected by pandas 2.x.
data_drop_deposite = bankcl.drop(columns='deposit_cat')
label = bankcl.deposit_cat
# random_state fixes the shuffle so the split is reproducible
data_train, data_test, label_train, label_test = train_test_split(data_drop_deposite, label, test_size = 0.2, random_state = 50)

In [None]:
# Decision tree with depth = 2: fit it, then report accuracy on both splits
dt2 = tree.DecisionTreeClassifier(max_depth=2, random_state=1).fit(data_train, label_train)
dt2_score_train = dt2.score(data_train, label_train)
dt2_score_test = dt2.score(data_test, label_test)
print("Training score: ",dt2_score_train)
print("Testing score: ",dt2_score_test)

In [None]:
# Decision tree with depth = 3: fit it, then report accuracy on both splits
dt3 = tree.DecisionTreeClassifier(max_depth=3, random_state=1).fit(data_train, label_train)
dt3_score_train = dt3.score(data_train, label_train)
dt3_score_test = dt3.score(data_test, label_test)
print("Training score: ",dt3_score_train)
print("Testing score: ",dt3_score_test)

In [None]:
# Decision tree with depth = 4: fit it, then report accuracy on both splits
dt4 = tree.DecisionTreeClassifier(max_depth=4, random_state=1).fit(data_train, label_train)
dt4_score_train = dt4.score(data_train, label_train)
dt4_score_test = dt4.score(data_test, label_test)
print("Training score: ",dt4_score_train)
print("Testing score: ",dt4_score_test)

In [None]:
# Decision tree with depth = 6: fit it, then report accuracy on both splits
dt6 = tree.DecisionTreeClassifier(max_depth=6, random_state=1).fit(data_train, label_train)
dt6_score_train = dt6.score(data_train, label_train)
dt6_score_test = dt6.score(data_test, label_test)
print("Training score: ",dt6_score_train)
print("Testing score: ",dt6_score_test)

In [None]:
# Decision tree: To the full depth (no max_depth limit)
# random_state=1 matches the depth-limited trees so that this cell's scores
# are reproducible across runs and comparable with theirs.
dt1 = tree.DecisionTreeClassifier(random_state=1)
dt1.fit(data_train, label_train)
dt1_score_train = dt1.score(data_train, label_train)
print("Training score: ", dt1_score_train)
dt1_score_test = dt1.score(data_test, label_test)
print("Testing score: ", dt1_score_test)

## Comparing Scores

In [None]:
# Side-by-side table of training and testing scores for each tree depth
header_fmt = '{:10} {:20} {:20}'
print(header_fmt.format('depth', 'Training score','Testing score'))
print(header_fmt.format('-----', '--------------','-------------'))

depth_scores = [
    (2, dt2_score_train, dt2_score_test),
    (3, dt3_score_train, dt3_score_test),
    (4, dt4_score_train, dt4_score_test),
    (6, dt6_score_train, dt6_score_test),
]
for depth, train_score, test_score in depth_scores:
    print('{:1} {:>25} {:>20}'.format(depth, train_score, test_score))

# The "max" label is wider than a digit, hence the narrower second column
print('{:1} {:>23} {:>20}'.format("max", dt1_score_train, dt1_score_test))

It can be seen that as the depth increases, the training score rises until the tree fits the training data set perfectly. However, the deeper the tree grows, the more it overfits the training data set, so there is no use in increasing the depth further. Based on the above observations, a tree with a depth of 2 seems the most reasonable, as both its training and testing scores are reasonably high.

In [None]:
# Prepare to draw the depth-2 decision tree:
# collect the predictor column names, in column order
features = list(data_drop_deposite.columns)

In [None]:
# Number of predictor columns
len(features)

In [None]:
# Export the depth-2 tree in Graphviz format; this writes tree_depth_2.dot
# to the working directory, labelling nodes with the feature names.
tree.export_graphviz(dt2, out_file='tree_depth_2.dot', feature_names=features)

In [135]:
#Feature Importance Metrics
dt2 = tree.DecisionTreeClassifier(random_state=1, max_depth=2)
# Fit the decision tree classifier
dt2.fit(data_train, label_train)

# Print every feature next to its importance in the fitted tree.
# Pairing names and values with zip() replaces the index loop and the
# unused length variable.
fi = dt2.feature_importances_
for feature_name, importance in zip(features, fi):
    print('{:.<20} {:3}'.format(feature_name, importance))

age................. 0.0
balance............. 0.0
duration............ 0.849306123902405
campaign............ 0.0
previous............ 0.0
default_cat......... 0.0
housing_cat......... 0.0
loan_cat............ 0.0
recent_pdays........ 0.0
job_blue-collar..... 0.0
job_entrepreneur.... 0.0
job_other........... 0.0
job_pink-collar..... 0.0
job_self-employed... 0.0
job_technician...... 0.0
job_white-collar.... 0.0
marital_divorced.... 0.0
marital_married..... 0.0
marital_single...... 0.0
education_primary... 0.0
education_secondary. 0.0
education_tertiary.. 0.0
education_unknown... 0.0
poutcome_failure.... 0.0
poutcome_success.... 0.15069387609759496
poutcome_unknown.... 0.0


## Predictions

In [136]:
# According to the feature importance results, the most important feature is "duration"
# Report its mean, maximum and minimum
call_duration = data_drop_deposite.duration
print("Mean duration   : ", call_duration.mean())
print("Maximun duration: ", call_duration.max())
print("Minimum duration: ", call_duration.min())

Mean duration   :  371.99381831213043
Maximun duration:  3881
Minimum duration:  2


In [137]:
# Predict for a hypothetical customer: call duration = 371 sec,
# poutcome_success = 1, every other feature set to 0

mean_duration_case = np.array([0, 0, 371, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]).reshape(1, -1)
print(dt2.predict_proba(mean_duration_case))
print(dt2.predict(mean_duration_case))

[[0.48515568 0.51484432]]
[1]


In [138]:
# Predict for a hypothetical customer: maximum call duration = 3881 sec,
# poutcome_success = 1, every other feature set to 0

max_duration_case = np.array([0, 0, 3881, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]).reshape(1, -1)
print(dt2.predict_proba(max_duration_case))
print(dt2.predict(max_duration_case))

[[0.19295499 0.80704501]]
[1]


In [139]:
# All rows with poutcome_success == 1 (previous campaign was a success)
bank_with_dummies[(bank_with_dummies.poutcome_success == 1)]

Unnamed: 0,age,balance,duration,campaign,previous,default_cat,housing_cat,loan_cat,deposit_cat,recent_pdays,job_blue-collar,job_entrepreneur,job_other,job_pink-collar,job_self-employed,job_technician,job_white-collar,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,poutcome_failure,poutcome_success,poutcome_unknown
899,56,589,518,1,2,0,1,0,1,0.006803,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0
951,53,2269,1091,2,1,0,0,0,1,0.006667,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0
985,46,3354,522,1,1,0,1,0,1,0.005747,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0
994,40,3352,639,2,1,0,1,0,1,0.037037,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0
1151,31,1331,182,2,1,0,0,0,1,0.011111,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0
1198,31,12857,158,1,1,0,1,0,1,0.010870,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0
1213,33,700,126,1,1,0,0,0,1,0.011364,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0
1226,40,5060,154,2,1,0,0,0,1,0.010753,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0
1285,30,5561,195,1,1,0,1,0,1,0.010000,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0
1328,44,483,207,2,6,0,0,0,1,0.005025,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0


In [140]:
# Feature values of the row at position 985, one of the poutcome_success rows
data_drop_deposite.iloc[985]

age                      46.000000
balance                3354.000000
duration                522.000000
campaign                  1.000000
previous                  1.000000
default_cat               0.000000
housing_cat               1.000000
loan_cat                  0.000000
recent_pdays              0.005747
job_blue-collar           0.000000
job_entrepreneur          0.000000
job_other                 1.000000
job_pink-collar           0.000000
job_self-employed         0.000000
job_technician            0.000000
job_white-collar          0.000000
marital_divorced          1.000000
marital_married           0.000000
marital_single            0.000000
education_primary         0.000000
education_secondary       1.000000
education_tertiary        0.000000
education_unknown         0.000000
poutcome_failure          0.000000
poutcome_success          1.000000
poutcome_unknown          0.000000
Name: 985, dtype: float64

In [141]:
# Predict: class probabilities for the row at position 985.
# The row is read straight from the feature frame (double brackets keep it a
# one-row DataFrame) instead of a hand-copied, rounded list of its values, so
# the input cannot drift from the data and keeps the training column names.

print(dt2.predict_proba(data_drop_deposite.iloc[[985]]))

[[0.19295499 0.80704501]]


In [142]:
# Hard class predictions on the test set, scored by accuracy
preds = dt2.predict(data_test)
accuracy = metrics.accuracy_score(label_test, preds)
print("\nAccuracy score: \n{}".format(accuracy))

# Positive-class probabilities (column 1 of predict_proba) feed the ROC AUC
probs = dt2.predict_proba(data_test)[:,1]
area_under_curve = metrics.roc_auc_score(label_test, probs)
print("\nArea Under Curve: \n{}".format(area_under_curve))


Accuracy score: 
0.7268248992386923

Area Under Curve: 
0.7880265888143609
