### Bank Marketing Data - A Decision Tree Approach
### Aim:
The aim of this notebook is to predict whether a client will subscribe (yes/no) to a term deposit by building a classification model with a decision tree.


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score

In [None]:
# Load the dataset from 'bank.csv' (resolved relative to the working directory).
df=pd.read_csv('bank.csv')

In [None]:
# Show the first rows of the frame.
df.head()

In [None]:
# Show the last rows of the frame.
df.tail()

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Class balance of the target column.
df['deposit'].value_counts()

In [None]:
# Inspect the storage dtype of the education column.
df['education'].dtype

### Numerical variables

### 1. Age

In [None]:
# Distribution of age using 10 histogram bins.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot.
sns.distplot(df.age,bins=10)

In [None]:
from scipy.stats import kurtosis, skew

# Shape statistics of the age column: kurtosis (K) and skewness (s).
K, s = kurtosis(df['age']), skew(df['age'])
print('k:', K)
print('s:', s)

In [None]:
# Replace age with its natural logarithm (vectorised), then re-plot the
# distribution of the transformed column.
df['age'] = np.log(df['age'])

sns.distplot(df['age'])

In [None]:
# Box plot of the age column to expose outliers.
# The Series is passed explicitly as `x=` because seaborn's handling of a
# bare positional argument has changed across releases; the keyword form
# gives the same horizontal plot everywhere.
sns.boxplot(x=df.age)

### 2. Balance

In [None]:
# Distribution of the raw balance column.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot.
sns.distplot(df.balance)

In [None]:
# Box plot of balance to expose outliers.
# The Series is passed explicitly as `x=` because seaborn's handling of a
# bare positional argument has changed across releases.
sns.boxplot(x=df.balance)

In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) for balance.
df['balance'].describe()

In [None]:
# Shape statistics of the balance column: kurtosis (K) and skewness (s).
K, s = kurtosis(df['balance']), skew(df['balance'])
print('k:', K)
print('s:', s)

In [None]:
# Replace balance with its cube root (defined for negative values as well),
# then re-plot the distribution of the transformed column.
df['balance'] = np.cbrt(df['balance'])

sns.distplot(df['balance'])

### 3. day

In [None]:
# Distribution of the day column.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot.
sns.distplot(df.day)

In [None]:
# Box plot of the day column.
# The Series is passed explicitly as `x=` because seaborn's handling of a
# bare positional argument has changed across releases.
sns.boxplot(x=df.day)

### 4. pdays

In [None]:
# Distribution of the raw pdays column using 10 histogram bins.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot.
sns.distplot(df.pdays,bins=10)

In [None]:
# Compress the long right tail of pdays with a logarithmic transform.
# np.log is undefined for values <= 0 (NaN for negatives, -inf for zero), so
# negative entries are clipped to 0 and log1p (log(1 + x)) is applied instead.
# NOTE(review): pdays presumably uses -1 as a "not previously contacted"
# sentinel in this dataset — confirm; such rows become 0.0 here.
df['pdays'] = np.log1p(df['pdays'].clip(lower=0))

sns.distplot(df['pdays'])

In [None]:
# Box plot of the pdays column.
# The Series is passed explicitly as `x=` because seaborn's handling of a
# bare positional argument has changed across releases.
sns.boxplot(x=df.pdays)

### 5. duration

In [None]:
# Distribution of the raw duration column using 10 histogram bins.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot.
sns.distplot(df.duration,bins=10)

In [None]:
# Replace duration with its natural logarithm (vectorised), then re-plot.
# NOTE(review): np.log gives -inf for a duration of 0 — confirm none exist.
df['duration'] = np.log(df['duration'])

sns.distplot(df['duration'])

In [None]:
# Box plot of the duration column.
# The Series is passed explicitly as `x=` because seaborn's handling of a
# bare positional argument has changed across releases.
sns.boxplot(x=df.duration)

### 6. campaign

In [None]:
# Distribution of the raw campaign column using 5 histogram bins.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot.
sns.distplot(df.campaign,bins=5)

In [None]:
# Replace campaign with its natural logarithm (vectorised), then re-plot.
# NOTE(review): np.log gives -inf for a value of 0 — confirm none exist.
df['campaign'] = np.log(df['campaign'])

sns.distplot(df['campaign'])

In [None]:
# Box plot of the campaign column.
# The Series is passed explicitly as `x=` because seaborn's handling of a
# bare positional argument has changed across releases.
sns.boxplot(x=df.campaign)

### 7. previous

In [None]:
# Distribution of the raw previous column.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot.
sns.distplot(df.previous)

In [None]:
# Replace previous with its cube root (well-defined at zero), then re-plot
# the distribution of the transformed column.
df['previous'] = np.cbrt(df['previous'])

sns.distplot(df['previous'])

In [None]:
# Box plot of the previous column.
# The Series is passed explicitly as `x=` because seaborn's handling of a
# bare positional argument has changed across releases.
sns.boxplot(x=df.previous)

### encoding categorical variables

In [None]:
# Re-list the column labels before encoding the categorical ones.
df.columns

In [None]:
# Frequency of each poutcome category.
df['poutcome'].value_counts()

In [None]:
# Plot how many rows fall in each month (pandas' default plot kind).
month_counts = df['month'].value_counts()
month_counts.plot()

In [None]:
# Target class counts before encoding.
df['deposit'].value_counts()

In [None]:
# Encode the target as an integer: 'yes' -> 1, 'no' -> 0.
# Labels missing from the mapping become NaN.
depositmapping = {
    'yes': 1,
    'no': 0,
}
df['deposit'] = df['deposit'].map(depositmapping)

In [None]:
# Target class counts after encoding.
df['deposit'].value_counts()

### Job vs deposit

In [None]:
# Mean of deposit per job category, lowest first.
df.groupby('job')[['deposit']].mean().sort_values(by='deposit')

In [None]:
# Collapse the listed job labels into a single 'rare' bucket; every other
# label is left as-is.
# Matching is exact (no regex): with regex=True the labels would be treated
# as patterns searched inside each value, and the '.' in 'admin.' would act
# as a wildcard.
rare_jobs = ['management', 'technician', 'unknown', 'admin.', 'housemaid',
             'self-employed', 'services', 'blue-collar', 'entrepreneur']
df['job'] = df['job'].replace(rare_jobs, 'rare')

In [None]:
# Integer codes for the job groups; labels missing from the mapping
# become NaN.
jobmapping = {
    'student': 3,
    'retired': 2,
    'unemployed': 1,
    'rare': 0,
}
df['job'] = df['job'].map(jobmapping)

In [None]:
# Frequency of each encoded job code.
df.job.value_counts()

In [None]:
# Column labels, for reference while encoding the remaining categoricals.
df.columns

In [None]:
# Mean of deposit per marital status, lowest first.
df.groupby('marital')[['deposit']].mean().sort_values(by='deposit')

In [None]:
# Integer codes for marital status; labels missing from the mapping
# become NaN.
statusmapping = {
    'married': 1,
    'divorced': 2,
    'single': 3,
}
df['marital'] = df['marital'].map(statusmapping)


In [None]:
# Display the encoded marital column.
df['marital']

In [None]:
# Mean of deposit per education level, lowest first.
df.groupby('education')[['deposit']].mean().sort_values(by='deposit')

In [None]:
# Integer codes for education level; labels missing from the mapping
# become NaN.
educationmapping = {
    'primary': 1,
    'secondary': 2,
    'unknown': 3,
    'tertiary': 4,
}
df['education'] = df['education'].map(educationmapping)

In [None]:
# Display the encoded education column.
df['education']

In [None]:
# Display the encoded target column.
df['deposit']

In [None]:
# Mean of deposit per default flag, lowest first.
df.groupby('default')[['deposit']].mean().sort_values(by='deposit')

In [None]:
# Column labels, for reference.
df.columns

In [None]:
# Integer codes for the default flag: 'no' -> 1, 'yes' -> 2.
# Labels missing from the mapping become NaN.
defaultmapping = {
    'no': 1,
    'yes': 2,
}
df['default'] = df['default'].map(defaultmapping)

In [None]:
# Frequency of each loan category.
df['loan'].value_counts()

In [None]:
# Mean of deposit per loan flag, lowest first.
df.groupby('loan')[['deposit']].mean().sort_values(by='deposit')

In [None]:
# Integer codes for the loan flag: 'no' -> 1, 'yes' -> 2.
# Labels missing from the mapping become NaN.
loanmapping = {
    'no': 1,
    'yes': 2,
}
df['loan'] = df['loan'].map(loanmapping)

In [None]:
# Mean of deposit per contact type, lowest first.
df.groupby('contact')[['deposit']].mean().sort_values(by='deposit')

In [None]:
# Integer codes for contact type; labels missing from the mapping
# become NaN.
contactmapping = {
    'unknown': 1,
    'telephone': 2,
    'cellular': 3,
}
df['contact'] = df['contact'].map(contactmapping)

In [None]:
# Frequency of each encoded contact code.
df.contact.value_counts()

In [None]:
# Mean of deposit per previous-campaign outcome, lowest first.
df.groupby('poutcome')[['deposit']].mean().sort_values(by='deposit')

In [None]:
# Integer codes for poutcome; labels missing from the mapping become NaN.
poutcomemap = {
    'unknown': 1,
    'failure': 2,
    'other': 3,
    'success': 4,
}
df['poutcome'] = df['poutcome'].map(poutcomemap)

In [None]:
# Check the first rows after the encodings applied so far.
df.head()

In [None]:
# Mean of deposit per month, lowest first.
df.groupby('month')[['deposit']].mean().sort_values(by='deposit')

In [None]:
# Bucket the twelve month labels into three integer codes (2, 1, 0).
# NOTE(review): the grouping presumably follows the per-month deposit rates
# computed in the previous cell — confirm.
# A single exact-match dict lookup replaces three regex substring
# replacements, so the column never holds a mix of strings and integers and
# the encoding follows the same .map pattern as the other categorical columns.
# Labels missing from the mapping become NaN.
monthmapping = {
    'mar': 2, 'dec': 2, 'sep': 2, 'oct': 2,
    'apr': 1, 'feb': 1, 'aug': 1, 'jun': 1,
    'nov': 0, 'jul': 0, 'jan': 0, 'may': 0,
}
df['month'] = df['month'].map(monthmapping)

### feature selection

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt


# Candidate predictors whose importances will be ranked.
X = df[['age', 'job', 'marital', 'education', 'default', 'balance',
        'loan', 'contact', 'month', 'duration', 'campaign',
        'poutcome']]  # independent columns
y = df['deposit']     # target column: 1 = subscribed ('yes'), 0 = not ('no')

# A fixed seed keeps the importance ranking reproducible between runs,
# matching the random_state=0 used for the decision tree later on.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, y)

In [None]:
# Print the raw importance scores, one per column of X in the same order.
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers

In [None]:
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')
plt.show()

In [None]:
# 70/30 train/test split.
# Rows are shuffled (with a fixed seed, for reproducibility) before slicing:
# cutting the frame in file order would make both sets depend on how the CSV
# happens to be sorted, e.g. a file grouped by outcome would leave the test
# set with a different class mix from the training set.
nrows = len(df.index)
percentage = round((nrows*70)/100)  # number of rows in the training set
shuffled = df.sample(frac=1, random_state=0)
trainingData = shuffled.iloc[:percentage, :]
testData = shuffled.iloc[percentage:, :]

print("Number of training data examples "+str(len(trainingData.index)))
print("Number of test examples "+str(len(testData.index)))


In [None]:
# Columns used as model inputs; the same list is applied to both splits so
# the training and test matrices always line up.
featureColumns = ['age','loan','month','poutcome','balance','campaign','contact','duration','education']

train_x = trainingData[featureColumns]
train_y = trainingData["deposit"]

test_x = testData[featureColumns]
test_y = testData["deposit"]

# Preview the training features.
train_x.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Depth-3 decision tree split on Gini impurity, seeded for reproducibility.
clf_gini = DecisionTreeClassifier(
    random_state=0,
    max_depth=3,
    criterion='gini',
)

# Train on the training split.
clf_gini.fit(train_x, train_y)

In [None]:
# Predict the target for the held-out test features.
y_pred_gini = clf_gini.predict(test_x)

In [None]:
from sklearn import tree

# Draw the fitted tree. Feature and class names are supplied so nodes read
# as column names and 'no'/'yes' rather than positional indices; class order
# follows the 0/1 encoding of deposit ('no' -> 0, 'yes' -> 1).
plt.figure(figsize=(15,10))
tree.plot_tree(
    clf_gini,
    filled=True,
    feature_names=list(train_x.columns),
    class_names=['no', 'yes'],
)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the decision tree on the test split, to four decimals.
gini_test_accuracy = accuracy_score(test_y, y_pred_gini)
message = 'Model accuracy score with criterion gini index: {0:0.4f}'
print(message.format(gini_test_accuracy))

In [None]:
# Predict on the training features themselves, to compare training accuracy
# with test accuracy.
y_pred_train_gini = clf_gini.predict(train_x)

# Display the predicted labels.
y_pred_train_gini

In [None]:
# Accuracy of the decision tree on the training split, to four decimals.
gini_train_accuracy = accuracy_score(train_y, y_pred_train_gini)
message = 'Training-set accuracy score: {0:0.4f}'
print(message.format(gini_train_accuracy))

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Per-class precision, recall and F1 for the decision tree on the test split.
print(classification_report(test_y,y_pred_gini))

In [None]:
# Confusion matrix for the decision tree; accuracy is the share of
# predictions on the main diagonal (both classes predicted correctly).
cm = confusion_matrix(test_y, y_pred_gini)
print(cm)
correct_predictions = cm[0, 0] + cm[1, 1]
total_predictions = cm.sum()
print("Accuracy of prediction:", round(correct_predictions / total_predictions, 3))

### Training the Random Forest model
Now it's time to train our model!

Create an instance of the RandomForestClassifier class and fit it to our training data from the previous step.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 300 trees. A fixed seed makes the fitted model, and therefore
# the reported metrics, reproducible between runs, matching the
# random_state=0 used for the decision tree.
rfc = RandomForestClassifier(n_estimators=300, random_state=0)
rfc.fit(train_x, train_y)


### Predictions and Evaluation
Let's predict on the test features (`test_x`) and evaluate the model against the true labels (`test_y`).


In [None]:
# Predict the target for the held-out test features with the random forest.
rfc_pred = rfc.predict(test_x)



In [None]:
# Per-class precision, recall and F1 for the random forest on the test split.
print(classification_report(test_y,rfc_pred))

In [None]:
# Confusion matrix for the random forest; accuracy is the share of
# predictions on the main diagonal (both classes predicted correctly).
cm = confusion_matrix(test_y, rfc_pred)
print(cm)
correct_predictions = cm[0, 0] + cm[1, 1]
total_predictions = cm.sum()
print("Accuracy of prediction:", round(correct_predictions / total_predictions, 3))