In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

In [2]:
# Load the loan data from the workbook; sheet_name=1 selects a sheet by
# zero-based position, i.e. the second sheet of the file.
dataset = pd.read_excel(
    io="Bank_Personal_Loan_Modelling.xlsx",
    sheet_name=1,
)

In [3]:
# Show the column labels of the loaded sheet to see which fields are available.
dataset.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [4]:
# Work on a copy of the data without the 'ZIP Code' and 'ID' columns.
dataset1 = dataset.drop(columns=['ZIP Code', 'ID'])

In [5]:
# Count missing values in each remaining column.
dataset1.isna().sum()

Age                   0
Experience            0
Income                0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [6]:
# Replace CCAvg with its value rounded to zero decimal places.
dataset1['CCAvg'] = dataset1['CCAvg'].round()

In [7]:
# Display the working frame (after dropping 'ZIP Code'/'ID' and rounding CCAvg).
dataset1

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,2.0,1,0,0,1,0,0,0
1,45,19,34,3,2.0,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,3.0,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,2.0,3,0,0,0,0,1,0
4996,30,4,15,4,0.0,1,85,0,0,0,1,0
4997,63,39,24,2,0.0,3,0,0,0,0,0,0
4998,65,40,49,3,0.0,2,0,0,0,0,1,0


In [8]:
# Random forest used to rank the candidate predictors by importance.
# oob_score=True makes the fitted model expose an out-of-bag score
# (read later via rf_model.oob_score_).
# random_state is fixed so the bootstrap samples and per-split feature draws,
# and therefore the OOB score and feature importances, are reproducible
# when the notebook is re-run.
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_features=2,
    oob_score=True,
    random_state=42,
)

In [9]:
# Candidate predictor columns: every remaining column except the
# 'Personal Loan' target.
features = [
    "Age",
    "Experience",
    "Income",
    "Family",
    "CCAvg",
    "Education",
    "Mortgage",
    "Securities Account",
    "CD Account",
    "Online",
    "CreditCard",
]

In [10]:
# Fit the forest on all candidate predictors against the 'Personal Loan' target.
rf_model.fit(
    dataset1[features],
    dataset1['Personal Loan'],
)

RandomForestClassifier(max_features=2, n_estimators=1000, oob_score=True)

In [11]:
# Report the out-of-bag score of the fitted forest (label and value on separate lines).
print("OOB Score", rf_model.oob_score_, sep="\n")

OOB Score
0.9864


In [12]:
# Print each predictor next to its importance in the fitted forest;
# feature_importances_ is paired with `features` by position.
for feature_name, importance in zip(features, rf_model.feature_importances_):
    print(feature_name, importance)

Age 0.05002315553121347
Experience 0.04961159629657548
Income 0.3645662475246199
Family 0.10335567788768081
CCAvg 0.13847626722696524
Education 0.16797706504815937
Mortgage 0.04612267587092417
Securities Account 0.006284987843529536
CD Account 0.053289807059892745
Online 0.009327315134974546
CreditCard 0.010965204575464725


# From the feature importances above, Income, Education and CCAvg are the most important variables
Decision tree built using these variables

In [13]:
# Size-capped decision tree: at most 6 levels deep and at most 12 leaf nodes.
# random_state is fixed because scikit-learn permutes the features at every
# split; without it, equally good splits can be chosen differently between runs.
tree_model = tree.DecisionTreeClassifier(
    max_depth=6,
    max_leaf_nodes=12,
    random_state=42,
)

In [14]:
# Columns used as inputs to the decision tree.
predictors = [
    "Education",
    "CCAvg",
    "Income",
]

In [15]:
# Fit the decision tree on the selected predictors against the 'Personal Loan' target.
tree_model.fit(
    dataset1[predictors],
    dataset1['Personal Loan'],
)

DecisionTreeClassifier(max_depth=6, max_leaf_nodes=12)

In [16]:
# Write the fitted tree to Dtree.dot in Graphviz DOT format.
# export_graphviz returns None when out_file is supplied, so its result is
# deliberately not bound to a name; this keeps the file handle from being
# shadowed inside the `with` block.
with open("Dtree.dot", "w") as dot_file:
    tree.export_graphviz(tree_model, out_file=dot_file, feature_names=predictors)