# Decision Tree Explained

In [1]:
data_dir="/Users/akr712/Desktop/tree"
import os
import pandas as pd

In [2]:
os.chdir(data_dir)
dat=pd.read_csv("credit_history.csv")
dat.isnull().sum()

default        0
amount         0
grade          0
years        279
ownership      0
income         0
age            0
dtype: int64

In [3]:
dat.head()

Unnamed: 0,default,amount,grade,years,ownership,income,age
0,0,1000,B,2.0,RENT,19200.0,24
1,1,6500,A,2.0,MORTGAGE,66000.0,28
2,0,2400,A,2.0,RENT,60000.0,36
3,0,10000,C,3.0,RENT,62000.0,24
4,1,4000,C,2.0,RENT,20000.0,28


In [4]:
dat['years'].describe()

count    7448.000000
mean        6.086332
std         6.700758
min         0.000000
25%         2.000000
50%         4.000000
75%         8.000000
max        62.000000
Name: years, dtype: float64

In [5]:
dat['years'].fillna(4,inplace=True)
X=dat.drop("default",axis=1)

In [6]:
X['grade'].unique()

array(['B', 'A', 'C', 'D', 'E', 'F', 'G'], dtype=object)

In [7]:
X=pd.get_dummies(X)
y=dat['default']

In [8]:
X.head()

Unnamed: 0,amount,years,income,age,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,ownership_MORTGAGE,ownership_OTHER,ownership_OWN,ownership_RENT
0,1000,2.0,19200.0,24,0,1,0,0,0,0,0,0,0,0,1
1,6500,2.0,66000.0,28,1,0,0,0,0,0,0,1,0,0,0
2,2400,2.0,60000.0,36,1,0,0,0,0,0,0,0,0,0,1
3,10000,3.0,62000.0,24,0,0,1,0,0,0,0,0,0,0,1
4,4000,2.0,20000.0,28,0,0,1,0,0,0,0,0,0,0,1


In [9]:
import sklearn.model_selection as model_selection
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=200)

In [10]:
import sklearn.tree as tree
clf=tree.DecisionTreeClassifier(max_depth=3,random_state=200)
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=200,
            splitter='best')

In [11]:
cls=clf.predict(X_train)
pred=pd.DataFrame({'y_train':y_train,'cls':cls})

In [12]:
pred.query("y_train==1 & cls==1").shape[0]

2136

In [13]:
pred.query("y_train==0 & cls==0").shape[0]

1773

In [14]:
(2136+1773)/pred.shape[0]

0.6324219381977026

In [15]:
clf.score(X_test,y_test) #Average number of correct predictions

0.6274256144890039

In [16]:
clf.predict_proba(X_test)

array([[0.7838765 , 0.2161235 ],
       [0.7838765 , 0.2161235 ],
       [0.866171  , 0.133829  ],
       ...,
       [0.33271144, 0.66728856],
       [0.62803532, 0.37196468],
       [0.49720149, 0.50279851]])

In [17]:
import sklearn.metrics as metrics
metrics.roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

0.6721250820352787

In [22]:
model=model_selection.GridSearchCV(clf, param_grid={'max_depth':[3,4,5]})
model.fit(X_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=200,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [3, 4, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [23]:
model.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=200,
            splitter='best')

In [24]:
model.best_score_

0.6298333602976864

In [25]:
import sklearn.metrics as metrics
metrics.roc_auc_score(y_test,model.predict_proba(X_test)[:,1])

0.6760493818892891

In [26]:
#Feature importance
pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False).head(5)

grade_A           0.554912
grade_B           0.245647
income            0.199442
ownership_RENT    0.000000
ownership_OWN     0.000000
dtype: float64

In [27]:
###Regression Tree

In [28]:
import os
import pandas as pd

dat=pd.read_csv("dm.csv")
dat.columns

X=dat.drop(['Cust_Id','AmountSpent'],axis=1)
X=pd.get_dummies(X)
y=dat['AmountSpent']

In [29]:
X.head()

Unnamed: 0,Salary,Children,Catalogs,Age_Middle,Age_Old,Age_Young,Gender_Female,Gender_Male,OwnHome_Own,OwnHome_Rent,Married_Married,Married_Single,Location_Close,Location_Far,History_High,History_Low,History_Medium
0,47500,0,6,0,1,0,1,0,1,0,0,1,0,1,1,0,0
1,63600,0,6,1,0,0,0,1,0,1,0,1,1,0,1,0,0
2,13500,0,18,0,0,1,1,0,0,1,0,1,1,0,0,1,0
3,85600,1,18,1,0,0,0,1,1,0,1,0,1,0,1,0,0
4,68400,0,12,1,0,0,1,0,1,0,0,1,1,0,1,0,0


In [30]:
import sklearn.model_selection as model_selection

X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.2,random_state=200)

In [31]:
import sklearn.tree as tree
reg=tree.DecisionTreeRegressor(max_depth=3)
reg.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [32]:
reg.score(X_test,y_test)

0.5915701153457273

In [39]:
import sklearn.metrics

In [40]:
sklearn.metrics.mean_squared_error(y_test,reg.predict(X_test))

421270.24467456294

In [41]:
#Feature Importance: Total reduction of purity measure brought out by a feature
pd.Series(reg.feature_importances_,index=X.columns).sort_values(ascending=False)

Salary             0.680609
Catalogs           0.177714
History_Medium     0.059444
History_Low        0.045373
History_High       0.036859
Location_Far       0.000000
Location_Close     0.000000
Married_Single     0.000000
Married_Married    0.000000
OwnHome_Rent       0.000000
Gender_Male        0.000000
Gender_Female      0.000000
Age_Young          0.000000
Age_Old            0.000000
Age_Middle         0.000000
Children           0.000000
OwnHome_Own        0.000000
dtype: float64

In [42]:
#Interpret the tree

In [43]:
X_test.shape

(200, 17)

In [44]:
sklearn.metrics.mean_squared_error(y_train,reg.predict(X_train))

334416.1033783246