# Decision Tree

In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
%run adspy_shared_utilities.ipynb  
%matplotlibinline

UsageError: Line magic function `%matplotlibinline` not found.


In [9]:
data = load_iris()

In [10]:
data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [11]:
X = pd.DataFrame(data['data'],columns=data['feature_names'])
X.shape


(150, 4)

In [12]:
y = pd.DataFrame(data['target'],columns=['class'])
y.shape


(150, 1)

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,stratify=y,random_state=0)

In [14]:
y_test.shape

(30, 1)

In [15]:
clf = DecisionTreeClassifier(max_depth=3).fit(X_train,y_train)
print(clf.score(X_train,y_train))
print(clf.score(X_test,y_test))
y_pred = clf.predict(X_test)

0.9666666666666667
0.9333333333333333


# Classification report

In [18]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.83      0.91        12
           2       0.80      1.00      0.89         8

    accuracy                           0.93        30
   macro avg       0.93      0.94      0.93        30
weighted avg       0.95      0.93      0.93        30



In [19]:
import sklearn
print(sklearn.__version__)

0.21.2


# Cross Validation

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
print('accuracy',cross_val_score(clf,X_train,y_train,cv=5))

accuracy [0.95833333 0.91666667 0.91666667 0.95833333 0.91666667]


In [22]:
#print('AUC',cross_val_score(clf,X_train,y_train,cv=4,scoring='roc_auc'))
#print('recall',cross_val_score(clf,X_train,y_train,cv=4,scoring='recall'))
#print('precision',cross_val_score(clf,X_train,y_train,cv=4,scoring='precision'))

# GridSearch

In [23]:
min_samples_split = np.arange(2,10,1)
min_samples_leaf = np.arange(2,10,1)
min_samples_leaf

array([2, 3, 4, 5, 6, 7, 8, 9])

In [24]:
from sklearn.model_selection import GridSearchCV

grid = {'max_depth':[1,2,3,4,5,67,8,9],'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}
#help(GridSearchCV)
grid_clf = GridSearchCV(clf,cv=4,param_grid=grid)
grid_clf.fit(X_train,y_train)
print(grid_clf.best_params_)
print(grid_clf.best_score_)


{'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2}
0.9583333333333334


In [25]:
grid_clf = GridSearchCV(clf,cv=5,param_grid=grid)
grid_clf.fit(X_train,y_train)
print(grid_clf.best_params_)
print(grid_clf.best_score_)

{'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2}
0.9583333333333334


# RandomForest

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [27]:
y_t = np.asarray(y_train).squeeze(-1)

In [28]:
clf_rf = RandomForestClassifier(n_jobs=-1,random_state=0).fit(X_train,y_train) # n_jobs = -1, to use all core of cpu
y_pred = clf_rf.predict(X_train)
clf_rf.score(X_test,y_test)

  """Entry point for launching an IPython kernel.


0.9333333333333333

In [29]:
n_estimators = np.arange(5,15,1) # number  of trees used
max_depth = np.arange(3,10,1)    
max_features = [2,3]            

In [32]:
grid = {'n_estimators':n_estimators,'max_depth':max_depth,'max_features':max_features}
grid_rf = GridSearchCV(clf_rf,param_grid=grid)

In [34]:
grid_rf.fit(X_train,np.asarray(y_train).squeeze(-1))
print(grid_rf.best_params_)
print(grid_rf.best_score_)



{'max_depth': 3, 'max_features': 3, 'n_estimators': 7}
0.9583333333333334


