# Random Forest Classifier

- Ensemble method - Collection of Decision Trees

In [1]:
import pickle as pkl

# Load the pickled Titanic data into df_data.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
pickle_path = '../data/titanic_tansformed.pkl'
with open(pickle_path, 'rb') as pickle_file:
    df_data = pkl.load(pickle_file)

In [2]:
# Preview the first five rows of the loaded data.
df_data.head(n=5)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,2,3,male,Q,S
0,0,22.0,1,0,7.25,0,1,1,0,1
1,1,38.0,1,0,71.2833,0,0,0,0,0
2,1,26.0,0,0,7.925,0,1,0,0,1
3,1,35.0,1,0,53.1,0,0,0,0,1
4,0,35.0,0,0,8.05,0,1,1,0,1


In [3]:
# Dimensions of the loaded data as (rows, columns).
df_data.shape

(889, 10)

In [4]:
# Target: the "Survived" column. Features: every other column.
label = df_data["Survived"]
data = df_data.drop(columns=["Survived"])

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(data_train, data_test,
 label_train, label_test) = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=101,
)

In [6]:
from sklearn.ensemble import RandomForestClassifier
import time

# Fit a random forest on the training split and time the fit.
# random_state is fixed so the forest, and therefore the metrics printed
# below, are reproducible after a kernel restart.
tic = time.perf_counter()  # monotonic clock intended for measuring intervals
rf_cla = RandomForestClassifier(random_state=101)
rf_cla.fit(data_train, label_train)
print('Time taken for training Random Forest', (time.perf_counter() - tic), 'secs')

# Evaluate on the held-out test split.
predictions = rf_cla.predict(data_test)
print('Accuracy', rf_cla.score(data_test, label_test))

from sklearn.metrics import classification_report, confusion_matrix
# Confusion matrix rows are the true labels, columns the predicted labels.
print(confusion_matrix(label_test, predictions))
print(classification_report(label_test, predictions))

Time taken for training Decision Tree 0.013599872589111328 secs
Accuracy 0.8146067415730337
[[95 12]
 [21 50]]
             precision    recall  f1-score   support

          0       0.82      0.89      0.85       107
          1       0.81      0.70      0.75        71

avg / total       0.81      0.81      0.81       178



### Hyperparameters for Random Forest
- There are a number of [hyperparameters](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) for a random forest classifier
- The most commonly adjusted parameters are
    - __n_estimators__ - the number of trees the algorithm builds before combining their predictions by majority vote (classification) or by averaging (regression)
    - __n_jobs__ - tells the engine how many processors it is allowed to use. A value of 1 means it can use only one processor; a value of “-1” means it may use all available processors. It affects how fast the model trains and predicts, not what it predicts.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Candidate values explored by the grid search.
n_estimators = [2,3,4,5,6,7,8]
# NOTE(review): n_jobs sets how many jobs run in parallel and is not expected
# to change accuracy, so searching over it mainly multiplies the number of
# fits. It is kept in the grid so that 'Best Jobs' below reports a searched
# value; consider fixing it outside the grid instead.
n_jobs = [1, 2, 3, 4, 5, 10, 20]
score_func = 'accuracy'

# Fixed random_state so score differences between grid points come from the
# parameters rather than from run-to-run randomness of the forest.
rf_cla = RandomForestClassifier(random_state=101)
rf_grid = GridSearchCV(estimator=rf_cla,
                    param_grid=[{'n_estimators':n_estimators, 'n_jobs':n_jobs}],
                    cv=5,
                    scoring=score_func)
rf_grid.fit(data_train, label_train)

# Report results from the fitted search object (rf_grid); the previous code
# referenced an undefined name, dt_grid, and raised NameError.
print('Best Score', rf_grid.best_score_)
print('Best Number of Trees', rf_grid.best_estimator_.n_estimators)
print('Best Jobs', rf_grid.best_estimator_.n_jobs)