## Decision tree and Random forest

In [None]:
#import libraries
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [None]:
# Attach Google Drive to the Colab runtime; the predictions CSV is written under it at the end
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


### Data Generation

In [None]:
# Generate a synthetic 3-class classification data set: 1000 samples, 5 features,
# 3 of which are informative.
# random_state is fixed so that "Restart & Run All" regenerates the same X and y;
# without it every metric in this notebook changes from run to run.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=5,
    n_classes=3,
    n_informative=3,
    random_state=42,
)

In [None]:
# Wrap the generated feature matrix in a DataFrame (one column per feature)
features_df = pd.DataFrame(data=X)

In [None]:
# Wrap the generated class labels in a single-column DataFrame
labels_df = pd.DataFrame(data=y)

In [None]:
# Hold out 30% of the samples as the test set.
# random_state pins which rows land in each split so the train/test metrics
# reported below are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)


#### Decision tree classifier

In [None]:
# Baseline decision tree with default hyper-parameters.
# random_state is fixed because scikit-learn permutes the features at every split,
# so equally good splits can otherwise be resolved differently from run to run.
model_decision_tree = DecisionTreeClassifier(random_state=42)
# Fit on the training split (last expression, so the fitted estimator is displayed)
model_decision_tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [None]:
# Run the fitted decision tree over both splits: training predictions first, then test
y_train_pred_decision_tree, y_test_pred_decision_tree = (
    model_decision_tree.predict(split) for split in (X_train, X_test)
)

In [None]:
# Start the table of test-set predictions (one column per model) that is later saved as CSV
test_predictions_df = pd.Series(y_test_pred_decision_tree, name='DT_test_predicted').to_frame()

In [None]:
# Decision tree, training split: confusion matrix followed by overall accuracy
print('confusion matrix', confusion_matrix(y_train, y_train_pred_decision_tree), sep='\n')
print('accuracy', accuracy_score(y_train, y_train_pred_decision_tree))

confusion matrix
[[235   0   0]
 [  0 243   0]
 [  0   0 222]]
accuracy 1.0


In [None]:
# Decision tree, training split: macro-averaged precision / recall / F1,
# then the per-class breakdown from classification_report
for label, scorer in (('precision', precision_score), ('recall', recall_score), ('f1-score', f1_score)):
    print(label, scorer(y_train, y_train_pred_decision_tree, average='macro'))
print(classification_report(y_train, y_train_pred_decision_tree))

precision 1.0
recall 1.0
f1-score 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       235
           1       1.00      1.00      1.00       243
           2       1.00      1.00      1.00       222

    accuracy                           1.00       700
   macro avg       1.00      1.00      1.00       700
weighted avg       1.00      1.00      1.00       700



In [None]:
# Decision tree, test split: confusion matrix followed by overall accuracy
print('confusion matrix', confusion_matrix(y_test, y_test_pred_decision_tree), sep='\n')
print('accuracy', accuracy_score(y_test, y_test_pred_decision_tree))

confusion matrix
[[80  7 12]
 [ 9 69 14]
 [21 10 78]]
accuracy 0.7566666666666667


In [None]:
# Decision tree, test split: macro-averaged precision / recall / F1,
# then the per-class breakdown from classification_report
for label, scorer in (('precision', precision_score), ('recall', recall_score), ('f1-score', f1_score)):
    print(label, scorer(y_test, y_test_pred_decision_tree, average='macro'))
print(classification_report(y_test, y_test_pred_decision_tree))

precision 0.7598661028893586
recall 0.7578923794520125
f1-score 0.7577418347693458
              precision    recall  f1-score   support

           0       0.73      0.81      0.77        99
           1       0.80      0.75      0.78        92
           2       0.75      0.72      0.73       109

    accuracy                           0.76       300
   macro avg       0.76      0.76      0.76       300
weighted avg       0.76      0.76      0.76       300



#### Random forest classifier

In [None]:
# Baseline random forest with default hyper-parameters.
# random_state is fixed because bootstrap sampling and per-split feature sampling
# are random; unseeded, the forest (and its scores) differ on every run.
model_random_forest = RandomForestClassifier(random_state=42)
# Fit on the training split (last expression, so the fitted estimator is displayed)
model_random_forest.fit(X_train, y_train)

RandomForestClassifier()

In [None]:
# Run the fitted random forest over both splits: training predictions first, then test
y_train_pred_random_forest, y_test_pred_random_forest = (
    model_random_forest.predict(split) for split in (X_train, X_test)
)

In [None]:
# Append the random forest's test-set predictions as a new column of the predictions table
test_predictions_df.loc[:, 'RF_test_predicted'] = y_test_pred_random_forest

In [None]:
# Random forest, training split: confusion matrix followed by overall accuracy
print('confusion matrix', confusion_matrix(y_train, y_train_pred_random_forest), sep='\n')
print('accuracy', accuracy_score(y_train, y_train_pred_random_forest))

confusion matrix
[[235   0   0]
 [  0 243   0]
 [  0   0 222]]
accuracy 1.0


In [None]:
# Random forest, training split: macro-averaged precision / recall / F1,
# then the per-class breakdown from classification_report
for label, scorer in (('precision', precision_score), ('recall', recall_score), ('f1-score', f1_score)):
    print(label, scorer(y_train, y_train_pred_random_forest, average='macro'))
print(classification_report(y_train, y_train_pred_random_forest))

precision 1.0
recall 1.0
f1-score 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       235
           1       1.00      1.00      1.00       243
           2       1.00      1.00      1.00       222

    accuracy                           1.00       700
   macro avg       1.00      1.00      1.00       700
weighted avg       1.00      1.00      1.00       700



In [None]:
# Random forest, test split: confusion matrix followed by overall accuracy
print('confusion matrix', confusion_matrix(y_test, y_test_pred_random_forest), sep='\n')
print('accuracy', accuracy_score(y_test, y_test_pred_random_forest))

confusion matrix
[[85  4 10]
 [ 5 79  8]
 [15  9 85]]
accuracy 0.83


In [None]:
# Random forest, test split: macro-averaged precision / recall / F1,
# then the per-class breakdown from classification_report
for label, scorer in (('precision', precision_score), ('recall', recall_score), ('f1-score', f1_score)):
    print(label, scorer(y_test, y_test_pred_random_forest, average='macro'))
print(classification_report(y_test, y_test_pred_random_forest))

precision 0.8311540600481081
recall 0.8323660081737465
f1-score 0.8313052593200255
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        99
           1       0.86      0.86      0.86        92
           2       0.83      0.78      0.80       109

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300



### Observations

In [None]:
# Depth of the fitted baseline decision tree, read through the estimator's public accessor
print('Depth of tree', model_decision_tree.get_depth())

Depth of tree 13


In [None]:
# Leaf count of the fitted baseline decision tree, read through the estimator's public accessor
print('Number of leaves', model_decision_tree.get_n_leaves())

Number of leaves 114


#### Write your observations on the predictions of the above models 

### Observation
##### Random forest has higher accuracy, precision, recall and f1-score than the decision tree
##### For test data
model | accuracy | precision | recall | f1-score
--- | --- | --- | --- | ---
decision tree | 0.756 | 0.759 | 0.757 | 0.757
random forest | 0.83 | 0.831 | 0.832 | 0.831


### Hyper parameter tuning

#### Decision tree classifier

In [None]:
# Hyper-parameter grid for the decision tree.
# - 'auto' is not listed under max_features: for DecisionTreeClassifier it was an alias
#   of 'sqrt', was deprecated in scikit-learn 1.1 and removed in 1.3, so it only
#   duplicated candidates (and fails on current releases).
# - min_samples_leaf is listed once; repeating the same value ([1, 1, 1]) tripled the
#   number of fits without adding any new candidate.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15],
    'min_samples_split': [10, 20, 30],
    'min_samples_leaf': [1],
    'max_leaf_nodes': [100, 200, 300],
    'splitter': ['best', 'random'],
    'max_features': ['sqrt', 'log2'],
}

# Base estimator; seeded so that candidates using random splits / feature subsets
# are scored identically on every run and best_params_ is reproducible.
dtc = DecisionTreeClassifier(random_state=42)
# Exhaustive search over the grid using GridSearchCV's default cross-validation and scoring
gcv = GridSearchCV(dtc, param_grid=param_grid)
# Fit on the training split (last expression, so the fitted search object is displayed)
gcv.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'max_leaf_nodes': [100, 200, 300],
                         'min_samples_leaf': [1, 1, 1],
                         'min_samples_split': [10, 20, 30],
                         'splitter': ['best', 'random']})

In [None]:
# Keep the best hyper-parameter combination found by the grid search (it is reused
# below to train the tuned decision tree) and print it
decision_tree_best_params = gcv.best_params_
print(decision_tree_best_params)

{'criterion': 'entropy', 'max_depth': 5, 'max_features': 'log2', 'max_leaf_nodes': 200, 'min_samples_leaf': 1, 'min_samples_split': 20, 'splitter': 'best'}


In [None]:
# Train a fresh decision tree with the best combination found by the grid search.
# The best-parameter dict holds one entry per searched hyper-parameter, so it is
# unpacked directly as keyword arguments.
model_tuned_decision_tree = DecisionTreeClassifier(**decision_tree_best_params)
model_tuned_decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features='log2',
                       max_leaf_nodes=200, min_samples_split=20)

In [None]:
# Run the tuned decision tree over both splits: training predictions first, then test
y_train_pred_tuned_decision_tree, y_test_pred_tuned_decision_tree = (
    model_tuned_decision_tree.predict(split) for split in (X_train, X_test)
)

In [None]:
# Append the tuned decision tree's test-set predictions to the predictions table.
# The column is named 'Tuned_DT_…' to match the 'DT_' / 'RF_' / 'Tuned_RF_' columns
# (the earlier 'Tuned_DF_…' spelling was a typo).
# No CSV is written here: `path` is only defined in the final cell, so writing at
# this point raised NameError on a fresh kernel. The final cell writes the complete
# table once all four prediction columns exist.
test_predictions_df['Tuned_DT_test_predicted'] = y_test_pred_tuned_decision_tree

In [None]:
# Tuned decision tree, training split: confusion matrix followed by overall accuracy
print('confusion matrix', confusion_matrix(y_train, y_train_pred_tuned_decision_tree), sep='\n')
print('accuracy', accuracy_score(y_train, y_train_pred_tuned_decision_tree))

confusion matrix
[[194  20  21]
 [  2 232   9]
 [ 35  19 168]]
accuracy 0.8485714285714285


In [None]:
# Tuned decision tree, training split: macro-averaged precision / recall / F1,
# then the per-class breakdown from classification_report
for label, scorer in (('precision', precision_score), ('recall', recall_score), ('f1-score', f1_score)):
    print(label, scorer(y_train, y_train_pred_tuned_decision_tree, average='macro'))
print(classification_report(y_train, y_train_pred_tuned_decision_tree))

precision 0.848133416399099
recall 0.8456737273128132
f1-score 0.8451139203865444
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       235
           1       0.86      0.95      0.90       243
           2       0.85      0.76      0.80       222

    accuracy                           0.85       700
   macro avg       0.85      0.85      0.85       700
weighted avg       0.85      0.85      0.85       700



In [None]:
# Tuned decision tree, test split: confusion matrix followed by overall accuracy
print('confusion matrix', confusion_matrix(y_test, y_test_pred_tuned_decision_tree), sep='\n')
print('accuracy', accuracy_score(y_test, y_test_pred_tuned_decision_tree))

confusion matrix
[[77  8 14]
 [ 2 82  8]
 [27 14 68]]
accuracy 0.7566666666666667


In [None]:
# Tuned decision tree, test split: macro-averaged precision / recall / F1,
# then the per-class breakdown from classification_report
for label, scorer in (('precision', precision_score), ('recall', recall_score), ('f1-score', f1_score)):
    print(label, scorer(y_test, y_test_pred_tuned_decision_tree, average='macro'))
print(classification_report(y_test, y_test_pred_tuned_decision_tree))

precision 0.7568107294522388
recall 0.7643117788710129
f1-score 0.7571237638332695
              precision    recall  f1-score   support

           0       0.73      0.78      0.75        99
           1       0.79      0.89      0.84        92
           2       0.76      0.62      0.68       109

    accuracy                           0.76       300
   macro avg       0.76      0.76      0.76       300
weighted avg       0.76      0.76      0.75       300



#### Random forest classifier

In [None]:
# Candidate values for each random forest hyper-parameter; RandomizedSearchCV draws
# combinations from these lists rather than trying every one.
random_grid = {
    'n_estimators': [100, 150, 200],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 10],
    'bootstrap': [True, False],
}
# Base estimator; seeded so each sampled candidate is scored identically on every run
rfc = RandomForestClassifier(random_state=42)
# Randomized search; random_state pins which combinations are sampled, so
# best_params_ no longer changes between runs of the notebook.
rcv = RandomizedSearchCV(rfc, param_distributions=random_grid, random_state=42)
# Fit on the training split (last expression, so the fitted search object is displayed)
rcv.fit(X_train, y_train)


RandomizedSearchCV(estimator=RandomForestClassifier(),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 15, 20],
                                        'max_features': ['sqrt', 'log2', None],
                                        'min_samples_leaf': [1, 5, 10],
                                        'min_samples_split': [2, 3, 5],
                                        'n_estimators': [100, 150, 200]})

In [None]:
# Keep the best hyper-parameter combination found by the randomized search (it is
# reused below to train the tuned random forest) and print it
randomized_forest_best_params = rcv.best_params_
print(randomized_forest_best_params)


{'n_estimators': 150, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 15, 'bootstrap': True}


In [None]:
# Train a fresh random forest with the best combination found by the randomized search.
# The best-parameter dict holds one entry per searched hyper-parameter, so it is
# unpacked directly as keyword arguments.
model_tuned_random_forest = RandomForestClassifier(**randomized_forest_best_params)
model_tuned_random_forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=15, max_features='log2', min_samples_split=3,
                       n_estimators=150)

In [None]:
# Run the tuned random forest over both splits: training predictions first, then test
y_train_pred_tuned_random_forest, y_test_pred_tuned_random_forest = (
    model_tuned_random_forest.predict(split) for split in (X_train, X_test)
)

In [None]:
# Append the tuned random forest's test-set predictions as a new column of the predictions table
test_predictions_df.loc[:, 'Tuned_RF_test_predicted'] = y_test_pred_tuned_random_forest

In [None]:
# Tuned random forest, training split: confusion matrix and overall accuracy
print('confusion matrix', confusion_matrix(y_train, y_train_pred_tuned_random_forest), sep='\n')
print('accuracy', accuracy_score(y_train, y_train_pred_tuned_random_forest))
# Macro-averaged precision / recall / F1, then the per-class breakdown
for label, scorer in (('precision', precision_score), ('recall', recall_score), ('f1-score', f1_score)):
    print(label, scorer(y_train, y_train_pred_tuned_random_forest, average='macro'))
print(classification_report(y_train, y_train_pred_tuned_random_forest))


confusion matrix
[[234   0   1]
 [  0 243   0]
 [  0   0 222]]
accuracy 0.9985714285714286
precision 0.9985052316890881
recall 0.9985815602836879
f1-score 0.9985402042755723
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       235
           1       1.00      1.00      1.00       243
           2       1.00      1.00      1.00       222

    accuracy                           1.00       700
   macro avg       1.00      1.00      1.00       700
weighted avg       1.00      1.00      1.00       700



In [None]:
# Tuned random forest, test split: confusion matrix and overall accuracy
print('confusion matrix', confusion_matrix(y_test, y_test_pred_tuned_random_forest), sep='\n')
print('accuracy', accuracy_score(y_test, y_test_pred_tuned_random_forest))
# Macro-averaged precision / recall / F1, then the per-class breakdown
for label, scorer in (('precision', precision_score), ('recall', recall_score), ('f1-score', f1_score)):
    print(label, scorer(y_test, y_test_pred_tuned_random_forest, average='macro'))
print(classification_report(y_test, y_test_pred_tuned_random_forest))

confusion matrix
[[84  4 11]
 [ 3 79 10]
 [16  9 84]]
accuracy 0.8233333333333334
precision 0.8247432109188125
recall 0.825940900831208
f1-score 0.8251418498209024
              precision    recall  f1-score   support

           0       0.82      0.85      0.83        99
           1       0.86      0.86      0.86        92
           2       0.80      0.77      0.79       109

    accuracy                           0.82       300
   macro avg       0.82      0.83      0.83       300
weighted avg       0.82      0.82      0.82       300



In [None]:
# Save the table of test-set predictions (one column per model) as a CSV on the mounted Drive
path = '/content/drive/MyDrive/test_predictions.csv'
test_predictions_df.to_csv(path_or_buf=path)