# CS3481 Assignment 2 - Random forest, Naïve Bayesian

In [1]:
from typing import List, Tuple
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix
import graphviz

## Preparing dataset
This project uses the [UCI Forest Dataset](https://archive.ics.uci.edu/ml/datasets/Forest+type+mapping). I have downloaded the training dataset and the test dataset as "training.csv" and "testing.csv". These two csv files must be in the same directory as this notebook.  
 
List of sklearn modules:
- ensemble: sklearn.ensemble.RandomForestClassifier is a class for constructing random forests.
- preprocessing: sklearn.preprocessing.LabelEncoder is used to convert the alphabetic class labels into integers.

In [2]:
# Load the training split from the CSV file that sits next to this notebook.
# `dftrain` and `fr_train` are two names for the same DataFrame object.
dftrain = pd.read_csv("training.csv", sep=',')
fr_train = dftrain

In [3]:
# Preprocessing: the first CSV column holds the alphabetic class labels;
# encode them as integers so they can be used as training targets.
training_labels = fr_train.iloc[:, 0]
le = LabelEncoder()
labels = le.fit_transform(training_labels)
# the fitted encoder keeps the original label strings
print(le.classes_)

In [5]:
# Take a look at the training dataset.

# list of feature names: every column except the first, which holds the class label
lfeature_names = fr_train.columns[1:]
# label the output with the variable actually being printed
print(f"lfeature_names: {lfeature_names}")

# list of classes ['d ' 'h ' 'o ' 's ']
lclass_names = le.classes_
print(f"lclass_names: {lclass_names}")

lclass_names: Index(['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8', 'b9',
       'pred_minus_obs_H_b1', 'pred_minus_obs_H_b2', 'pred_minus_obs_H_b3',
       'pred_minus_obs_H_b4', 'pred_minus_obs_H_b5', 'pred_minus_obs_H_b6',
       'pred_minus_obs_H_b7', 'pred_minus_obs_H_b8', 'pred_minus_obs_H_b9',
       'pred_minus_obs_S_b1', 'pred_minus_obs_S_b2', 'pred_minus_obs_S_b3',
       'pred_minus_obs_S_b4', 'pred_minus_obs_S_b5', 'pred_minus_obs_S_b6',
       'pred_minus_obs_S_b7', 'pred_minus_obs_S_b8', 'pred_minus_obs_S_b9'],
      dtype='object')
lclass_names: ['d ' 'h ' 'o ' 's ']


## Model building and training

In [9]:
# First Random Forest: 10 trees, each capped at depth 3, with a fixed seed
RF = RandomForestClassifier(
    random_state=2323,
    max_depth=3,
    n_estimators=10,
)

In [10]:
# Train the first forest; the fitted estimator is kept as `clf1`
clf1 = RF.fit(
    fr_train.iloc[:, 1:],  # feature columns (everything after the label column)
    labels,                # integer-encoded class labels
)

## Model evaluation

In [11]:
# Load the test split
fr_test = pd.read_csv("testing.csv", sep=',')
# .values drops the pandas column headers and leaves a bare ndarray
# (the original author notes this is done to avoid a runtime warning)
testdata = fr_test.iloc[:, 1:].values
fr_predictions = clf1.predict(testdata)

# Decode the integer predictions back to label strings before scoring
clf1_predicted_labels = lclass_names[fr_predictions]

# Prediction performance
clf1_score = accuracy_score(fr_test.iloc[:, 0], clf1_predicted_labels)
print("Accuracy score for first forest: \n", clf1_score)
# Confusion matrix for this forest, classes in the order d, h, o, s
print(confusion_matrix(fr_test.iloc[:, 0], clf1_predicted_labels, labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for first forest: 
 0.8123076923076923
[[ 78   0   7  20]
 [  0  29   0   9]
 [ 10   0  34   2]
 [  4   9   0 123]]


In [None]:
# Look at the first component tree of the first Random Forest and render it as a PNG.
clf1_first_tree = clf1.estimators_[0]
dot_data = tree.export_graphviz(
    clf1_first_tree,
    out_file=None,  # get the DOT source back as a string instead of writing a file
    # Hand over plain Python lists rather than a pandas Index / numpy array:
    # lists are accepted by every scikit-learn release, whereas some releases
    # validate these two parameters strictly and reject other array types.
    feature_names=list(lfeature_names),
    class_names=list(lclass_names),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# render() writes the image next to the notebook and returns the path it wrote
filepath = graph.render('RandFrst_clf1_tree1', format='png')
print(filepath)

### Task (a)
Build another three Random Forests with different input parameters to the class constructor, and compare their performance with each other.

In [12]:
# Second Random Forest: 15 trees, same depth limit and seed as the first
RF2 = RandomForestClassifier(random_state=2323, max_depth=3, n_estimators=15)
clf2 = RF2.fit(fr_train.iloc[:, 1:], labels)
fr_predictions2 = clf2.predict(testdata)
# Decode the integer predictions back to label strings before scoring
clf2_predicted_labels = lclass_names[fr_predictions2]
# Prediction performance
clf2_score = accuracy_score(fr_test.iloc[:, 0], clf2_predicted_labels)
print("Accuracy score for second forest: \n", clf2_score)
# Confusion matrix for this forest
print(confusion_matrix(fr_test.iloc[:, 0], clf2_predicted_labels, labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for second forest: 
 0.8184615384615385
[[ 81   0   6  18]
 [  0  28   0  10]
 [ 11   0  34   1]
 [  5   8   0 123]]


In [11]:
# Third Random Forest: 25 trees, same depth limit and seed as the first
RF3 = RandomForestClassifier(random_state=2323, max_depth=3, n_estimators=25)
clf3 = RF3.fit(fr_train.iloc[:, 1:], labels)
fr_predictions3 = clf3.predict(testdata)
# Decode the integer predictions back to label strings before scoring
clf3_predicted_labels = lclass_names[fr_predictions3]
# Prediction performance
clf3_score = accuracy_score(fr_test.iloc[:, 0], clf3_predicted_labels)
print("Accuracy score for this forest: \n", clf3_score)
# Confusion matrix for this forest
print(confusion_matrix(fr_test.iloc[:, 0], clf3_predicted_labels, labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for this forest: 
 0.8092307692307692
[[ 77   0   9  19]
 [  0  30   0   8]
 [ 11   0  33   2]
 [  6   7   0 123]]


In [10]:
# Fourth Random Forest: 30 trees, same depth limit and seed as the first
RF4 = RandomForestClassifier(n_estimators=30, max_depth=3, random_state=2323)
clf4 = RF4.fit(fr_train.iloc[:,1:], labels)
# Reuse `fr_test` / `testdata` from the "Load test dataset" cell, exactly like the
# second and third forests. Re-reading testing.csv here was redundant, and it
# silently rebound the shared `testdata` from an ndarray to a DataFrame for every
# cell that follows.
fr_predictions4 = clf4.predict(testdata)
# Prediction performance
clf4_score = accuracy_score(fr_test.iloc[:,0], lclass_names[fr_predictions4])
print("Accuracy score for this forest: \n", clf4_score)
# Confusion Matrix for this forest
print(confusion_matrix(fr_test.iloc[:,0], lclass_names[fr_predictions4], labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for this forest: 
 0.8
[[ 75   0  10  20]
 [  0  29   0   9]
 [ 11   0  33   2]
 [  6   7   0 123]]


### Task (b)

The second random forest (n_estimators=15) has the best accuracy score. Its component trees will be analyzed in this section.

In [13]:
# How many component trees does the second forest hold?
n_component_trees = len(clf2.estimators_)
print("The length of the component trees is %s " % n_component_trees)

The length of the component trees is 15 


In [30]:
# Evaluate the 3rd component tree (index 2) of the second forest on its own
clf2_third_tree = clf2.estimators_[2]
third_tree_prediction = clf2_third_tree.predict(testdata)
# cast the tree's predictions to int64 so they can index into `lclass_names`
third_tree_predicted_labels = lclass_names[third_tree_prediction.astype(np.int64)]
# Prediction performance
third_tree_score = accuracy_score(fr_test.iloc[:, 0], third_tree_predicted_labels)
print("Accuracy score for the third component tree: \n", third_tree_score)
# Confusion matrix for this tree
print(confusion_matrix(fr_test.iloc[:, 0], third_tree_predicted_labels, labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for the third component tree: 
 0.7507692307692307
[[ 65   1  14  25]
 [  0  27   0  11]
 [  8   1  35   2]
 [  2  16   1 117]]


In [31]:
# Per-feature importances of the 3rd component tree (index 2) of the second forest
clf2_third_tree = clf2.estimators_[2]
third_tree_importances = clf2_third_tree.feature_importances_
print(third_tree_importances)

[0.         0.         0.         0.         0.         0.
 0.16315059 0.         0.32864118 0.         0.34942963 0.
 0.         0.01444022 0.14433838 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]


In [32]:
# Evaluate the 6th component tree (index 5) of the second forest on its own
clf2_sixth_tree = clf2.estimators_[5]
sixth_tree_prediction = clf2_sixth_tree.predict(testdata)
# cast the tree's predictions to int64 so they can index into `lclass_names`
sixth_tree_predicted_labels = lclass_names[sixth_tree_prediction.astype(np.int64)]
# Prediction performance
sixth_tree_score = accuracy_score(fr_test.iloc[:, 0], sixth_tree_predicted_labels)
print("Accuracy score for the sixth component tree: \n", sixth_tree_score)
# Confusion matrix for this tree
print(confusion_matrix(fr_test.iloc[:, 0], sixth_tree_predicted_labels, labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for the sixth component tree: 
 0.7876923076923077
[[ 74   2   7  22]
 [  0  25   0  13]
 [ 11   1  34   0]
 [  1  12   0 123]]


In [35]:
# Evaluate the 9th component tree (index 8) of the second forest on its own
clf2_ninth_tree = clf2.estimators_[8]
ninth_tree_prediction = clf2_ninth_tree.predict(testdata)
# cast the tree's predictions to int64 so they can index into `lclass_names`
ninth_tree_predicted_labels = lclass_names[ninth_tree_prediction.astype(np.int64)]
# Prediction performance
ninth_tree_score = accuracy_score(fr_test.iloc[:, 0], ninth_tree_predicted_labels)
print("Accuracy score for the ninth component tree: \n", ninth_tree_score)
# Confusion matrix for this tree
print(confusion_matrix(fr_test.iloc[:, 0], ninth_tree_predicted_labels, labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for the ninth component tree: 
 0.7876923076923077
[[ 79   0   2  24]
 [  1  28   0   9]
 [ 12   0  32   2]
 [ 13   6   0 117]]


In [36]:
# Evaluate the 12th component tree (index 11) of the second forest on its own
clf2_twelfth_tree = clf2.estimators_[11]
twelfth_tree_prediction = clf2_twelfth_tree.predict(testdata)
# cast the tree's predictions to int64 so they can index into `lclass_names`
twelfth_tree_predicted_labels = lclass_names[twelfth_tree_prediction.astype(np.int64)]
# Prediction performance
twelfth_tree_score = accuracy_score(fr_test.iloc[:, 0], twelfth_tree_predicted_labels)
print("Accuracy score for the twelfth component tree: \n", twelfth_tree_score)
# Confusion matrix for this tree
print(confusion_matrix(fr_test.iloc[:, 0], twelfth_tree_predicted_labels, labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for the twelfth component tree: 
 0.7723076923076924
[[ 77   4   4  20]
 [  0  29   0   9]
 [ 13   0  30   3]
 [  7  14   0 115]]


In [38]:
# Evaluate the 15th component tree (index 14) of the second forest on its own
clf2_fifteenth_tree = clf2.estimators_[14]
fifteenth_tree_prediction = clf2_fifteenth_tree.predict(testdata)
# cast the tree's predictions to int64 so they can index into `lclass_names`
fifteenth_tree_predicted_labels = lclass_names[fifteenth_tree_prediction.astype(np.int64)]
# Prediction performance
fifteenth_tree_score = accuracy_score(fr_test.iloc[:, 0], fifteenth_tree_predicted_labels)
print("Accuracy score for the fifteenth component tree: \n", fifteenth_tree_score)
# Confusion matrix for this tree
print(confusion_matrix(fr_test.iloc[:, 0], fifteenth_tree_predicted_labels, labels=['d ', 'h ', 'o ', 's ']))

Accuracy score for the fifteenth component tree: 
 0.7846153846153846
[[ 79   2   2  22]
 [  1  30   0   7]
 [ 12   0  33   1]
 [ 11  12   0 113]]


In [None]:
# See if any component tree has better performance than the forest:
# print the stand-alone test accuracy (rounded to 3 decimals) of every tree in the second forest.
# The next two names are not used by the loop below; they are kept so the cell still defines them.
test_set_labels_encoded = le.transform(fr_test.iloc[:,0])
testdata_size = testdata.shape[0]
# Walk the estimators themselves instead of a hard-coded range(15), so the loop
# still covers every tree if the forest is rebuilt with a different n_estimators.
for ith_tree in clf2.estimators_:
    ith_tree_prediction = ith_tree.predict(testdata)
    # cast the tree's predictions to int64 so they can index into `lclass_names`
    print(np.round(accuracy_score(fr_test.iloc[:,0], lclass_names[np.int64(ith_tree_prediction)]), 3))


### Task (c)
Find out all ```feature_importances_[]``` values of the selected trees in Task (b).  

In [55]:
# For each tree selected in Task (b), list the features whose importance is non-zero.
# The names come from the training frame's columns (`lfeature_names`) instead of a
# hand-typed copy of all 27 names, so they cannot drift out of sync with the data.
# Converting to a numpy string array allows indexing with the tuple returned by nonzero().
feature_names = np.array(lfeature_names, dtype=str)
for selected_tree in (
    clf2_third_tree,
    clf2_sixth_tree,
    clf2_ninth_tree,
    clf2_twelfth_tree,
    clf2_fifteenth_tree,
):
    print(feature_names[selected_tree.feature_importances_.nonzero()])

['b7' 'b9' 'pred_minus_obs_H_b2' 'pred_minus_obs_H_b5'
 'pred_minus_obs_H_b6']
['b7' 'b9' 'pred_minus_obs_H_b2' 'pred_minus_obs_H_b5'
 'pred_minus_obs_H_b6']
['pred_minus_obs_H_b1' 'pred_minus_obs_H_b2' 'pred_minus_obs_H_b7'
 'pred_minus_obs_H_b8' 'pred_minus_obs_H_b9' 'pred_minus_obs_S_b7']
['b3' 'b4' 'b6' 'pred_minus_obs_H_b2']
['b1' 'b4' 'b5' 'b9' 'pred_minus_obs_H_b8' 'pred_minus_obs_S_b5']


In [56]:
# Full importance vectors of the five selected trees, one per printed array
for selected_tree in (
    clf2_third_tree,
    clf2_sixth_tree,
    clf2_ninth_tree,
    clf2_twelfth_tree,
    clf2_fifteenth_tree,
):
    print(selected_tree.feature_importances_)

[0.         0.         0.         0.         0.         0.
 0.16315059 0.         0.32864118 0.         0.34942963 0.
 0.         0.01444022 0.14433838 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]
[0.         0.         0.         0.         0.         0.
 0.24076416 0.         0.35391447 0.         0.06891281 0.
 0.         0.32011285 0.01629571 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.34855784 0.33227218 0.
 0.         0.         0.         0.01963593 0.25909322 0.01354952
 0.         0.         0.         0.         0.         0.
 0.02689132 0.         0.        ]
[0.         0.         0.35709851 0.06657728 0.         0.24605302
 0.         0.         0.         0.         0.33027119 0.
 0.         0.         0.         0.         0.      

### Task (d)
Naïve Bayesian classifier

In [58]:
# Test features: every column of the test frame except the first (label) column
testdata = fr_test.iloc[:, 1:]
# Build a Gaussian Naive Bayes classifier and train it on the training features / encoded labels
nb = GaussianNB()
nb.fit(fr_train.iloc[:, 1:], labels)
# Predict the test set
NB_prediction = nb.predict(testdata)


In [61]:
# Accuracy of the Naive Bayes predictions, decoded back to label strings first
NB_predicted_labels = lclass_names[NB_prediction]
NB_accuracy_score = accuracy_score(fr_test.iloc[:, 0], NB_predicted_labels)
print("NB accuracy score: %f" % NB_accuracy_score)

NB accuracy score: 0.803077


In [63]:
# Confusion matrix of the Naive Bayes classifier, classes in the order d, h, o, s
NB_confusion = confusion_matrix(
    fr_test.iloc[:, 0],
    lclass_names[NB_prediction],
    labels=['d ', 'h ', 'o ', 's '],
)
print(NB_confusion)

[[ 81   0  11  13]
 [  0  30   0   8]
 [  8   0  37   1]
 [ 11  12   0 113]]
