# CS3481 Assignment 1

Tasks:  
   (1) Use sklearn to build a classification tree, trying different parameters for the decision tree constructor.  
   (2) Use graphviz to visualize each decision tree model, and compare their performance (accuracy and confusion matrix).


Use the following commands to import the required modules from sklearn

In [1]:
from typing import List, Tuple
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix
import graphviz

## Preparing Dataset
The dataset I use is the [UCI Forest Dataset](https://archive.ics.uci.edu/ml/datasets/Forest+type+mapping). I have downloaded its training dataset and testing dataset as "training.csv" and "testing.csv". These two csv files must be in the same directory as this notebook.

In [2]:
# Load the training split from its CSV file into a DataFrame.
fr_dataset = pd.read_csv("training.csv", sep=',')
# Second name bound to the very same DataFrame object (not a copy).
dftrain = fr_dataset

In [3]:

# access the entries using dataframe.iloc
def select_distinct(col_idx: int) -> List[str]:
    """Return the distinct values found in one column of ``fr_dataset``.

    The column is addressed by position through ``DataFrame.iloc``.

    Args:
        col_idx: Zero-based column position. Negative positions are
            rejected rather than counted from the end.

    Returns:
        The distinct values of the column in order of first appearance,
        or an empty list (after printing a notice) when ``col_idx`` is
        outside the table.
    """
    # shape[1] is the column count and, unlike len(iloc[0, :]), does not
    # fail when the table has no rows.
    if not 0 <= col_idx < fr_dataset.shape[1]:
        print("Your column index out of bound")
        return []
    # dict.fromkeys de-duplicates while keeping a reproducible order.
    return list(dict.fromkeys(fr_dataset.iloc[:, col_idx]))


select_distinct(0)

['d ', 'o ', 'h ', 's ']

In [4]:
# Preprocessing - turn the alphabetic class labels into integer codes
from sklearn import preprocessing

# The labels to encode are read from the first column of the training table.
training_labels = fr_dataset.iloc[:, 0]

# The fitted encoder is kept so the codes can be mapped back to labels later.
le = preprocessing.LabelEncoder()
labels = le.fit_transform(training_labels)

## Model training

Try out different parameters for the Decision Tree constructor.  

To remove randomness, set ```random_state``` to a value other than `None` in the class constructor below; otherwise you will get different results on each run.

In [15]:
# Candidate configurations: exactly one `clf = ...` assignment is active.
# Classification Tree A
#clf = tree.DecisionTreeClassifier(max_depth=3)
# Classification Tree B (active; the random seed is fixed)
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    max_features=None,
    random_state=2323,
)
# Classification Tree C
# clf = tree.DecisionTreeClassifier(max_depth=3, criterion='gini', max_features='sqrt')
# Classification Tree D
# clf = tree.DecisionTreeClassifier(max_depth=4)
# Classification Tree E
# clf = tree.DecisionTreeClassifier(max_depth=2)
# Classification Tree F
# clf = tree.DecisionTreeClassifier(max_depth=3, criterion='entropy', min_impurity_decrease=0.85)

# Train on every column after the first, with the encoded labels as targets.
clf = clf.fit(fr_dataset.iloc[:, 1:], labels)

In [16]:
# Visualize the tree: export the fitted model as Graphviz DOT data.
# Feature names are the headers of every column after the first one.
lfeature_names = fr_dataset.columns[1:]
# The encoder keeps the real class labels in its "classes_" attribute.
lclass_names = le.classes_
print(f"lclass_names: {lclass_names}")
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=lfeature_names,
    class_names=lclass_names,
    filled=True,
    rounded=True,
    special_characters=True,
)


lclass_names: ['d ' 'h ' 'o ' 's ']


In [None]:

# Render the exported DOT data to a PNG and print the path render() returns.
graph = graphviz.Source(dot_data)
filepath = graph.render(filename='Forest_Dataset2', format='png')
print(filepath)


## Model evaluation

In [54]:
# Apply the decision tree to the test dataset.
fr_testset = pd.read_csv("testing.csv", sep=',')
# Same column layout as in training: everything after the first column.
fr_testdata = fr_testset.iloc[:, 1:]
fr_prediction = clf.predict(fr_testdata)
# predict() yields the encoder's integer codes. Decode them with the fitted
# LabelEncoder itself, so this cell does not rely on `lclass_names`, which
# only exists once the visualization cell has been run.
prediction_results = le.inverse_transform(fr_prediction)

In [55]:
# Evaluate the prediction score: first test column versus the decoded predictions.
clf1_score = accuracy_score(
    y_true=fr_testset.iloc[:, 0],
    y_pred=prediction_results,
)
print("Accuracy score for this Tree: \n", clf1_score)


Accuracy score for this Tree: 
 0.7846153846153846


In [56]:
# See the Confusion Matrix for this tree (rows: true class, columns: predicted).
# The row/column order comes from the fitted encoder instead of a hard-coded
# list, so it always matches the labels the tree was trained on.
print(confusion_matrix(fr_testset.iloc[:, 0], prediction_results, labels=le.classes_))

[[ 73   0  12  20]
 [  0  29   0   9]
 [ 10   1  34   1]
 [  5  12   0 119]]


## Utility functions and test runs

In [35]:
def find_outliner(haystack, needle_criteria):
    """Return the first row of ``haystack`` accepted by ``needle_criteria``.

    Args:
        haystack: pandas DataFrame to scan, e.g. the test dataset.
        needle_criteria: Callable that receives one row (a pandas Series)
            and returns a truthy value when that row is the one wanted,
            e.g. a row that the tree misclassifies.

    Returns:
        The first accepted row as a pandas Series, or ``None`` when no row
        is accepted. When every row was scanned without a match,
        ``Exit at row <n>`` is printed first, ``n`` being the number of
        rows scanned.
    """
    rows_scanned = 0
    # A for loop ends cleanly on an empty table; calling next() by hand on
    # the row iterator raised StopIteration in that case.
    for _, row_data in haystack.iterrows():
        if row_data.empty:
            # A row without any columns cannot be tested: stop silently.
            return None
        if needle_criteria(row_data):
            return row_data
        rows_scanned += 1
    # Counting rows (rather than adding 1 to the last index label) also
    # works when the index is not integer-valued.
    print("Exit at row", rows_scanned)
    return None

In [60]:
# misclassification Type (1)
# def instance_d_mis_o_criteria(datarow) -> bool:
#     is_all_cond_met = False
#     if datarow.loc['class'] == 'd ':
#         if datarow.loc['b2'] > 35.5:
#             if datarow.loc['pred_minus_obs_H_b8'] <= 0.475:
#                 if datarow.loc['pred_minus_obs_S_b3'] <= -1.83:
#                     is_all_cond_met = True
#     return is_all_cond_met
# instance_d_mis_o =  find_outliner(fr_testset, instance_d_mis_o_criteria)# instance of 'd' misclassified as 'o'
# def instance_d_mis_s_criteria(datarow)->bool:
#     is_all_cond_met = False
#     if datarow.loc['class'] == 'd ':
#         if datarow.loc['b2'] <= 35.5:
#             if datarow.loc['pred_minus_obs_H_b1'] <= 41.39:
#                 if datarow.loc['pred_minus_obs_S_b1'] > -0.76:
#                     is_all_cond_met = True
#             else:
#                 if datarow.loc['pred_minus_obs_H_b7'] > -17.815:
#                     is_all_cond_met = True
#     return is_all_cond_met
# instance_d_mis_s = find_outliner(fr_testset, instance_d_mis_s_criteria)
def instance_h_mis_s_criteria(datarow):
    """Tell whether a row of class 'h ' satisfies the threshold path below.

    Judging by its name, the path is the one on which the tree labels the
    row 's ' (NOTE(review): confirm against the rendered tree).

    Args:
        datarow: One dataset row supporting label lookup through ``.loc``.

    Returns:
        ``True`` when the row's class is 'h ' and every threshold test on
        the path holds, ``False`` otherwise.
    """
    # Only rows whose true class is 'h ' are of interest.
    if datarow.loc['class'] != 'h ':
        return False
    # Written as "not <=" so a non-comparable value is rejected exactly as
    # the original nested test rejected it.
    if not datarow.loc['b2'] <= 35.5:
        return False
    # The next split decides which of the two remaining thresholds applies.
    if datarow.loc['pred_minus_obs_H_b1'] <= 41.39:
        return bool(datarow.loc['pred_minus_obs_S_b1'] > -0.76)
    return bool(datarow.loc['pred_minus_obs_H_b7'] > -17.815)

# Scan the test set for the first 'h ' row that meets the criteria above.
# find_outliner returns None when no row qualifies, so None may be printed.
instance_h_mis_s = find_outliner(fr_testset, instance_h_mis_s_criteria)
print(instance_h_mis_s)

Exit at row 325
None


In [44]:
# Show the feature (column) names the classifier recorded when it was fitted.
clf.feature_names_in_

array(['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8', 'b9',
       'pred_minus_obs_H_b1', 'pred_minus_obs_H_b2',
       'pred_minus_obs_H_b3', 'pred_minus_obs_H_b4',
       'pred_minus_obs_H_b5', 'pred_minus_obs_H_b6',
       'pred_minus_obs_H_b7', 'pred_minus_obs_H_b8',
       'pred_minus_obs_H_b9', 'pred_minus_obs_S_b1',
       'pred_minus_obs_S_b2', 'pred_minus_obs_S_b3',
       'pred_minus_obs_S_b4', 'pred_minus_obs_S_b5',
       'pred_minus_obs_S_b6', 'pred_minus_obs_S_b7',
       'pred_minus_obs_S_b8', 'pred_minus_obs_S_b9'], dtype=object)