In [30]:
from helperfunctions import train_test_split
from collections import Counter
import numpy as np

In [31]:
# Not allowed to use pandas (presumably an exercise constraint on the classifier
# code itself -- TODO confirm). pandas is imported here only to read the CSV into `df`.
import pandas as pd
df = pd.read_csv("Iris.csv")
# Remove the `pd` name right away so later cells cannot call pandas through it;
# `df` itself is still a pandas DataFrame and keeps working.
del pd

In [32]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [33]:
# `Id` is a row identifier, not a measurement. Left in as a feature it will mess up
# the accuracy (the rows appear to be ordered by species, so it would leak the label),
# so we get rid of it.
# Rebinding with errors="ignore" instead of inplace=True keeps this cell safe to
# re-run: a second execution no longer raises KeyError because `Id` is already gone.
df = df.drop(columns="Id", errors="ignore")

In [34]:
df.shape

(150, 5)

In [35]:
df.tail()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [36]:
class DecisionTreeClassifier:
    '''Entropy-based decision tree classifier for numeric features.

    The classifier is fitted on a dataframe whose LAST column holds the class
    labels and whose other columns are numeric features. The data must be free
    of missing values; if there are missing values, please impute first.

    The fitted tree is a nested dictionary of the form
    {"<feature name> <= <threshold>": [answer_if_true, answer_if_false]}
    where each answer is either another such dictionary or a class label.
    When the training data cannot be split at all, the "tree" is a bare label.
    Feature names are looked up as strings at prediction time.

    Attributes:
        min_samples: nodes with fewer rows than this are turned into leaves.
        max_depth: maximum number of nested questions.
        tree: the fitted tree (None until fit() is called or a tree is passed in).
        append: stored answer ('Y'/'y'/'N'/'n') to "append predictions to the
            input dataframe?"; None means predict() will ask via input().
        feature_names: column index of the training dataframe (set while fitting).
        label_name: name of the training label column (set while fitting).
    '''

    # accepted answers to the "append predictions?" question
    _YES = ('Y', 'y')
    _NO = ('N', 'n')

    def __init__(self, min_samples=2, max_depth=5, tree=None, append=None):
        '''Store the hyper-parameters.

        Args:
            min_samples: minimum number of rows a node needs to be split further.
            max_depth: maximum depth of the tree.
            tree: optional already-built tree to predict with.
            append: optional preset answer for predict() ('Y'/'y' appends the
                prediction columns to the input dataframe, 'N'/'n' works on a
                copy). When None (or any other value), predict() asks the user.
        '''
        self.max_depth = max_depth
        self.min_samples = min_samples

        # honour the constructor arguments (they used to be silently discarded)
        self.tree = tree
        self.append = append

        # filled in by decision_tree() on its root call
        self.feature_names = None
        self.label_name = None

    def fit(self, train_df):
        '''Build the tree from train_df (labels in the last column) and return self.'''
        self.tree = self.decision_tree(train_df, self.min_samples, self.max_depth)
        return self

    def predict(self, test_df):
        '''Classify every row of test_df and report the accuracy.

        The true labels are read from the column that had the label's name at
        fit time; if that column is not present, the last column is used.

        Args:
            test_df: dataframe with the feature columns used for fitting plus
                the true labels.

        Returns:
            The tuple ("Accuracy:", fraction of correct predictions), or None
            (after printing a message) when no tree has been fitted.

        Side effects:
            Asks the user through input() when no valid answer is stored in
            self.append. With 'Y'/'y' the columns 'predictions' and
            'correct_predictions' are added to test_df itself; with 'N'/'n'
            they are added to a copy and test_df is left untouched.
        '''
        # `is None` instead of truthiness: a fitted tree may be a falsy leaf label such as 0
        if self.tree is None:
            print("NotFittedError: Please fit your decision tree first!")
            return None

        # ask until we have a valid answer; the answer is remembered on the instance
        while self.append not in self._YES + self._NO:
            self.append = input('Append predictions as new column in input dataframe? Y or N')
            if self.append not in self._YES + self._NO:
                print("Not a valid input. Try again you silly goose.")

        # decide about the copy on EVERY call, not only on the call that prompted,
        # otherwise a remembered 'N' would still mutate the caller's dataframe
        if self.append in self._NO:
            test_df = test_df.copy()

        # prefer the label column by name: after a previous appending call the
        # last column is 'correct_predictions', not the labels
        label_name = getattr(self, 'label_name', None)
        if label_name is not None and label_name in test_df.columns:
            y_test_true = test_df[label_name]
        else:
            y_test_true = test_df.iloc[:, -1]

        # create a new column for our predictions
        test_df['predictions'] = test_df.apply(self.classify_observation, axis=1, args=(self.tree,))

        # True or False where our predictions are equal to our labels
        test_df['correct_predictions'] = test_df['predictions'] == y_test_true

        # return how accurate the predictions are
        return ("Accuracy:", test_df['correct_predictions'].mean())

    def entropy(self, data):
        '''Return the entropy (in bits) of the labels in the last column of data.'''
        labels = data[:, -1]

        # counts of each distinct label
        _, label_counts = np.unique(labels, return_counts=True)

        # array of probabilities of each label
        probabilities = label_counts / label_counts.sum()

        return sum(probabilities * -np.log2(probabilities))

    def overall_entropy(self, data_below_threshold, data_above_threshold):
        '''Return the size-weighted average entropy of the two sides of a split.'''
        n_below = len(data_below_threshold)
        n_above = len(data_above_threshold)
        n_total = n_below + n_above

        p_below = n_below / n_total
        p_above = n_above / n_total

        return (p_below * self.entropy(data_below_threshold)) + (p_above * self.entropy(data_above_threshold))

    def purity(self, data):
        '''Return True when data (labels in the last column) has exactly one distinct label.'''
        labels = data[:, -1]
        return len(np.unique(labels)) == 1

    def make_classification(self, data):
        '''Return the most common label in the last column of data.

        Used once the max depth or min samples is reached or the node is pure.
        On ties the label that Counter saw first wins.
        '''
        labels = data[:, -1]
        return Counter(labels).most_common(1)[0][0]

    def split_data(self, data, split_feature, split_threshold):
        '''Split data on one feature column.

        Returns:
            (rows where the feature is <= split_threshold,
             rows where the feature is > split_threshold)
        '''
        feature_values = data[:, split_feature]
        return data[feature_values <= split_threshold], data[feature_values > split_threshold]

    def potential_splits(self, data):
        '''Return the candidate thresholds for every feature column.

        Returns:
            A dictionary {feature index: [midpoints between consecutive unique
            values of that feature]}. A feature with a single unique value gets
            an empty list. The last column (the labels) is not considered.
        '''
        potential_splits = {}

        # every column except the last one is a feature
        n_features = data.shape[1] - 1

        for feature in range(n_features):
            # np.unique returns the distinct values in sorted order
            unique_values = np.unique(data[:, feature])

            # the midpoint between each pair of neighbouring values is a candidate threshold
            potential_splits[feature] = [
                (current + prev) / 2
                for prev, current in zip(unique_values[:-1], unique_values[1:])
            ]

        return potential_splits

    def find_best_split(self, data, potential_splits):
        '''Return the (feature index, threshold) with the lowest overall entropy.

        On ties the first candidate encountered wins. Returns (None, None) when
        potential_splits contains no candidate threshold at all, which happens
        when every feature has a single unique value.
        '''
        best_split_feature = None
        best_split_threshold = None
        lowest_entropy = float('inf')

        for feature, thresholds in potential_splits.items():
            for threshold in thresholds:
                data_below_threshold, data_above_threshold = self.split_data(
                    data=data,
                    split_feature=feature,
                    split_threshold=threshold)

                entropy_for_this_split = self.overall_entropy(data_below_threshold, data_above_threshold)

                # strictly lower, so that the earliest best candidate is kept
                if entropy_for_this_split < lowest_entropy:
                    lowest_entropy = entropy_for_this_split
                    best_split_feature = feature
                    best_split_threshold = threshold

        return best_split_feature, best_split_threshold

    def decision_tree(self, train_df, min_samples, max_depth, counter=0):
        '''Recursively build the tree.

        Args:
            train_df: on the root call (counter == 0) a dataframe with the
                labels in the last column; on recursive calls the array of
                rows that reached this node.
            min_samples: nodes with fewer rows than this become leaves.
            max_depth: depth at which nodes become leaves.
            counter: current depth; callers normally leave it at 0.

        Returns:
            A class label (leaf) or a dictionary
            {"<feature name> <= <threshold>": [answer_if_true, answer_if_false]}.
        '''
        if counter == 0:
            # remember the column names on the instance (not in a module-level
            # global) so that several classifiers cannot interfere with each other
            self.feature_names = train_df.columns
            self.label_name = train_df.columns[-1]
            data = train_df.values
        else:
            # our 'train_df' is actually already an array upon recursion
            data = train_df

        # base case: the node is pure, too small, or we reached the maximum depth
        if self.purity(data) or (len(data) < min_samples) or (counter == max_depth):
            return self.make_classification(data)

        counter += 1

        potential_splits = self.potential_splits(data)
        split_feature, split_threshold = self.find_best_split(data, potential_splits)

        # no feature can separate the rows (all feature values identical):
        # fall back to the majority label instead of crashing
        if split_feature is None:
            return self.make_classification(data)

        data_below_threshold, data_above_threshold = self.split_data(data, split_feature, split_threshold)

        # feature_name <= threshold value
        feature_name_as_string = self.feature_names[split_feature]
        split_question = f"{feature_name_as_string} <= {split_threshold}"

        # recursion on the rows that answer the question with yes, then with no
        answer_true = self.decision_tree(data_below_threshold, min_samples, max_depth, counter)
        answer_false = self.decision_tree(data_above_threshold, min_samples, max_depth, counter)

        # both branches give the same answer: the question is pointless, collapse it
        if answer_true == answer_false:
            return answer_true

        return {split_question: [answer_true, answer_false]}

    def classify_observation(self, observation, tree):
        '''Walk the tree for one observation and return the predicted label.

        Args:
            observation: a row that can be indexed by feature name.
            tree: a tree as produced by decision_tree(); a bare label is
                returned unchanged.
        '''
        node = tree

        # descend until we hit something that is not a question dictionary
        while isinstance(node, dict):
            split_question = next(iter(node))

            # split on the LAST ' <= ' so feature names may contain whitespace
            feature_name, value = split_question.rsplit(' <= ', 1)

            answer_true, answer_false = node[split_question]
            if observation[feature_name] <= float(value):
                node = answer_true
            else:
                node = answer_false

        return node

In [37]:
# Split our dataframe into a train and a test set with the project helper
# `helperfunctions.train_test_split` (its split ratio and shuffling are not visible here).
# NOTE(review): the shapes displayed below add up to 121 + 30 = 151 rows while `df`
# has 150 rows -- confirm the helper does not place one row in both sets.
train_df, test_df = train_test_split(df)
train_df.shape, test_df.shape

((121, 5), (30, 5))

In [38]:
# Instantiate the class
# max_depth=4 overrides the constructor default of 5; min_samples=2 is the default spelled out.
dtc = DecisionTreeClassifier(min_samples=2, max_depth=4)

In [39]:
# Fit a train df
# fit() returns the classifier itself, which is why the object repr is displayed below.
dtc.fit(train_df)

<__main__.DecisionTreeClassifier at 0x118bc4590>

In [40]:
# Predict our test df values
# predict() scores its predictions against the labels in test_df and returns
# ("Accuracy:", fraction correct). This classifier has no stored answer yet, so the
# call blocks on an interactive input() prompt (Y/N: append the predictions to test_df?),
# which means the cell cannot run unattended.
dtc.predict(test_df)

Append predictions as new column in input dataframe? Y or N n


('Accuracy:', 0.9666666666666667)

In [41]:
# Check that test_df looks the same
test_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
92,5.8,2.6,4.0,1.2,Iris-versicolor
148,6.2,3.4,5.4,2.3,Iris-virginica
5,5.4,3.9,1.7,0.4,Iris-setosa
107,7.3,2.9,6.3,1.8,Iris-virginica
66,5.6,3.0,4.5,1.5,Iris-versicolor


In [42]:
# This time we want to append our predictions so we can look at our data better
# A fresh classifier is needed: predict() only asks the Y/N question while the
# instance has no valid stored answer, and `dtc` already remembers the 'n' from above.
dtc_with_appending = DecisionTreeClassifier(min_samples=2, max_depth=4)
dtc_with_appending.fit(train_df)
# Answering 'y' makes predict() add the 'predictions' and 'correct_predictions'
# columns to test_df itself.
dtc_with_appending.predict(test_df)

Append predictions as new column in input dataframe? Y or N y


('Accuracy:', 0.9666666666666667)

In [43]:
test_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,predictions,correct_predictions
92,5.8,2.6,4.0,1.2,Iris-versicolor,Iris-versicolor,True
148,6.2,3.4,5.4,2.3,Iris-virginica,Iris-virginica,True
5,5.4,3.9,1.7,0.4,Iris-setosa,Iris-setosa,True
107,7.3,2.9,6.3,1.8,Iris-virginica,Iris-virginica,True
66,5.6,3.0,4.5,1.5,Iris-versicolor,Iris-versicolor,True
