In [1]:
from helperfunctions import train_test_split
from collections import Counter
import numpy as np

In [2]:
# pandas is imported only to read the CSV; the name `pd` is deleted right after,
# so later cells cannot call pd.* directly (df itself remains a pandas DataFrame).
# NOTE(review): the original remark here was "Not allowed to use pandas" --
# presumably a constraint on the classifier implementation; confirm.
import pandas as pd
df = pd.read_csv("Iris.csv")
del pd

In [3]:
# Preview the first rows to check that the CSV loaded with the expected columns
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Id is just a row counter, not a measurement; leaving it in as a feature would
# mess up the accuracy, so we get rid of it.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op instead of raising KeyError.
df = df.drop(columns='Id', errors='ignore')

In [5]:
# Confirm the frame's dimensions after dropping the Id column
df.shape

(150, 5)

In [6]:
# Look at the last rows; the label (Species) must be the final column for the classifier
df.tail()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [9]:
class DecisionTreeClassifier:
    '''Entropy-based binary decision tree for numeric features.

    Can handle numeric features and dataframes that are free of missing data.
    If missing values, please impute first. The label/target must be the LAST
    column of every dataframe passed to fit() or predict().

    The fitted tree is a nested dict of the form
    {"<feature name> <= <threshold>": [subtree_if_true, subtree_if_false]}
    whose leaves are class labels. A tree that never splits is a bare label.
    '''

    # columns written by predict(); skipped when locating the label column
    _RESULT_COLUMNS = ('predictions', 'correct_predictions')

    # separator between feature name and threshold inside a split question
    _QUESTION_SEPARATOR = ' <= '

    def __init__(self, min_samples=2, max_depth=5, tree=None, append=None):
        '''Store the hyperparameters.

        min_samples: a node with fewer rows than this becomes a leaf.
        max_depth:   maximum number of splits along any path.
        tree:        optional already-built tree (see class docstring).
        append:      optional answer ('Y'/'y'/'N'/'n') to the question predict()
                     would otherwise ask interactively.
        '''
        self.max_depth = max_depth
        self.min_samples = min_samples

        # honour the constructor arguments (they used to be silently discarded)
        self.tree = tree
        self.append = append

        # column names of the training frame, set when a tree is grown
        self.feature_names = None

    def fit(self, train_df):
        '''Grow a tree from train_df (label in the last column) and return self.'''
        self.tree = self.decision_tree(train_df, self.min_samples, self.max_depth)
        return self

    def predict(self, test_df):
        '''Classify every row of test_df and report the accuracy.

        The true labels are read from the last column of test_df that was not
        written by an earlier predict() call. If self.append is not already a
        valid answer, the user is asked whether the 'predictions' and
        'correct_predictions' columns should be added to test_df itself;
        the answer is remembered on the instance.

        Returns the tuple ("Accuracy:", fraction_correct), or None (after
        printing a message) when no tree has been fitted.
        '''
        # `is None` rather than truthiness: a fitted tree may be a bare label such as 0
        if self.tree is None:
            print("NotFittedError: Please fit your decision tree first!")
            return None

        # find the label column, ignoring result columns left by a previous run
        label_positions = [position for position, column in enumerate(test_df.columns)
                           if column not in self._RESULT_COLUMNS]
        label_position = label_positions[-1] if label_positions else -1
        y_test_true = test_df.iloc[:, label_position]

        valid_answers = ('Y', 'y', 'N', 'n')
        while self.append not in valid_answers:
            # ask the user if they want the DTC's predictions appended
            self.append = input('Append predictions as new column in input dataframe? Y or N')
            if self.append not in valid_answers:
                print("Not a valid input. Try again you silly goose.")

        # Copy outside the prompt loop so that a remembered (or constructor-supplied)
        # 'N' also protects the caller's frame on every later call.
        if self.append in ('N', 'n'):
            test_df = test_df.copy()

        # create a new column for our predictions
        test_df['predictions'] = test_df.apply(self.classify_observation, axis=1, args=(self.tree,))

        # True where the prediction matches the label
        test_df['correct_predictions'] = test_df['predictions'] == y_test_true

        # return how accurate the predictions are
        return ("Accuracy:", test_df['correct_predictions'].mean())

    def entropy(self, data):
        '''Shannon entropy (base 2) of the labels in the last column of data.'''
        labels = data[:, -1]

        # get unique labels along with their counts
        _, label_counts = np.unique(labels, return_counts=True)

        # array of probabilities of each label
        probabilities = label_counts / label_counts.sum()

        return sum(probabilities * -np.log2(probabilities))

    def information_gain(self, data, data_below_threshold, data_above_threshold):
        '''Parent entropy minus the size-weighted entropy of the two children.'''
        parent_entropy = self.entropy(data)

        weight_below_threshold = len(data_below_threshold) / len(data)
        weight_above_threshold = len(data_above_threshold) / len(data)

        children_entropy = (
            weight_below_threshold * self.entropy(data_below_threshold)
            + weight_above_threshold * self.entropy(data_above_threshold)
        )

        return parent_entropy - children_entropy

    def purity(self, data):
        '''Return True when every row of data carries the same label (last column).'''
        labels = data[:, -1]
        return len(np.unique(labels)) == 1

    def make_classification(self, data):
        '''Once the max depth or min samples or purity is 1,
        we classify the data with whatever the majority of the labels are'''
        labels = data[:, -1]

        # ties go to the label encountered first (Counter keeps insertion order)
        return Counter(labels).most_common(1)[0][0]

    def split_data(self, data, split_feature, split_threshold):
        '''Split data on column split_feature.

        Returns (rows with value <= threshold, rows with value > threshold).
        '''
        feature_values = data[:, split_feature]
        return data[feature_values <= split_threshold], data[feature_values > split_threshold]

    def potential_splits(self, data):
        '''Return {feature index: [candidate thresholds]}.

        Candidates are the midpoints between consecutive unique values of each
        feature column; the last column (labels) is excluded.
        '''
        potential_splits = {}

        # number of columns, including the label column
        n_columns = data.shape[1]

        for feature in range(n_columns - 1):
            # np.unique returns the values sorted, so neighbours are consecutive
            unique_values_for_feature = np.unique(data[:, feature])

            potential_splits[feature] = [
                (current + prev) / 2
                for prev, current in zip(unique_values_for_feature[:-1],
                                         unique_values_for_feature[1:])
            ]

        return potential_splits

    def find_best_split(self, data, potential_splits):
        '''Return (feature index, threshold) of the split with the highest information gain.

        Returns (None, None) when no candidate yields a strictly positive gain
        (this used to raise UnboundLocalError).
        '''
        best_info_gain = 0
        best_split_feature = None
        best_split_threshold = None

        for key in potential_splits:
            for value in potential_splits[key]:

                # split our data on that threshold (value)
                data_below_threshold, data_above_threshold = self.split_data(
                    data=data,
                    split_feature=key,
                    split_threshold=value)

                info_gain_for_this_split = self.information_gain(
                    data, data_below_threshold, data_above_threshold)

                # strict '>' keeps the first candidate among equally good ones
                if info_gain_for_this_split > best_info_gain:
                    best_info_gain = info_gain_for_this_split
                    best_split_feature = key
                    best_split_threshold = value

        return best_split_feature, best_split_threshold

    def decision_tree(self, train_df, min_samples, max_depth, counter=0):
        '''Recursively build the tree.

        On the first call (counter == 0) train_df is a dataframe whose last
        column is the label; on recursive calls it is already a numpy array.
        Returns a nested dict (see class docstring) or a bare label for a leaf.
        '''
        if counter == 0:
            # remember the column names on the instance instead of in a module global
            self.feature_names = train_df.columns
            data = train_df.values
        else:
            # our 'train_df' is actually already an array upon recursion
            data = train_df

        # base case: pure node, too few samples, or maximum depth reached
        if self.purity(data) or (len(data) < min_samples) or (counter == max_depth):
            return self.make_classification(data)

        counter += 1

        potential_splits = self.potential_splits(data)
        split_feature, split_threshold = self.find_best_split(data, potential_splits)

        # no split improves on the parent: fall back to a majority-vote leaf
        if split_feature is None:
            return self.make_classification(data)

        data_below_threshold, data_above_threshold = self.split_data(
            data, split_feature, split_threshold)

        # "feature_name <= threshold value"
        feature_name_as_string = self.feature_names[split_feature]
        split_question = f"{feature_name_as_string} <= {split_threshold}"

        # recursion on the rows that answer the question with yes / no
        answer_true = self.decision_tree(data_below_threshold, min_samples, max_depth, counter)
        answer_false = self.decision_tree(data_above_threshold, min_samples, max_depth, counter)

        # if both branches give the same answer the question is redundant
        if answer_true == answer_false:
            return answer_true

        return {split_question: [answer_true, answer_false]}

    def print_tree(self):
        '''Return the fitted tree (nested dict or bare label); None if not fitted.'''
        return self.tree

    def classify_observation(self, observation, tree):
        '''Walk tree for a single row and return the predicted label.

        observation must support lookup by feature name (e.g. a dataframe row).
        tree may be a nested dict or a bare label (a tree without any split).
        '''
        node = tree

        # descend until we reach a leaf, i.e. anything that is not a dict
        while isinstance(node, dict):
            split_question = next(iter(node))

            # split from the right so feature names containing spaces stay intact
            feature_name, value = split_question.rsplit(self._QUESTION_SEPARATOR, 1)

            answer_true, answer_false = node[split_question]
            if observation[feature_name] <= float(value):
                node = answer_true
            else:
                node = answer_false

        return node

In [10]:
# Split our dataframe into a training and a test frame using the project helper
# NOTE(review): the recorded output below shows 122 + 30 = 152 rows although df has
# 150 -- check helperfunctions.train_test_split for overlapping / duplicated rows.
train_df, test_df = train_test_split(df)
train_df.shape, test_df.shape

((122, 5), (30, 5))

In [12]:
# Instantiate the class: nodes with fewer than 2 rows become leaves, at most 4 levels of splits
dtc = DecisionTreeClassifier(min_samples=2, max_depth=4)

In [13]:
# Fit a train df (last column is the label); fit() returns the classifier itself,
# which is why the cell output is the object's repr
dtc.fit(train_df)

<__main__.DecisionTreeClassifier at 0x10e3afe90>

In [14]:
# Predict our test df values; predict() asks interactively whether to append the
# predictions to test_df and returns an ("Accuracy:", value) tuple
dtc.predict(test_df)

Append predictions as new column in input dataframe? Y or N n


('Accuracy:', 0.9333333333333333)

In [19]:
# Check that test_df looks the same: we answered "n" above, so predict() worked on
# a copy and no predictions / correct_predictions columns should appear here
test_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
144,6.7,3.3,5.7,2.5,Iris-virginica
4,5.0,3.6,1.4,0.2,Iris-setosa
32,5.2,4.1,1.5,0.1,Iris-setosa
50,7.0,3.2,4.7,1.4,Iris-versicolor
44,5.1,3.8,1.9,0.4,Iris-setosa


In [20]:
# Second run, answering "y" at the prompt so the predictions land in test_df and we
# can inspect them next to the true labels. fit() hands back the classifier itself,
# so construction and fitting are chained.
dtc_with_appending = DecisionTreeClassifier(min_samples=2, max_depth=4).fit(train_df)
dtc_with_appending.predict(test_df)

Append predictions as new column in input dataframe? Y or N y


('Accuracy:', 0.9666666666666667)

In [21]:
# We answered "y" above, so test_df now carries the predictions and
# correct_predictions columns written by predict()
test_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,predictions,correct_predictions
144,6.7,3.3,5.7,2.5,Iris-virginica,Iris-virginica,True
4,5.0,3.6,1.4,0.2,Iris-setosa,Iris-setosa,True
32,5.2,4.1,1.5,0.1,Iris-setosa,Iris-setosa,True
50,7.0,3.2,4.7,1.4,Iris-versicolor,Iris-versicolor,True
44,5.1,3.8,1.9,0.4,Iris-setosa,Iris-setosa,True


In [112]:
# pandas has to be imported again here because the Iris loading cell deleted the name `pd`
import pandas as pd
# Second dataset: speed-dating survey answers
dating = pd.read_csv('smallspeeddating.csv')

In [113]:
# Preview the raw columns before selecting and renaming them
dating.head()

Unnamed: 0.1,Unnamed: 0,GENDER,How important is attractiveness to you?,How important is sincerity to you?,How important is intelligence to you?,Import_of_Fun,Import_of_Amb,Import_of_Shar,Exp_of_Attr,Exp_of_Sinc,Exp_of_Int,Exp_of_Fun,Exp_of_Amb,Exp_of_Shar
0,0,Female,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0
1,1,Female,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0
2,2,Female,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0
3,3,Female,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0
4,4,Female,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0


In [114]:
# Keep only the survey answers (features) and move GENDER to the end, because the
# classifier treats the LAST column as the label. The two "Unnamed" index columns
# from the CSV are dropped by not being listed here.
dating = dating[['How important is attractiveness to you?', 
                'How important is sincerity to you?',
                'How important is intelligence to you?',
                'Import_of_Fun',
                'Import_of_Amb', 
                'Import_of_Shar',
                 'Exp_of_Attr',
                 'Exp_of_Sinc',
                 'Exp_of_Int',
                 'Exp_of_Fun',
                 'Exp_of_Amb',
                 'Exp_of_Shar',
                 'GENDER'
                ]]

In [115]:
# Take an explicit copy so the column assignments in the next cells act on an
# independent frame (presumably to avoid pandas' SettingWithCopyWarning -- confirm)
dating = dating.copy()

In [116]:
# Encode the label numerically: Female -> 0, Male -> 1.
# Gotcha: this cell is not safe to run twice -- Series.map turns every value that is
# not a key of the dict into NaN, so a second run would wipe the already-encoded labels.
dating['GENDER'] = dating['GENDER'].map({'Female':0,'Male':1})

In [117]:
# Give the columns names without spaces: the classifier's split questions are parsed
# with str.split(), so a feature name containing spaces would break classification.
# NOTE(review): Import_of_Shar, Exp_of_Sinc, Exp_of_Fun and Exp_of_Amb are not in
# this mapping and keep their abbreviated names -- confirm whether that is intended.
dating.rename(columns={'How important is attractiveness to you?':'Importance_of_Attractiveness',
                      'How important is sincerity to you?':'Importance_of_Sincerity',
                      'How important is intelligence to you?':'Importance_of_Intelligence',
                      'Import_of_Fun':'Importance_of_Fun',
                      "Import_of_Amb": 'Importance_of_Ambition',
                      "Exp_of_Attr": "Expectations_of_Attractiveness",
                      "Exp_of_Int":"Expectations_of_Intelligence",
                      "Exp_of_Shar": "Expectations_of_Shared_Interests"},
             inplace=True)

In [118]:
# Check the renamed columns and the numeric GENDER label in the last position
dating.head()

Unnamed: 0,Importance_of_Attractiveness,Importance_of_Sincerity,Importance_of_Intelligence,Importance_of_Fun,Importance_of_Ambition,Import_of_Shar,Expectations_of_Attractiveness,Exp_of_Sinc,Expectations_of_Intelligence,Exp_of_Fun,Exp_of_Amb,Expectations_of_Shared_Interests,GENDER
0,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,0
1,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,0
2,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,0
3,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,0
4,15.0,20.0,20.0,15.0,15.0,15.0,35.0,20.0,15.0,20.0,5.0,5.0,0


In [119]:
# Split the dating frame into train and test parts with the same project helper used for Iris
dating_train, dating_test = train_test_split(dating)

In [120]:
# Sizes of the two parts; both must keep all 13 columns (12 features + label)
dating_train.shape, dating_test.shape

((5501, 13), (1340, 13))

In [121]:
# The classifier cannot handle missing data, so verify there are no NaNs per column
dating_train.isna().sum()

Importance_of_Attractiveness        0
Importance_of_Sincerity             0
Importance_of_Intelligence          0
Importance_of_Fun                   0
Importance_of_Ambition              0
Import_of_Shar                      0
Expectations_of_Attractiveness      0
Exp_of_Sinc                         0
Expectations_of_Intelligence        0
Exp_of_Fun                          0
Exp_of_Amb                          0
Expectations_of_Shared_Interests    0
GENDER                              0
dtype: int64

In [127]:
# New classifier for the dating data (min_samples keeps its default of 2).
# Note: this rebinds `dtc`, which previously held the Iris model.
dtc = DecisionTreeClassifier(max_depth=3)

In [128]:
# Grow the tree on the training part; GENDER (last column) is the label
dtc.fit(dating_train)

<__main__.DecisionTreeClassifier at 0x1184bedd0>

In [129]:
# Score the held-out part; answering "n" at the prompt leaves dating_test untouched
dtc.predict(dating_test)

Append predictions as new column in input dataframe? Y or N n


('Accuracy:', 0.8380597014925373)

In [130]:
# Show the fitted tree: nested dicts keyed by "feature <= threshold", each holding
# [answer if true, answer if false]; leaves are the encoded labels
dtc.print_tree()

{'Importance_of_Attractiveness <= 16.835': [{'Expectations_of_Attractiveness <= 27.5': [{'Importance_of_Intelligence <= 27.5': [0.0,
      1.0]},
    {'Expectations_of_Intelligence <= 2.5': [1.0, 0.0]}]},
  {'Expectations_of_Attractiveness <= 27.5': [1.0,
    {'Importance_of_Attractiveness <= 26.0': [0.0, 1.0]}]}]}

In [138]:
# List the column names of the test frame before relabelling GENDER for display
dating_test.columns

Index(['Importance_of_Attractiveness', 'Importance_of_Sincerity',
       'Importance_of_Intelligence', 'Importance_of_Fun',
       'Importance_of_Ambition', 'Import_of_Shar',
       'Expectations_of_Attractiveness', 'Exp_of_Sinc',
       'Expectations_of_Intelligence', 'Exp_of_Fun', 'Exp_of_Amb',
       'Expectations_of_Shared_Interests', 'GENDER'],
      dtype='object')

In [142]:
# Lower-case the label column name for presentation; this mutates dating_test in
# place, so any later predict() on it sees the new column name
dating_test.rename(columns={"GENDER": "gender"}, inplace=True)

In [144]:
# Decode the label back to readable strings for display.
# Gotcha: not safe to run twice -- on a second run the values are already strings,
# none of them is a key of the dict, and Series.map would turn them all into NaN.
dating_test['gender'] = dating_test['gender'].map({0:'Female', 1:'Male'})

In [149]:
# Show the last ten test rows for two of the features next to the decoded label
dating_test[["Importance_of_Attractiveness", "Importance_of_Sincerity", "gender"]].tail(10)

Unnamed: 0,Importance_of_Attractiveness,Importance_of_Sincerity,gender
2998,10.0,47.0,Female
4402,20.0,20.0,Female
865,17.0,17.0,Female
4062,25.0,20.0,Male
6317,23.0,14.0,Male
1862,20.0,20.0,Female
6370,50.0,0.0,Male
5214,20.0,20.0,Male
4000,50.0,10.0,Male
6505,15.0,25.0,Male
