# Decision Tree
July 26, 2021 

## Load the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the golf dataset from a CSV file located in the working directory.
data = pd.read_csv("golf_data.csv")
print(data.shape)
# Bare last expression so the notebook renders the frame.
data

(14, 5)


Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


In [3]:
# Feature matrix: every column of `data` except the last one.
X = data.iloc[:, :-1]
X

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,Rainy,Hot,High,False
1,Rainy,Hot,High,True
2,Overcast,Hot,High,False
3,Sunny,Mild,High,False
4,Sunny,Cool,Normal,False
5,Sunny,Cool,Normal,True
6,Overcast,Cool,Normal,True
7,Rainy,Mild,High,False
8,Rainy,Cool,Normal,False
9,Sunny,Mild,Normal,False


In [4]:
# Target labels: the "Play Golf" column of `data`.
y = data["Play Golf"]
y

0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: Play Golf, dtype: object

In [66]:
class Node:
    """A single node of the decision tree.

    The attributes start out empty and are filled in by the tree builder
    (``DecisionTree.make_tree`` / ``DecisionTree.split_data``).
    """

    def __init__(self):
        # Name of the feature this node splits on; stays None on leaves.
        self.feature = None
        # Maps each observed value of `feature` to the child Node for it.
        self.children = {}
        # Rows and labels assigned to this node by the tree builder
        # (None until the builder sets them).
        self.X = None
        self.y = None
        # Set to True by the builder when no split is made at this node.
        self.leaf = False

    def predict(self):
        """Return the class distribution of the labels stored on this node.

        Returns:
            dict: ``{label: relative frequency}`` computed over ``self.y``.
        """
        uni, count = np.unique(self.y.values, return_counts=True)

        # Relative frequency of each distinct label.
        p = count/self.y.shape[0]
        return {k: v for k, v in zip(uni, p)}

In [67]:
class DecisionTree:
    """Information-gain decision tree for categorical features.

    Every internal node splits on the feature with the highest information
    gain and gets one child per distinct value of that feature; the feature
    column is dropped from the data handed to the children.
    """

    def __init__(self):
        # Root Node of the fitted tree; None until fit() is called.
        self.root = None

    def entropy(self, y):
        """Return the base-2 entropy of the labels in ``y``."""
        uni, count = np.unique(y, return_counts=True)
        p = count/y.shape[0] # probability of each category
        ent = -( p*np.log2(p) ).sum()
        return ent

    def feature_entropy(self, feature, X, y):
        """Return the label entropy after splitting on ``feature``.

        This is the sum over the distinct values of ``X[feature]`` of the
        entropy of the matching labels, weighted by the share of rows that
        carry that value.
        """
        uni, count = np.unique(X[feature], return_counts=True)
        w = count/y.shape[0]
        ent = 0
        for i, val in enumerate(uni):
            ent += w[i]*self.entropy(y[ X[feature] == val ])
        return ent

    def split_data(self, feature, X, y):
        """Partition ``X``/``y`` by the values of ``feature``.

        Returns:
            dict: ``{value: Node}`` where each Node holds the matching rows
            (with the ``feature`` column removed) and their labels.
        """
        children = {}
        unique = np.unique(X[feature])

        for val in unique:
            mask = X[feature] == val
            child_node = Node()
            child_node.X = X[mask].drop([feature], axis=1)
            child_node.y = y[mask]
            children[val] = child_node
        return children

    def make_tree(self, node, X, y):
        """Recursively grow the subtree rooted at ``node`` from ``X``/``y``.

        The node becomes a leaf when the labels are pure, when no feature
        column is left, or when no feature yields a positive information gain.
        """
        # Keep the data on every node (root included) so that prediction can
        # fall back to this node's class distribution for unseen values.
        node.X = X
        node.y = y

        ent_before_split = self.entropy(y)
        features = X.columns

        # Nothing left to split on (np.argmax would fail on an empty list)
        # or nothing to gain from splitting a pure node.
        if len(features) == 0 or ent_before_split <= 0:
            node.leaf = True
            return

        # compute info gain for every feature
        info_gain = [
            ent_before_split - self.feature_entropy(feature, X, y)
            for feature in features
        ]

        # best feature (first one wins on ties)
        ix = np.argmax(info_gain)

        if info_gain[ix] > 0:
            best_feature = features[ix]
            node.feature = best_feature
            # split data across unique value of the best feature
            node.children = self.split_data(best_feature, X, y)
            for child in node.children.values():
                self.make_tree(child, child.X, child.y)
        else:
            node.leaf = True

    def fit(self, X, y):
        """Build the tree from feature frame ``X`` and label Series ``y``."""
        self.root = Node()
        self.make_tree(self.root, X, y)

    def predict_recursive(self, node, X):
        """Walk down from ``node`` following the feature values of sample ``X``.

        ``X`` is one sample that supports ``X[feature_name]`` lookup (for
        example a row Series). Returns the class distribution of the node
        where the walk stops.
        """
        if node.leaf:
            return node.predict()

        val = X[node.feature]
        child = node.children.get(val)
        if child is None:
            # Value never seen at this node during training: answer with the
            # distribution of the training labels that reached this node.
            return node.predict()
        return self.predict_recursive(child, X)

    def predict(self, X):
        """Predict class distributions.

        Args:
            X: a single sample (e.g. a row Series) or a DataFrame of samples.

        Returns:
            A ``{label: probability}`` dict for a single sample, or a list of
            such dicts (one per row, in order) for a DataFrame.
        """
        if isinstance(X, pd.DataFrame):
            return [self.predict_recursive(self.root, row) for _, row in X.iterrows()]
        return self.predict_recursive(self.root, X)

    def print_tree_recursive(self, node, indent):
        """Print the subtree under ``node``; ``indent`` prefixes each line."""
        if node.leaf:
            print(node.predict())
            return
        print(indent+str(node.feature), "-->")
        for f_val, child in node.children.items():
            # Stay on the same line so the child's output follows the arrow.
            print(indent+str(f_val), "-->", end=" ")
            self.print_tree_recursive(child, indent+"\t")

    def print_tree(self):
        """Print the whole fitted tree starting at the root."""
        self.print_tree_recursive(self.root, indent="")


In [33]:

# Preview the subset a split on Outlook == "Rainy" would produce:
# matching rows only, the Outlook column removed, index renumbered from 0.
data[data["Outlook"] == "Rainy"].drop(["Outlook"], axis=1).reset_index(drop=True)

Unnamed: 0,Temperature,Humidity,Windy,Play Golf
0,Hot,High,False,No
1,Hot,High,True,No
2,Mild,High,False,No
3,Cool,Normal,False,Yes
4,Mild,Normal,True,Yes


In [12]:
# The first 10 rows (by position) form the training set.
X_train, y_train = X.iloc[:10], y.iloc[:10]
print(X_train.shape, y_train.shape)

(10, 4) (10,)


In [15]:
# Display the training features (uncomment y_train to inspect the labels instead).
# y_train
X_train

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,Rainy,Hot,High,False
1,Rainy,Hot,High,True
2,Overcast,Hot,High,False
3,Sunny,Mild,High,False
4,Sunny,Cool,Normal,False
5,Sunny,Cool,Normal,True
6,Overcast,Cool,Normal,True
7,Rainy,Mild,High,False
8,Rainy,Cool,Normal,False
9,Sunny,Mild,Normal,False


In [68]:
# train the dt
dt = DecisionTree()
dt.fit(X_train, y_train)
dt.print_tree()

Outlook -->
Overcast --> {'Yes': 1.0}
Rainy --> 	Temperature -->
	Cool --> {'Yes': 1.0}
	Hot --> {'No': 1.0}
	Mild --> {'No': 1.0}
Sunny --> 	Windy -->
	False --> {'Yes': 1.0}
	True --> {'No': 1.0}


In [72]:
# Training-set check: predicted class distribution next to the true label.
for i in range(X_train.shape[0]):
    x = X_train.iloc[i,:]
    # .iloc selects the label by position, exactly like the row above, so the
    # pairing does not depend on the Series index happening to be 0..n-1.
    print(dt.predict(x), y_train.iloc[i] )


{'No': 1.0} No
{'No': 1.0} No
{'Yes': 1.0} Yes
{'Yes': 1.0} Yes
{'Yes': 1.0} Yes
{'No': 1.0} No
{'Yes': 1.0} Yes
{'No': 1.0} No
{'Yes': 1.0} Yes
{'Yes': 1.0} Yes


In [73]:
# Rows from position 10 onward form the test set.
X_test, y_test = X.iloc[10:], y.iloc[10:]
print(X_test.shape, y_test.shape)

(4, 4) (4,)


In [76]:
# Renumber both test objects from 0 (the old index is discarded) so that
# positional and label-based access refer to the same rows.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [77]:
# test data
for i in range(X_test.shape[0]):
    x = X_test.iloc[i,:]
    print(dt.predict(x), y_test[i] )

{'No': 1.0} Yes
{'Yes': 1.0} Yes
{'Yes': 1.0} Yes
{'No': 1.0} No


In [8]:
# Demo: enumerate() pairs each item of an iterable with its position.
a = [10, 20 ,30 , 40]
for i, val in enumerate(a):
    print(i, val)

# The same works for a string, which is iterated character by character.
for i,char in enumerate("string"):
    print(i, char)

0 10
1 20
2 30
3 40
0 s
1 t
2 r
3 i
4 n
5 g


In [9]:
# Column labels of the feature frame X.
X.columns

Index(['Outlook', 'Temperature', 'Humidity', 'Windy'], dtype='object')