# Decision Tree

## Dependencies :

**Importing dependencies :**

In [50]:
import numpy as np
import math
import csv

**Setting dataset paths :**

In [51]:
# Relative path to the training CSV (header row first; the class label is the last column).
dTrainPath = "../datasets/mlLab/290524.csv"
# Relative path to the test CSV (same attribute columns, no label column).
dTestPath = "../datasets/mlLab/060624.csv"

In [52]:
import pandas as pd

# Load both CSVs as DataFrames so they can be displayed below.
# NOTE: the driver cell at the end of the notebook rebinds dTrain/dTest to
# plain lists read with the csv module, so these DataFrames are preview-only.
dTrain = pd.read_csv(dTrainPath)
dTest = pd.read_csv(dTestPath)

print(f"Train set :\n{dTrain}\n\nTest set :\n{dTest}")

Train set :
   weather  temp humidity    wind play_tennis
0    sunny   hot     high    weak          no
1    sunny   hot     high  strong          no
2   cloudy   hot     high    weak         yes
3     rain  mild     high    weak         yes
4     rain  cool   normal    weak         yes
5     rain  cool   normal  strong          no
6   cloudy  cool   normal  strong         yes
7    sunny  mild     high    weak          no
8    sunny  cool   normal    weak         yes
9     rain  mild   normal    weak         yes
10   sunny  mild   normal  strong         yes
11  cloudy  mild     high  strong         yes
12  cloudy   hot   normal    weak         yes
13    rain  mild     high  strong          no

Test set :
  weather temp humidity    wind
0   sunny  hot     high    weak
1   sunny  hot     high  strong
2  cloudy  hot     high    weak


## Definitions :

**Class definition :**

In [53]:
class Node:
    """One vertex of the decision tree.

    An internal node stores the name of the attribute it splits on and a list
    of ``(attribute_value, child_node)`` pairs in ``children``.  A leaf node is
    created with ``attribute=None`` and has its predicted label stored in
    ``answer`` by the tree builder.
    """

    def __init__(self, attribute):
        # A fresh node has no prediction and no branches yet.
        self.answer = None
        self.children = []
        # Name of the attribute this node splits on (None for leaves).
        self.attribute = attribute

**Data acquisition method definitions :**

In [54]:
def read_data(filename):
    """Read a labelled CSV file into a header row and a list of data rows.

    Parameters
    ----------
    filename : str or path-like
        Path to a comma-delimited CSV file whose first row is the header.

    Returns
    -------
    headers : list of str
        The first row of the file.
    dTrain : list of list of str
        Every following non-blank row, each as a list of string fields.

    Raises
    ------
    StopIteration
        If the file contains no rows at all (``next`` on an exhausted reader).
    """
    # newline='' hands newline handling to the csv module, as the csv
    # documentation recommends (needed for quoted fields containing newlines).
    with open(filename, 'r', newline='') as csvFile:
        dReader = csv.reader(csvFile, delimiter=',')
        headers = next(dReader)
        # csv.reader yields [] for blank lines (e.g. trailing newlines); drop
        # them so the rows stay rectangular when converted to a NumPy array.
        dTrain = [row for row in dReader if row]
    return headers, dTrain

In [55]:
def read_test_data(filename):
    """Read every non-blank row of a CSV file, header row included.

    Parameters
    ----------
    filename : str or path-like
        Path to a comma-delimited CSV file.

    Returns
    -------
    list of list of str
        All non-blank rows in file order.  The header row is NOT stripped
        here; it is returned as the first element.
    """
    # newline='' hands newline handling to the csv module, as the csv
    # documentation recommends (needed for quoted fields containing newlines).
    with open(filename, 'r', newline='') as csvFile:
        dReader = csv.reader(csvFile, delimiter=',')
        # Blank lines come back from csv.reader as []; skip them so they are
        # never treated as test instances.
        dTest = [row for row in dReader if row]
    return dTest

**Helper method definitions :**

In [56]:
def subtables(data, col, delete):
    """Partition the rows of `data` by the distinct values found in column `col`.

    Parameters
    ----------
    data : 2-D numpy array
        Table to split; indexed as ``data[:, col]``.
    col : int
        Index of the column whose values define the partitions.
    delete : bool
        When true, column `col` is removed from every returned sub-table.

    Returns
    -------
    values : numpy array
        The sorted distinct values of column `col` (as given by ``np.unique``).
    partitions : dict
        Maps each distinct value to the sub-table of rows holding that value.
    """
    column = data[:, col]
    values = np.unique(column)

    partitions = {}
    for value in values:
        rows = data[column == value]
        partitions[value] = np.delete(rows, col, axis=1) if delete else rows

    return values, partitions

**Entropy and gain-ratio calculator method definitions :**

In [57]:
def entropy(S):
    """Return the base-2 Shannon entropy of the values in `S`.

    Each distinct value's probability is its count divided by ``len(S)``;
    the result is ``-sum(p * log2(p))`` over the distinct values.
    """
    _, freq = np.unique(S, return_counts=True)
    probs = freq / len(S)
    weighted_logs = probs * np.log2(probs)
    return -weighted_logs.sum()

In [58]:
def gain_ratio(data, col):
    """Return the gain ratio obtained by splitting `data` on column `col`.

    The gain ratio is the information gain (entropy of the label column,
    taken to be the last column, minus the size-weighted entropy of the label
    column within each partition) divided by the intrinsic value of the split
    (the entropy of the partition sizes).  If the intrinsic value is zero the
    function returns 0.
    """
    values, partitions = subtables(data, col, delete=False)
    n_rows = data.shape[0]

    weighted_entropy = 0
    split_info = 0
    for value in values:
        part = partitions[value]
        weight = len(part) / n_rows
        weighted_entropy += weight * entropy(part[:, -1])
        split_info -= weight * math.log(weight, 2)

    # A split that puts every row in one partition carries no information;
    # bail out before dividing by zero.
    if split_info == 0:
        return 0

    info_gain = entropy(data[:, -1]) - weighted_entropy
    return info_gain / split_info

**Node creator method definition :**

In [59]:
def create_node(data, metadata):
    """Recursively build a decision (sub)tree for `data`.

    Parameters
    ----------
    data : 2-D numpy array
        Training rows; the last column holds the class label and every other
        column is an attribute.
    metadata : sequence of str
        Column names aligned with the columns of `data`.

    Returns
    -------
    Node
        A leaf (``answer`` set) when all rows share one label or when no
        attribute columns remain; otherwise an internal node that splits on
        the attribute with the highest gain ratio, with one child per distinct
        value of that attribute.
    """
    labels, counts = np.unique(data[:, -1], return_counts=True)

    # Pure subset: every row has the same label.
    if len(labels) == 1:
        node = Node(None)
        node.answer = labels[0]
        return node

    # All attributes have been consumed but the labels still disagree
    # (conflicting training examples).  Without this guard np.argmax would be
    # called on an empty list and raise ValueError; fall back to the most
    # frequent label instead (ties resolve to the first label in sorted order).
    if data.shape[1] <= 1:
        node = Node(None)
        node.answer = labels[np.argmax(counts)]
        return node

    gains = [gain_ratio(data, col) for col in range(data.shape[1] - 1)]
    split = np.argmax(gains)
    node = Node(metadata[split])
    items, tables = subtables(data, split, delete=True)

    # The chosen column is removed from every sub-table, so drop its name too;
    # this is the same for every child and is computed once.
    remaining = np.delete(metadata, split)

    for item in items:
        child = create_node(tables[item], remaining)
        node.children.append((item, child))

    return node

**Predictor method definition :**

In [60]:
def predict(node, instance, metadata):
    """Classify `instance` by walking the tree rooted at `node`.

    Parameters
    ----------
    node : Node
        Root of the (sub)tree to evaluate.
    instance : sequence
        Attribute values of the example, positioned like the names in
        `metadata`.
    metadata : sequence of str
        Column names used to locate ``node.attribute`` inside `instance`.
        Any sequence is accepted (list, tuple, or NumPy array).

    Returns
    -------
    The label stored in the reached leaf, or ``None`` when the instance has an
    attribute value for which the tree has no branch.

    Raises
    ------
    ValueError
        If ``node.attribute`` is not present in `metadata`.
    """
    if node.answer is not None:
        return node.answer

    # list(...) so that header arrays (which have no .index method) work too.
    value = instance[list(metadata).index(node.attribute)]

    for item, child in node.children:
        if item == value:
            return predict(child, instance, metadata)

    # Value never seen for this attribute during training: no prediction.
    return None

**Utility printing method definition :**

In [61]:
def empty(size):
    """Return an indentation string consisting of `size` tab characters."""
    tab = "\t"
    return tab * size

In [62]:
def print_tree(node, level):
    """Print the tree rooted at `node`, indented with tabs starting at `level`.

    A leaf prints its answer.  An internal node prints its attribute name,
    then each branch value one level deeper, followed by that branch's
    subtree two levels deeper.
    """
    indent = empty(level)
    if node.answer is None:
        print(indent, node.attribute)
        for branch_value, subtree in node.children:
            print(empty(level + 1), branch_value)
            print_tree(subtree, level + 2)
    else:
        print(indent, node.answer)

In [63]:
def print_predictions(node, testdata, metadata):
    """Print the predicted label for every test row.

    The first element of `testdata` is skipped (it is treated as the header
    row); each remaining row is classified with `predict` and printed together
    with its prediction.
    """
    for row in testdata[1:]:
        label = predict(node, row, metadata)
        print(f"Test instance : {row}\nPredicted label : {label}")

In [64]:
# Re-read the CSVs with the csv module. `metadata` is the training header row
# (it includes the label column name); `dTrain` holds the remaining rows.
# This rebinds dTrain/dTest, which earlier held pandas DataFrames.
metadata, dTrain = read_data(dTrainPath)
# `dTest` still contains its header row at index 0; print_predictions skips it.
dTest = read_test_data(dTestPath)

# Convert to a 2-D array so the tree code can slice columns with data[:, col].
data = np.array(dTrain)

# Build the whole tree; `node` is its root.
node = create_node(data, metadata)

print("Decision Tree Structure : \n")
print_tree(node,0)

print("\nPredictions for test data")
print_predictions(node, dTest, metadata)

Decision Tree Structure : 

 weather
	 cloudy
		 yes
	 rain
		 wind
			 strong
				 no
			 weak
				 yes
	 sunny
		 humidity
			 high
				 no
			 normal
				 yes

Predictions for test data
Test instance : ['sunny', 'hot', 'high', 'weak']
Predicted label : no
Test instance : ['sunny', 'hot', 'high', 'strong']
Predicted label : no
Test instance : ['cloudy', 'hot', 'high', 'weak']
Predicted label : yes
