In [33]:
import pandas as pd

In [34]:
# Quick visual check of the training data: load the CSV into a DataFrame
# and print the whole table.
df = pd.read_csv("dataset.csv")
print(df)

   weather temperature humidity    wind  play football
0    sunny         hot     high    weak             no
1    sunny         hot     high  strong             no
2   cloudy         hot     high    weak            yes
3     rain        mild     high    weak            yes
4     rain        cool   normal    weak            yes
5     rain        cool   normal  strong             no
6   cloudy        cool   normal  strong            yes
7    sunny        mild     high    weak             no
8    sunny        cool   normal    weak            yes
9     rain        mild   normal    weak            yes
10   sunny        mild   normal  strong            yes
11  cloudy        mild     high  strong            yes
12  cloudy         hot   normal    weak            yes
13    rain        mild     high  strong             no


In [35]:
import numpy as np
import math
import csv

In [36]:
class Node:
    """A single vertex of the decision tree.

    Attributes:
        attribute: name of the attribute this vertex splits on
            (``None`` for a leaf).
        children: list of ``(value, child_node)`` pairs, one per branch.
        answer: predicted label when the vertex is a leaf, otherwise ``None``.
    """

    def __init__(self,attribute):
        # A freshly built node has no prediction and no branches yet;
        # whoever builds the tree fills these in afterwards.
        self.answer = None
        self.children = []
        self.attribute = attribute
        

In [37]:
def read_data(filename):
    """Read a training CSV and split it into header and data rows.

    Parameters
    ----------
    filename : path of a comma-separated file whose first row is the header.

    Returns
    -------
    (headers, traindata) where ``headers`` is the first row as a list of
    strings and ``traindata`` is a list holding every remaining row, each a
    list of strings. Fields are returned verbatim (no whitespace stripping).

    Raises
    ------
    StopIteration if the file contains no rows at all.
    """
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(filename,'r',newline='') as csvfile:
        datareader = csv.reader(csvfile,delimiter=',')
        headers = next(datareader)
        traindata = [row for row in datareader]
    return headers,traindata

In [38]:
def read_test_data(filename):
    """Read every row of a test CSV, header row included.

    Parameters
    ----------
    filename : path of a comma-separated file.

    Returns
    -------
    A list of rows, each a list of strings. The header row (if the file has
    one) is NOT removed here; it comes back as the first element.
    """
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(filename,'r',newline='') as csvfile:
        datareader = csv.reader(csvfile,delimiter=',')
        testdata = [row for row in datareader]
    return testdata

In [39]:
def subtables(data,col,delete):
    """Partition the rows of ``data`` by the distinct values in column ``col``.

    Parameters
    ----------
    data : 2-D numpy array, one example per row.
    col : index of the column to group on.
    delete : when truthy, column ``col`` is dropped from every partition.

    Returns
    -------
    (items, groups) where ``items`` is the sorted array of distinct values
    found in the column and ``groups`` maps each of those values to the
    sub-array of rows carrying it.
    """
    column = data[:,col]
    items = np.unique(column)
    # Boolean-mask the table once per distinct value.
    groups = {value: data[column == value] for value in items}
    if delete:
        groups = {
            value: np.delete(rows,col,axis=1)
            for value,rows in groups.items()
        }
    return items,groups

In [40]:
def entropy(S):
    """Return the Shannon entropy, in bits, of the values in ``S``.

    Each distinct value's relative frequency ``p`` contributes
    ``-p * log2(p)``; the contributions are summed.
    """
    _,counts = np.unique(S,return_counts=True)
    probabilities = counts/len(S)
    weighted_logs = probabilities*np.log2(probabilities)
    return -weighted_logs.sum()

In [41]:
def gain_ratio(data,col):
    """Gain ratio obtained by splitting ``data`` on column ``col``.

    The last column of ``data`` is taken as the class label. The result is
    the information gain of the split divided by the split's intrinsic
    value (the entropy of the partition sizes). When the intrinsic value is
    zero, 0 is returned instead of dividing.
    """
    label_entropy = entropy(data[:,-1])
    values,partitions = subtables(data,col,delete=False)
    n_rows = data.shape[0]
    conditional_entropy = 0
    split_info = 0
    for value in values:
        rows = partitions[value]
        weight = len(rows)/n_rows
        # Size-weighted entropy of the labels inside this partition.
        conditional_entropy += weight * entropy(rows[:,-1])
        # Accumulate -sum(w * log2(w)) over the partitions.
        split_info -= weight * math.log(weight,2)
    if split_info == 0:
        return 0
    return (label_entropy - conditional_entropy)/split_info




In [42]:
def create_node(data,metadata):
    """Recursively build a decision tree from ``data``.

    Parameters
    ----------
    data : 2-D numpy array; the last column is the class label and every
        other column is an attribute to split on.
    metadata : sequence of column names aligned with the columns of ``data``.

    Returns
    -------
    Node
        A leaf (``answer`` set) when all rows share one label, or when no
        attribute columns are left to split on; otherwise an internal node
        that splits on the attribute with the highest gain ratio and has one
        ``(value, subtree)`` child per distinct value of that attribute.
    """
    labels,counts = np.unique(data[:,-1],return_counts=True)
    if len(labels) == 1:
        leaf = Node(None)
        leaf.answer = labels[0]
        return leaf
    n_attributes = data.shape[1] - 1
    if n_attributes == 0:
        # Every attribute has been used but the labels still disagree
        # (contradictory examples). np.argmax over an empty gains list would
        # raise ValueError, so fall back to the majority label; ties go to
        # the label that sorts first.
        leaf = Node(None)
        leaf.answer = labels[np.argmax(counts)]
        return leaf
    gains = [gain_ratio(data,col) for col in range(n_attributes)]
    split = np.argmax(gains)
    node = Node(metadata[split])
    items,partitions = subtables(data,split,delete = True)
    # The chosen column is removed from each partition, so drop its name
    # too; this is the same for every child, hence computed once.
    remaining = np.delete(metadata,split)
    for item in items:
        child = create_node(partitions[item],remaining)
        node.children.append((item,child))
    return node

In [43]:
def predict(node,instance,metadata):
    """Walk the tree from ``node`` and return the label for ``instance``.

    Parameters
    ----------
    node : root of the (sub)tree to evaluate.
    instance : sequence of attribute values, positioned like ``metadata``.
    metadata : list of column names; ``metadata.index`` is used to find the
        position of each node's attribute inside ``instance``.

    Returns
    -------
    The ``answer`` of the leaf that is reached, or ``None`` when the
    instance carries a value for which the current node has no branch.
    """
    current = node
    while current.answer is None:
        observed = instance[metadata.index(current.attribute)]
        for branch_value,subtree in current.children:
            if branch_value == observed:
                current = subtree
                break
        else:
            # No branch matches this value.
            return None
    return current.answer

In [44]:
def empty(size):
    """Return the indentation string for depth ``size`` (three spaces per level)."""
    indent_unit = "   "
    return indent_unit * size
def print_tree(node,level):
    """Print the subtree rooted at ``node``, indented by ``level``.

    A leaf prints its answer. An internal node prints its attribute, then
    each branch value one level deeper, followed by that branch's subtree
    two levels deeper.
    """
    if node.answer is None:
        print(empty(level),node.attribute)
        for branch_value,subtree in node.children:
            print(empty(level+1),branch_value)
            print_tree(subtree,level + 2)
    else:
        print(empty(level),node.answer)


In [45]:
def print_predictions(node,testdata,metadata):
    """Print each test row together with the label the tree predicts for it.

    The first element of ``testdata`` is skipped (treated as a header row);
    every remaining row is passed to ``predict``.
    """
    for row in testdata[1:]:
        label = predict(node,row,metadata)
        print(f"The test instance:{row}")
        print(f"The predicted label:{label}")

In [46]:
# Load the header row and training rows, plus the rows to classify.
metadata,traindata = read_data("dataset.csv")
test_data = read_test_data("test.csv")
# Convert to a 2-D array so the tree code can slice columns (data[:, col]).
data = np.array(traindata)
# Build the tree from the training data, then show its structure.
node = create_node(data,metadata)
print("Decision tree structure")
print_tree(node,0)
# Classify the test rows; print_predictions skips the first (header) row.
print("\nPredictions for test data")
print_predictions(node,test_data,metadata)

Decision tree structure
 weather
    cloudy
       yes
    rain
       wind
          strong
             no
          weak
             yes
    sunny
       humidity
          high
             no
          normal
             yes

Predictions for test data
The test instance:['sunny', 'hot', 'high', 'weak', '']
The predicted label:no
The test instance:['sunny', 'hot', 'high', 'strong', '']
The predicted label:no
The test instance:['cloudy', 'hot', 'high', 'weak', '']
The predicted label:yes


In [47]:
# Inspect the column names returned by read_data.
print(metadata)

['weather', 'temperature', 'humidity', 'wind', ' play football']


In [48]:
# Inspect the raw training rows returned by read_data.
print(traindata)

[['sunny', 'hot', 'high', 'weak', 'no'], ['sunny', 'hot', 'high', 'strong', 'no'], ['cloudy', 'hot', 'high', 'weak', 'yes'], ['rain', 'mild', 'high', 'weak', 'yes'], ['rain', 'cool', 'normal', 'weak', 'yes'], ['rain', 'cool', 'normal', 'strong', 'no'], ['cloudy', 'cool', 'normal', 'strong', 'yes'], ['sunny', 'mild', 'high', 'weak', 'no'], ['sunny', 'cool', 'normal', 'weak', 'yes'], ['rain', 'mild', 'normal', 'weak', 'yes'], ['sunny', 'mild', 'normal', 'strong', 'yes'], ['cloudy', 'mild', 'high', 'strong', 'yes'], ['cloudy', 'hot', 'normal', 'weak', 'yes'], ['rain', 'mild', 'high', 'strong', 'no']]
