In [22]:
import math
import csv
import pandas as pd
import math
import numpy as np



def load_csv(filename):
#     data = pd.read_csv(filename,)
#     dataset=list(data)
#     headers=list(data.columns)
    
    lines = csv.reader(open(filename, "r"));
    dataset = list(lines)
    headers = dataset.pop(0)
    
    return dataset, headers

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""  # NULL indicates children exists. # Not Null indicates this is a Leaf Node

def subtables(data, col, delete): 
    dic = {}
    coldata = [ row[col] for row in data]
    attr = list(set(coldata)) # All values of attribute retrived
    for k in attr:
        dic[k] = []

    for y in range(len(data)):
        key = data[y][col]
        if delete:
            del data[y][col]
        dic[key].append(data[y])
    return attr, dic

def entropy(S):
    attr = list(set(S))
    if len(attr) == 1: #if all are +ve/-ve then entropy = 0 
        return 0

    counts = [0,0] # Only two values possible 'yes' or 'no' 
    for i in range(2):
        counts[i] = sum( [1 for x in S if attr[i] == x] ) / (len(S) * 1.0)

    sums = 0
    for cnt in counts:
        sums += -1 * cnt * math.log(cnt, 2)
    return sums

def compute_gain(data, col):
    attValues, dic = subtables(data, col, delete=False)
    total_entropy = entropy([row[-1] for row in data])
    for x in range(len(attValues)):
        ratio = len(dic[attValues[x]]) / ( len(data) * 1.0)
        entro = entropy([row[-1] for row in dic[attValues[x]]]) 
        total_entropy -= ratio*entro

    return total_entropy
 
def build_tree(data, features):
    lastcol = [row[-1] for row in data]
   
    if (len(set(lastcol))) == 1: # If all samples have same labels return that label
        node=Node("")
        node.answer = lastcol[0]
        return node

    n = len(data[0])-1
    gains = [compute_gain(data, col) for col in range(n) ]

    split = gains.index(max(gains)) # Find max gains and returns index 
    node = Node(features[split]) # 'node' stores attribute selected #del (features[split])
    fea = features[:split]+features[split+1:]

    attr, dic = subtables(data, split, delete=True) # Data will be spilt in subtables 
    for x in range(len(attr)):
        child = build_tree(dic[attr[x]], fea) 
        node.children.append((attr[x], child))

    return node

def print_tree(node, level):
    if node.answer != "":
        print("     "*level, node.answer) # Displays leaf node yes/no 
        return

    print("       "*level, node.attribute) # Displays attribute Name 
    for value, n in node.children:
        print("     "*(level+1), value) 
        print_tree(n, level + 2)

def classify(node,x_test,features): 
    if node.answer != "":
        print(node.answer) 
        return

    pos = features.index(node.attribute)
    for value, n in node.children:
        if x_test[pos]==value: 
            classify(n,x_test,features)

#''' Main program '''
dataset, features = load_csv("PlayTennis.csv")
# Read Tennis data 
node = build_tree(dataset, features)


print("The decision tree for the dataset using ID3 algorithm is ") 
print_tree(node, 0)

testdata, features = load_csv("PlayTennisTestData.csv") 
for xtest in testdata:
    print("The test instance : ",xtest) 
    print("The predicted label : ", end="") 
    classify(node,xtest,features)


The decision tree for the dataset using ID3 algorithm is 
 Outlook
      rain
               Wind
                strong
                             Temperature
                          cool
                               no 
                          mild
                               no
                weak
                     yes 
      sunny
               Temperature
                cool
                     yes 
                hot
                     no
                mild
                             Humidity
                          high
                               no 
                          normal
                               yes 
      overcast
           yes 
The test instance :  ['sunny', 'cool', 'normal', 'weak ']
The predicted label : yes 
The test instance :  ['sunny', 'mild', 'normal', 'strong']
The predicted label : yes 
The test instance :  ['overcast', 'mild', 'high', 'strong ']
The predicted label : yes 
The test instance :  ['overcast', 'hot', 'norm

In [4]:
import pandas as pd
import math
import numpy as np

data = pd.read_csv("prog3_data.csv")
features = [feat for feat in data]
# print(features)
features.remove(features[-1])


class Node:
    def __init__(self):
        self.children = []
        self.value = ""
        self.isLeaf = False
        self.pred = ""


def entropy(examples):
    pos = 0.0
    neg = 0.0
    for _, row in examples.iterrows():
        if row["answer"] == "yes":
            pos += 1
        else:
            neg += 1
    if pos == 0.0 or neg == 0.0:
        return 0.0
    else:
        p = pos / (pos + neg)
        n = neg / (pos + neg)
        return -(p * math.log(p, 2) + n * math.log(n, 2))


def info_gain(examples, attr):
    uniq = np.unique(examples[attr])
    # print ("\n",uniq)
    gain = entropy(examples)
    # print ("\n",gain)
    for u in uniq:
        subdata = examples[examples[attr] == u]
        # print ("\n",subdata)
        sub_e = entropy(subdata)
        gain -= (float(len(subdata)) / float(len(examples))) * sub_e
        # print ("\n",gain)
    return gain


def ID3(examples, attrs):
    root = Node()

    max_gain = 0
    max_feat = ""
    for feature in attrs:
        # print ("\n",examples)
        gain = info_gain(examples, feature)
        if gain > max_gain:
            max_gain = gain
            max_feat = feature
    root.value = max_feat
    # print ("\nMax feature attr",max_feat)
    uniq = np.unique(examples[max_feat])
    # print ("\n",uniq)
    for u in uniq:
        # print ("\n",u)
        subdata = examples[examples[max_feat] == u]
        # print ("\n",subdata)
        if entropy(subdata) == 0.0:
            newNode = Node()
            newNode.isLeaf = True
            newNode.value = u
            newNode.pred = np.unique(subdata["answer"])
            root.children.append(newNode)
        else:
            dummyNode = Node()
            dummyNode.value = u
            new_attrs = attrs.copy()
            new_attrs.remove(max_feat)
            child = ID3(subdata, new_attrs)
            dummyNode.children.append(child)
            root.children.append(dummyNode)
    return root


def printTree(root: Node, depth=0):
    for i in range(depth):
        print("\t", end="")
    print(root.value, end="")
    if root.isLeaf:
        print(" -> ", root.pred)
    print()
    for child in root.children:
        printTree(child, depth + 1)


root = ID3(data, features)
printTree(root)

outlook
	overcast ->  ['yes']

	rain
		wind
			strong ->  ['no']

			weak ->  ['yes']

	sunny
		humidity
			high ->  ['no']

			normal ->  ['yes']

