In [1]:
import numpy as np
from math import log2
from collections import defaultdict
import time

# class Node:
#     def __init__(self, emitter=None, split=None, roomlabel=None):
#         self.right = None
#         self.left = None
#         self.emitter = emitter
#         self.split = split
#         self.roomlabel = roomlabel

In [2]:
def entropy(roomData):
    """Return the Shannon entropy (in bits) of the class labels in ``roomData``.

    The label of a row is its last element (``row[-1]``).  Every distinct
    label value found in the data is counted, so the result is correct for any
    set of (hashable, mutually orderable) labels rather than only for the
    values 1, 2, 3 and 4.

    Parameters
    ----------
    roomData : sequence of rows
        Each row is an indexable sequence whose final element is the label.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed labels, where ``p`` is the
        fraction of rows carrying that label.  ``0.0`` is returned for an
        empty input and when every row has the same label.
    """
    size = len(roomData)
    if size == 0:
        # Nothing to count; also avoids dividing by zero below.
        return 0.0

    # Tally how many rows carry each label.
    counts = {}
    for row in roomData:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1

    H = 0.0
    # Accumulate in ascending label order so the floating-point sum is
    # deterministic and does not depend on the order of the rows.
    for label in sorted(counts):
        count = counts[label]
        H += ((-count) / size) * log2(count / size)
    return H

In [3]:
def InfoGain(All, Left, Right):
    """Information gain of partitioning ``All`` into ``Left`` and ``Right``.

    Computed as ``entropy(All)`` minus the size-weighted average of
    ``entropy(Left)`` and ``entropy(Right)``.  The weights use
    ``len(Left) + len(Right)`` as the total, so at least one of the two
    partitions must be non-empty.
    """
    n_left = len(Left)
    n_right = len(Right)
    n_total = n_left + n_right

    # Each side contributes its entropy scaled by its share of the rows.
    weighted_left = (n_left / n_total) * entropy(Left)
    weighted_right = (n_right / n_total) * entropy(Right)

    return entropy(All) - (weighted_left + weighted_right)

In [4]:
def FIND_SPLIT(d):
    """Search every feature column of ``d`` for the split with the highest information gain.

    The last element of each row is treated as the label and is not a
    candidate column.  For each column the rows are sorted in descending
    order of that column and every distinct value ``v`` is tried as a
    threshold separating the rows with value ``>= v`` from the rows with
    value ``< v``.

    Returns
    -------
    (emitter, value)
        Index of the best column and the threshold found in it.  If no
        candidate has an information gain greater than zero, the initial
        ``(0, 0)`` is returned.
    """
    best_emitter = 0
    best_value = 0
    best_gain = 0

    n_features = len(d[0]) - 1
    for column in range(n_features):
        # Rows ordered by this column, largest value first.
        ordered = np.array(sorted(d, key=lambda row: row[column], reverse=True))
        last_index = len(ordered) - 1

        for idx, row in enumerate(ordered):
            candidate = row[column]

            # Within a run of equal values only the final row is evaluated,
            # so that all rows sharing the value end up on the same side.
            if idx != last_index and ordered[idx + 1][column] == candidate:
                continue

            gain = InfoGain(ordered, ordered[:idx + 1], ordered[idx + 1:])
            if gain > best_gain:
                best_emitter, best_value, best_gain = column, candidate, gain

    return best_emitter, best_value

In [5]:
def split(clean_rows):
    """Partition ``clean_rows`` at the split chosen by ``FIND_SPLIT``.

    Returns
    -------
    (Left, Right, em, val)
        ``Left`` holds the rows whose column ``em`` is ``>= val`` and
        ``Right`` the remaining rows, both converted with ``np.array``;
        ``em`` and ``val`` are the column index and threshold returned by
        ``FIND_SPLIT``.
    """
    em, val = FIND_SPLIT(clean_rows)

    at_or_above, below = [], []
    for row in clean_rows:
        # Route each row to the side of the threshold it falls on.
        bucket = at_or_above if row[em] >= val else below
        bucket.append(row)

    return np.array(at_or_above), np.array(below), em, val

In [6]:
def tree_learning(training_data, depth):
    """Recursively build a decision tree from ``training_data``.

    Parameters
    ----------
    training_data : sequence of rows
        Must contain at least one row; the last element of each row is the
        label.
    depth : int
        Depth of the node being built (the initial call passes the depth of
        the root).

    Returns
    -------
    (node, max_depth)
        ``node`` is a dict with the keys ``'emitter'``, ``'value'``,
        ``'room'``, ``'left'`` and ``'right'``.  A leaf has ``'room'`` set to
        its label and every other entry ``None``; an internal node has
        ``'room'`` set to ``None``, sends rows with
        ``row[emitter] >= value`` to ``'left'`` and all others to
        ``'right'``.  ``max_depth`` is the depth of the deepest leaf below
        (and including) this node.
    """
    if len(training_data) != 0 and entropy(training_data) == 0:
        # Every row has the same label: this node is a pure leaf.
        label = training_data[0][-1]
        leaf = {'emitter': None, 'value': None, 'room': label, 'right': None, 'left': None}
        return leaf, depth

    ldata, rdata, em, val = split(training_data)

    if len(ldata) == 0 or len(rdata) == 0:
        # The chosen split put every row on one side, so the rows cannot be
        # separated any further (e.g. identical features with different
        # labels).  Recursing would hand the same rows to a child forever and
        # an empty set to the other child, so stop here with a leaf that
        # predicts the most frequent label (smallest label wins a tie).
        counts = {}
        for row in training_data:
            counts[row[-1]] = counts.get(row[-1], 0) + 1
        label = max(sorted(counts), key=counts.get)
        leaf = {'emitter': None, 'value': None, 'room': label, 'right': None, 'left': None}
        return leaf, depth

    root = {'emitter': em, 'value': val, 'room': None, 'right': None, 'left': None}
    root['left'], l_depth = tree_learning(ldata, depth + 1)
    root['right'], r_depth = tree_learning(rdata, depth + 1)
    return root, max(l_depth, r_depth)

In [7]:
# def tree_learning2(training_data):
#     if(entropy(training_data) == 0):
#         label = training_data[0][-1]
#         leaf = Node(roomlabel=label)
#         return leaf
#     else:
#         ldata, rdata, em, val = split(training_data)
#         root = Node(emitter=em, split=val)
#         root.left = tree_learning2(ldata)
#         root.right = tree_learning2(rdata)
#         return root

In [8]:
# Load the two datasets from whitespace-separated text files in the working
# directory.  The functions in this notebook treat the last column of each row
# as the label and the preceding columns as features.
clean_rows = np.loadtxt("clean_dataset.txt")
noisy_rows = np.loadtxt("noisy_dataset.txt")
# Quick visual check of the loaded array (NumPy abbreviates large arrays).
print(clean_rows)

[[-64. -56. -61. ... -82. -81.   1.]
 [-68. -57. -61. ... -85. -85.   1.]
 [-63. -60. -60. ... -85. -84.   1.]
 ...
 [-62. -59. -46. ... -87. -88.   4.]
 [-62. -58. -52. ... -90. -85.   4.]
 [-59. -50. -45. ... -88. -87.   4.]]


In [9]:
# Fit a tree on the whole noisy dataset, starting the root at depth 0.
# t is the root node dict and d the depth of the deepest leaf.
t, d = tree_learning(noisy_rows, 0)
#print(t)

In [10]:
def cross_validation(dataset, n_folds=10):
    """Run k-fold cross-validation of the decision tree and print each fold's accuracy.

    The rows are shuffled with NumPy's global random generator and cut into
    ``n_folds`` consecutive folds of ``len(dataset) // n_folds`` rows.  Each
    fold is used once as the test set while a tree is trained on all
    remaining rows.  Rows left over when the length is not a multiple of
    ``n_folds`` are never tested; they are only ever part of the training
    set.

    Parameters
    ----------
    dataset : array-like, 2-D
        Rows of features followed by the label.  The input is not modified;
        a shuffled copy is used internally.
    n_folds : int, optional
        Number of folds (default 10).

    Returns
    -------
    list of numpy.ndarray
        The confusion matrix of every fold, in fold order.
    """
    # Shuffle a private copy so the caller's array keeps its row order.
    shuffled = np.array(dataset)
    np.random.shuffle(shuffled)

    fold_size = len(shuffled) // n_folds
    confusion_matrix_array = []

    for i in range(n_folds):
        start, stop = fold_size * i, fold_size * (i + 1)
        test = shuffled[start:stop]
        # Train on everything outside the current test window.
        training = np.vstack((shuffled[:start], shuffled[stop:]))
        root, _ = tree_learning(training, 0)
        confusion_matrix = evaluate(test, root)
        confusion_matrix_array.append(confusion_matrix)
        print(accuracy(confusion_matrix))

    return confusion_matrix_array
        
    
    
        
#cycle validation

In [11]:
def evaluate(test, root):
    """Classify every row of ``test`` with the tree ``root`` and tabulate the outcome.

    Each row is passed to ``return_result`` to obtain the predicted label;
    the ``[actual, predicted]`` pairs (actual being the row's last element)
    are then handed to ``confuse_matrix``, whose result is returned.
    """
    actual_vs_predicted = [[row[-1], return_result(row, root)] for row in test]
    return confuse_matrix(actual_vs_predicted)

In [12]:
def confuse_matrix(arr):
    """Build a 4x4 confusion matrix from ``[actual, predicted]`` pairs.

    Entry ``[a - 1][p - 1]`` counts the pairs whose actual label is ``a`` and
    whose predicted label is ``p``.  Pairs whose actual label is not one of
    1, 2, 3, 4 are ignored.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(4, 4)``; rows are actual labels, columns are
        predicted labels.
    """
    class_labels = (1, 2, 3, 4)
    cm = np.zeros((len(class_labels), len(class_labels)))

    for pair in arr:
        actual, predicted = pair[0], pair[1]
        if actual not in class_labels:
            continue
        row = class_labels.index(actual)
        # A correct prediction lands on the diagonal; otherwise the column is
        # derived from the predicted label.
        col = row if predicted == actual else int(predicted) - 1
        cm[row][col] += 1

    return cm
    
#confuse_matrix([[1, 1], [2, 2], [3,3], [1,2], [1,3], [3,1], [4,2]])

In [13]:
def return_result(row, root):
    """Return the label the tree rooted at ``root`` predicts for ``row``.

    Starting at ``root``, follow ``'left'`` when ``row[emitter] >= value``
    and ``'right'`` otherwise until a node with a ``'room'`` label is
    reached; that label is the prediction.
    """
    node = root
    # Internal nodes carry no room label; keep descending until one does.
    while node['room'] is None:
        goes_left = row[node['emitter']] >= node['value']
        node = node['left'] if goes_left else node['right']
    return node['room']

In [14]:
def accuracy(cm):
    """Fraction of correct predictions in the confusion matrix ``cm``.

    This is the sum of the diagonal divided by the sum of all entries.
    """
    correct = np.trace(cm)
    total = np.sum(cm)
    return correct / total

In [None]:


# Seed NumPy's global random generator so that the shuffle performed inside
# cross_validation -- and therefore the accuracies printed below -- is the
# same on every run of this cell.
np.random.seed(0)

testing = np.loadtxt("clean_dataset.txt")
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock, which can be adjusted
# while the cell is running.
st=time.perf_counter()
cross_validation(testing)
et=time.perf_counter()
print('Runtime: ', et-st)

0.82
0.83
0.85
0.765
0.795
0.72
