In [2]:
import numpy as np
import pandas as pd

from typing import List, Dict
from anytree import NodeMixin, RenderTree
from anytree.search import find_by_attr, findall_by_attr
from math import log2


In [3]:
data  = pd.read_csv("data.csv")
data

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [14]:
class DecTreeNode(NodeMixin):
    def __init__(self, name: str, attribute: str, parent: DecTreeNode = None):
        super(DecTreeNode, self).__init__()
        self.name = name
        self.attribute = attribute
        self.parent = parent
        self.attr_value = "{0}={1}".format(attribute, name)

In [15]:
class DecTree():
    
    def __init__(self, data: pd.DataFrame, target_attribute: str):
        self.data = data
        self.target_attr = target_attribute
        self.target_attr_vals = data[target_attribute].unique()
        self.root_node = None
        
    def pmf_target(self, df: pd.DataFrame) -> Dict[str, float]:
        # Add your code
        pmf = dict()
        #print (df['play'])
        #arr , value = np[self.target_attr].unique(df, return_counts = True)
        arr , value = np.unique(df, return_counts = True)
        prob = np.array(value)/sum(value)
        pmf = dict (zip(arr,prob)) 
        return pmf
    
    def entropy(self, pmf: Dict[str, float]) -> float:
        # Add your code
        # taking the key values from dictionaty and converting it to numpy array
        perc = np.array(list(pmf.values()))
        perc[perc == 0] = 1
        #print ("value in Entropy = ", perc)
        #taking logs of all array using numpy.log2() method
        log = np.log2(perc)
        #individual intropy of all the dictionay values are calculated using numpy
        entr = perc * log
        entr[entr == np.nan] = 0
        #print ("entropy list = ", entr)
        #inverting the -ve values and summing them on return call
        return -entr.sum()

    def cal_entropy_df(self, df: pd.DataFrame) -> float:
        # Add your code
        # the data frame will be of two columns only. so that the entropy of an 
        # attribute agains its predecting attribute will be calculated properly
        # column_0 ==> predective attribute ;;;; column_1 = given attribute
        columns = df.columns
        
        # seperate the unique values and there count in both columns of dataframe
        uniq_wor_0, uniq_count_0  = np.unique(df[columns[0]], return_counts = True)
        uniq_wor_1, uniq_count_1  = np.unique(df[columns[1]], return_counts = True)
        print (uniq_wor_0,uniq_count_0,'\n...\n',uniq_wor_1,uniq_count_1)      # test printing
        
        # sepertion o each element and creation of array for entropy calculation
        entr_arr = np.array([])
        
        # checking the positive of pridection class with given atteibute
        for i in uniq_wor_1:
            print (i)
            arry_1 = (df[columns[0]] == uniq_wor_0[1])   # array of yes == true in play
            arry_2 = (df[columns[1]] == i)
            #print (arry_1.to_numpy().T)
            #print (arry_2.to_numpy().T)
            # getting the array where both atteibutes are positive
            result = np.logical_and(arry_1.to_numpy(), arry_2.to_numpy())
            #print (result.T)
            #print (result.sum())
            #print (arry_2.sum())
            #calculating the gini index of positives 
            result1 = (result.sum())/(arry_2.sum())
            #print (result)
            #temp_arr = np.append(temp_arr,result1)
            # making the dictionary of unique words and passing in to class function to calculate entropy
            prob_dic =  {uniq_wor_0[1]:(result1) , uniq_wor_0[0]:(1-result1) }
            ans = self.entropy (prob_dic)
            #normalizing the entropy by the weights of value
            norm_ent = (sum (arry_2)/len(arry_2)) * ans
            ##
            ## this array stores the value of entropies of all unique values with their relitive weights
            entr_arr = np.append(entr_arr, norm_ent)
            
            #print ("prob_dic =====>  ", prob_dic," === ans ===>", ans,"=== entr_arr ===>", entr_arr )
            
            #################################################
        
        print ("entr_arr == > ", entr_arr)
        
        dep_entr = entr_arr[:]
        # creating the data reame for primary key and passing it for praimary entropy 
        prim_att_df = pd.DataFrame()
        prim_att_df [self.target_attr] = df [self.target_attr]
        #print ("prim_att_df------->", prim_att_df)
        #prim_attr_dic = self.pmf_target(pd.DataFrame(prim_att_df))
        prim_attr_dic = self.pmf_target(prim_att_df)
        entp_prim = self.entropy(prim_attr_dic)
        
        #print ("prim_attr_dic ", prim_attr_dic, " entp_prim ", entp_prim)
        gain = entp_prim - sum(dep_entr)
        
        #print ("prim_attr_dic ", prim_attr_dic, "\n entp_prim ", entp_prim, "\n gain ", gain)
        
        #print ("-------------------\n")
        entr = gain
        return entr
    
    
    def info_gain_attribute(self, df: pd.DataFrame, attribute: str) -> float:
        # Add your code
        gain_info = np.array([])
        max_val = 0
        col = 'null'
        #going to every column and calculating the gain of each each attribute
        for i in attribute:
            df_col = pd.DataFrame()
            df_col[self.target_attr] = df[self.target_attr]
            df_col[i] = df[i]
            ans = self.cal_entropy_df (df_col)
            print("ans == > ", ans, type(ans))
            if (max_val < ans):
                max_val = ans
                print("max_val == > ", max_val)
                col = i
            gain_info = np.append(gain_info, ans)
        
        print ("gain_info--=>", gain_info)
        information_gain = col
        return information_gain
     
        
    def max_info_gain_attribute(self, df: pd.DataFrame) -> str:        
        # Add your code
        columns = df.columns
        columns = columns.drop(self.target_attr)
        gain_of_all = self.info_gain_attribute(df, columns)
        #max = gain_of_all.max()
        #print ("max ====> ",max_attr)
        print ("gain_of_all ===> ",gain_of_all, type (gain_of_all))
        max_arr = gain_of_all
        return max_arr    
    
    def build_tree_infgain(self, df: pd.DataFrame, attr_list: List[str], start_node: DecTreeNode):
        # Add your code
        # Use return to end current recursive call when any stopping condition is reached
        return
    
    def generate_tree(self):
        attributes = self.data.columns.to_list()
        attributes.remove(self.target_attr)
        
        start_node = DecTreeNode("start", "start")
        self.build_tree_infgain(self.data, attributes, start_node)
        
        self.root_node = start_node
    
    def print_tree(self):
        for pre, _, node in RenderTree(self.root_node):
            print(f"{pre} {node.attribute}={node.name}")
            
    def predict(self, X: pd.DataFrame) -> List[str]:
        # Add your code
        return predictions
            
    

        
        
        
dec_tree = DecTree(data , 'play')

df = pd.DataFrame()

dec_tree.max_info_gain_attribute(data)

['no' 'yes'] [5 9] 
...
 ['overcast' 'rainy' 'sunny'] [4 5 5]
overcast
rainy
sunny
entr_arr == >  [-0.          0.34676807  0.34676807]
ans == >  0.24674981977443933 <class 'numpy.float64'>
max_val == >  0.24674981977443933
['no' 'yes'] [5 9] 
...
 ['cool' 'hot' 'mild'] [4 4 6]
cool
hot
mild
entr_arr == >  [0.23179375 0.28571429 0.39355536]
ans == >  0.02922256565895487 <class 'numpy.float64'>
['no' 'yes'] [5 9] 
...
 ['high' 'normal'] [7 7]
high
normal
entr_arr == >  [0.49261407 0.29583639]
ans == >  0.15183550136234159 <class 'numpy.float64'>
['no' 'yes'] [5 9] 
...
 [False  True] [8 6]
False
True
entr_arr == >  [0.4635875  0.42857143]
ans == >  0.04812703040826949 <class 'numpy.float64'>
gain_info--=> [0.24674982 0.02922257 0.1518355  0.04812703]
gain_of_all ===>  outlook <class 'str'>


'outlook'

In [11]:
arr = data['play'].to_numpy()
array, value = np.unique(arr,return_counts = True)

In [26]:
d = dict(zip(array, value))

d

{'no': 5, 'yes': 9}

In [17]:
start_node = DecTreeNode("start", "start", parent=None)
start_node

<__main__.DecTreeNode at 0x1bfb0f40d30>

In [88]:
x = np.array([.01,2,3,4,5])
y = np.log2(x)
(y)

array([-6.64385619,  1.        ,  1.5849625 ,  2.        ,  2.32192809])

In [169]:
df = pd.DataFrame()
df['outlook'] = data['outlook']
df['play'] = data['play']
arr1 = (df['outlook'] == 'sunny')
arr2 =  (df['play'] == 'yes')
print (arr1.to_numpy().T)
print (arr2.to_numpy().T)
print(arr1.to_numpy() & arr2.to_numpy())

#df['play'] == 'yes'
#((df['temp'] == 'hot') and (df['play'] == 'yes'))

[ True  True False False False False False  True  True False  True False
 False False]
[False False  True  True  True False  True False  True  True  True  True
  True False]
[False False False False False False False False  True False  True False
 False False]


In [5]:
#the class is initilized and then i have pass pmf_targetdf
dectree = DecTree(data , 'play')

df = pd.DataFrame()
df['temp'] = data['temp']
df['play'] = data['play']
df
dectree.pmf_target(df)

{'cool': 0.14285714285714285,
 'hot': 0.14285714285714285,
 'mild': 0.21428571428571427,
 'no': 0.17857142857142858,
 'yes': 0.32142857142857145}

In [6]:
dectree = DecTree(data , 'play')

df = pd.DataFrame()
#df['temp'] = data['temp']
df['play'] = data['play']
per_dict = dectree.pmf_target(df)

dectree.entropy(per_dict)

0.9402859586706311