In [11]:
'''
Assume df is a pandas dataframe object of the dataset given
'''
import numpy as np
import pandas as pd
import random

#to get all the values in the last column of the data frame

def get_all_values_of_final_output(df):
    """Collect the distinct values found in the last column of ``df``.

    Args:
        df: pandas DataFrame whose final column holds the output labels.

    Returns:
        set: every distinct value that occurs in the last column
        (an empty set when the frame has no rows).
    """
    output_column = df.iloc[:, -1]
    return {label for label in output_column}

'''Calculate the entropy of the entire dataset'''
	#input:pandas_dataframe
	#output:int/float/double/large

#Formula -> For every distinct output value present in the last column of the dataframe,
# add -(k*log2(k)) to the entropy,
# where k = (count of that output value)/(number of rows in the data frame)
def get_entropy_of_dataset(df):
    """Compute the entropy of the output (last) column of ``df``.

    The entropy is the sum of ``-(p * log2(p))`` over every distinct output
    value, where ``p`` is that value's share of the rows.

    Args:
        df: pandas DataFrame whose final column holds the output labels.

    Returns:
        The entropy as a number; ``0`` when the frame has no rows.
    """
    # Tally how many rows carry each output label.
    label_counts = {}
    for label in df.iloc[:, -1]:
        label_counts[label] = label_counts.get(label, 0) + 1

    n_rows = sum(label_counts.values())
    entropy = 0
    if n_rows == 0:
        # Nothing to measure; also avoids dividing by zero below.
        return entropy

    for count in label_counts.values():
        proportion = count / n_rows
        if proportion != 0:
            entropy += -(proportion * np.log2(proportion))
    return entropy

'''Return entropy of the attribute provided as parameter'''
	#input:pandas_dataframe,str   {i.e the column name ,ex: Temperature in the Play tennis dataset}
	#output:int/float/double/large

#Formula -> For every distinct value present in that attribute's column of the dataframe:
#  1. count how many of its rows produce each output of the last column and, for each
#     output, compute -(pi*log2(pi)), where pi = (rows with that output)/(rows with that value);
#     if pi=0 then skip it.
#  2. multiply the sum from step 1 by
#     (number of rows with that value)/(total # of records in the data frame).
#The entropy (average information) of the attribute is the sum of these weighted terms.
def get_entropy_of_attribute(df,attribute):
    """Return the average information (weighted entropy) of one attribute.

    For every distinct value of ``attribute`` the entropy of the output
    (last) column is computed over the rows having that value; each of those
    entropies is weighted by the share of rows having that value, and the
    weighted entropies are summed.

    Args:
        df: pandas DataFrame whose final column holds the output labels.
        attribute: name of the column to evaluate
            (e.g. ``'Temperature'`` in the Play Tennis dataset).

    Returns:
        The weighted entropy as a non-negative number; ``0`` when the frame
        has no rows.
    """
    attribute_values = df[attribute]
    output_values = df.iloc[:, -1]

    # counts[value][label] -> number of rows with that attribute value and output label.
    counts = {}
    n_rows = 0
    # Walk both columns in lockstep (by position). Looking rows up by label,
    # e.g. ``column[i]`` for i in range(len(column)), fails or reads the wrong
    # row whenever the index is not 0..n-1 (a filtered subset of a larger frame).
    for value, label in zip(attribute_values, output_values):
        per_label = counts.setdefault(value, {})
        per_label[label] = per_label.get(label, 0) + 1
        n_rows += 1

    entropy_of_attribute = 0
    for per_label in counts.values():
        subset_size = sum(per_label.values())
        weight = subset_size / n_rows
        # Only labels that actually occur are stored, so every proportion is > 0
        # and log2 is always defined.
        for count in per_label.values():
            proportion = count / subset_size
            entropy_of_attribute += -proportion * np.log2(proportion) * weight
    # Every term is >= 0; abs() only turns a possible -0.0 into 0.0.
    return abs(entropy_of_attribute)

'''Return Information Gain of the attribute provided as parameter'''
	#input:pandas_dataframe,str   {i.e the column name}
	#output:int/float/double/large

def get_information_gain(df,attribute):
    """Return the information gain obtained by splitting ``df`` on ``attribute``.

    Args:
        df: pandas DataFrame whose final column holds the output labels.
        attribute: name of the column to evaluate.

    Returns:
        The absolute difference between the entropy of the whole dataset and
        the entropy of the attribute.
    """
    dataset_entropy = get_entropy_of_dataset(df)
    attribute_entropy = get_entropy_of_attribute(df, attribute)
    return abs(dataset_entropy - attribute_entropy)

''' Returns Attribute with highest info gain'''  
	#input: pandas_dataframe
	#output: ({dict},'str')

def get_selected_attribute(df):
    """Return the information gain of every attribute and the best attribute.

    Every column except the last one is treated as a candidate attribute.

    Args:
        df: pandas DataFrame whose final column holds the output labels.

    Returns:
        tuple: the first element is a dictionary which has the information
        gain of all columns, and the second element is a string with the name
        of the column selected (the one with the highest information gain;
        the first such column on ties, ``''`` when there are no candidate
        columns).

        example : ({'A':0.123,'B':0.768,'C':1.23} , 'C')
    """
    information_gains = {}
    selected_column = ''
    best_gain = -1  # no info gain can ever be negative, so the first attribute always beats this
    for attribute in df.columns[:-1]:
        gain = get_information_gain(df, attribute)
        information_gains[attribute] = gain
        if gain > best_gain:
            best_gain = gain
            selected_column = attribute
    return (information_gains, selected_column)
        
    



'''
------- TEST CASES --------
How to run sample test cases ?

Simply run the file DT_SampleTestCase.py
Follow convention and do not change any file / function names

'''


'\n------- TEST CASES --------\nHow to run sample test cases ?\n\nSimply run the file DT_SampleTestCase.py\nFollow convention and do not change any file / function names\n\n'

In [12]:


def test_case():
    """Run the four sample checks on the Play Tennis dataset.

    Builds the 14-row dataset, then checks ``get_entropy_of_dataset``,
    ``get_entropy_of_attribute`` (for 'outlook' and 'temp') and
    ``get_selected_attribute`` against expected value ranges. One
    ``PASSED``/``FAILED`` line is printed per check; a check that raises an
    exception is reported as ``FAILED``.
    """
    outlook = 'overcast,overcast,overcast,overcast,rainy,rainy,rainy,rainy,rainy,sunny,sunny,sunny,sunny,sunny'.split(',')
    temp = 'hot,cool,mild,hot,mild,cool,cool,mild,mild,hot,hot,mild,cool,mild'.split(',')
    humidity = 'high,normal,high,normal,high,normal,normal,normal,high,high,high,high,normal,normal'.split(',')
    windy = 'FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE'.split(',')
    play = 'yes,yes,yes,yes,yes,yes,no,yes,no,no,no,no,yes,yes'.split(',')
    dataset ={'outlook':outlook,'temp':temp,'humidity':humidity,'windy':windy,'play':play}
    df = pd.DataFrame(dataset,columns=['outlook','temp','humidity','windy','play'])

    def report(case_number, function_name, check):
        # A check that raises counts as a failure, just like one returning False.
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still stop the run.
        try:
            passed = check()
        except Exception:
            passed = False
        verdict = "PASSED" if passed else "FAILED"
        print("Test Case {} for the function {} {}".format(case_number, function_name, verdict))

    def selected_attribute_is_correct():
        # Evaluate once and compare every gain plus the chosen column.
        ans = get_selected_attribute(df)
        gains = ans[0]
        return (
            0.244 <= gains['outlook'] <= 0.248
            and 0.0292 <= gains['temp'] <= 0.0296
            and 0.150 <= gains['humidity'] <= 0.154
            and 0.046 <= gains['windy'] <= 0.05
            and ans[1] == 'outlook'
        )

    report(1, "get_entropy_of_dataset",
           lambda: 0.938 <= get_entropy_of_dataset(df) <= 0.942)
    report(2, "get_entropy_of_attribute",
           lambda: 0.691 <= get_entropy_of_attribute(df, 'outlook') <= 0.695)
    report(3, "get_entropy_of_attribute",
           lambda: 0.908 <= get_entropy_of_attribute(df, 'temp') <= 0.914)
    report(4, "get_selected_attribute", selected_attribute_is_correct)
    


# Run the sample test cases only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__=="__main__":
	test_case()


Test Case 1 for the function get_entropy_of_dataset PASSED
Test Case 2 for the function get_entropy_of_attribute PASSED
Test Case 3 for the function get_entropy_of_attribute PASSED
Test Case 4 for the function get_selected_attribute PASSED
