In [22]:
'''
Assume df is a pandas dataframe object of the dataset given
'''
import numpy as np
import pandas as pd
import random

#to get all the values in the last column of the data frame

def get_all_values_of_final_output(df):
    '''Return the set of distinct, non-missing values in the last column of df.

    input: pandas_dataframe (its last column holds the output labels)
    output: set of the distinct labels; missing entries (NaN/None) are excluded
    '''
    # `value == np.nan` is always False because NaN never compares equal to
    # anything (itself included), so missing labels are detected with pd.isna.
    return {label for label in df.iloc[:, -1] if not pd.isna(label)}

'''Calculate the entropy of the entire dataset'''
	#input:pandas_dataframe
	#output:int/float/double/large

#Formula -> entropy = sum, over each distinct output value in the last column
# of the dataframe, of -(k*log2(k))
# where k = (count of that output value)/(number of rows counted in the data frame)
def get_entropy_of_dataset(df):
    '''Return the entropy of the output labels held in the last column of df.

    input: pandas_dataframe
    output: int/float -> sum over every distinct label of -(k*log2(k)), where
            k = (count of that label)/(number of non-missing labels);
            0 when there is no non-missing label at all
    '''
    # label -> number of rows carrying it, kept in order of first appearance
    label_counts = {}
    for label in df.iloc[:, -1]:
        # NaN never compares equal to anything, so `label == np.nan` can not
        # detect a missing label; pd.isna can.
        if pd.isna(label):
            continue
        label_counts[label] = label_counts.get(label, 0) + 1

    total = sum(label_counts.values())
    entropy = 0
    # With no usable label the dict is empty, the loop is skipped and 0 is returned.
    for count in label_counts.values():
        k = count / total
        entropy += -(k * np.log2(k))
    return entropy

'''Return entropy of the attribute provided as parameter'''
	#input:pandas_dataframe,str   {i.e the column name ,ex: Temperature in the Play tennis dataset}
	#output:int/float/double/large

#Formula -> For each distinct value v present in that attribute's column of the dataframe:
# 1. take the rows where the attribute equals v and, for each output of the last column,
#    get pi = (rows with that output)/(rows where the attribute equals v); if pi=0 then skip it.
# 2. sum -pi*log2(pi) over the outputs to get the entropy of that group of rows.
# 3. multiply the previous step's answer by
#    (rows where the attribute equals v)/(total # of records in the data frame).
#The entropy (average information) of the attribute is the sum of these weighted terms.
def get_entropy_of_attribute(df,attribute):
    '''Return the weighted average entropy of the output column after splitting df on attribute.

    input: pandas_dataframe, str (name of the column to split on)
    output: int/float -> for every distinct value of the attribute, the entropy
            of the outputs (last column) among the rows holding that value,
            weighted by the share of rows holding it; 0 when no row is usable

    Rows whose attribute value or output label is missing (NaN/None) are ignored.
    '''
    # counts[attribute_value][output_label] -> number of rows with that pair
    counts = {}
    total_rows = 0
    # The two columns are walked in parallel by position, so the result does
    # not depend on the index labels (a filtered subset keeps its original
    # labels and label-based `column[i]` lookups would fail or pick other rows).
    for value, label in zip(df[attribute], df.iloc[:, -1]):
        # NaN never equals itself, so `== np.nan` can not find missing cells.
        if pd.isna(value) or pd.isna(label):
            continue
        label_counts = counts.setdefault(value, {})
        label_counts[label] = label_counts.get(label, 0) + 1
        total_rows += 1

    entropy_of_attribute = 0
    for label_counts in counts.values():
        subset_total = sum(label_counts.values())
        # Only labels that occur in the subset are stored, so every
        # probability is > 0 and log2 is always defined.
        for count in label_counts.values():
            probability = count / subset_total
            entropy_of_attribute += -(probability) * np.log2(probability) * (subset_total / total_rows)
    # abs() turns a possible -0.0 (pure subsets only) into 0.0
    return abs(entropy_of_attribute)

'''Return Information Gain of the attribute provided as parameter'''
	#input:pandas_dataframe,str   {i.e the column name whose information gain is wanted}
	#output:int/float/double/large

def get_information_gain(df,attribute):
    '''Return the information gain obtained by splitting df on attribute.

    input: pandas_dataframe, str (column name)
    output: int/float -> entropy of the whole dataset minus the entropy of
            the given attribute
    '''
    return get_entropy_of_dataset(df) - get_entropy_of_attribute(df, attribute)

''' Returns Attribute with highest info gain'''  
	#input: pandas_dataframe
	#output: ({dict},'str')

def get_selected_attribute(df):
    '''Return the information gain of every attribute and the best attribute.

    input: pandas_dataframe (every column except the last one is an attribute)
    output: a tuple with the first element as a dictionary which has the IG of
            all attribute columns and the second element as a string with the
            name of the column selected (the first one with the highest IG)

    example : ({'A':0.123,'B':0.768,'C':1.23} , 'C')
    '''
    gains = {column: get_information_gain(df, column) for column in df.columns[:-1]}

    best_column = ''
    best_gain = -1  # starting point below any gain the comparison should accept
    for column, gain in gains.items():
        # strict '>' keeps the earliest column when several share the top gain
        if gain > best_gain:
            best_column, best_gain = column, gain
    return (gains, best_column)


In [23]:
def test_case():
    '''Run four sanity checks on the Play Tennis dataset and print the outcome of each.

    One line "Test Case <n> for the function <name> PASSED" (or FAILED) is
    printed per check; a check that raises an exception is reported as FAILED.
    Nothing is returned.
    '''
    outlook = 'overcast,overcast,overcast,overcast,rainy,rainy,rainy,rainy,rainy,sunny,sunny,sunny,sunny,sunny'.split(',')
    temp = 'hot,cool,mild,hot,mild,cool,cool,mild,mild,hot,hot,mild,cool,mild'.split(',')
    humidity = 'high,normal,high,normal,high,normal,normal,normal,high,high,high,high,normal,normal'.split(',')
    windy = 'FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE'.split(',')
    play = 'yes,yes,yes,yes,yes,yes,no,yes,no,no,no,no,yes,yes'.split(',')
    dataset ={'outlook':outlook,'temp':temp,'humidity':humidity,'windy':windy,'play':play}
    df = pd.DataFrame(dataset,columns=['outlook','temp','humidity','windy','play'])

    def report(case_number, function_name, check):
        # Evaluate one check and print its verdict. Only Exception is caught,
        # so Ctrl-C / SystemExit still stop the run instead of being reported
        # as a failed test.
        try:
            passed = check()
        except Exception:
            passed = False
        verdict = "PASSED" if passed else "FAILED"
        print("Test Case {} for the function {} {}".format(case_number, function_name, verdict))

    def selected_attribute_is_correct():
        # Every information gain must be inside its tolerance band and the
        # attribute with the highest gain must be 'outlook'.
        ans = get_selected_attribute(df)
        dictionary = ans[0]
        return (
            0.244 <= dictionary['outlook'] <= 0.248
            and 0.0292 <= dictionary['temp'] <= 0.0296
            and 0.150 <= dictionary['humidity'] <= 0.154
            and 0.046 <= dictionary['windy'] <= 0.05
            and ans[1] == 'outlook'
        )

    report(1, "get_entropy_of_dataset",
           lambda: 0.938 <= get_entropy_of_dataset(df) <= 0.942)
    report(2, "get_entropy_of_attribute",
           lambda: 0.691 <= get_entropy_of_attribute(df, 'outlook') <= 0.695)
    report(3, "get_entropy_of_attribute",
           lambda: 0.908 <= get_entropy_of_attribute(df, 'temp') <= 0.914)
    report(4, "get_selected_attribute", selected_attribute_is_correct)
    


# Run the sanity checks; each one prints its own PASSED/FAILED line.
test_case()

Test Case 1 for the function get_entropy_of_dataset PASSED
Test Case 2 for the function get_entropy_of_attribute PASSED
Test Case 3 for the function get_entropy_of_attribute PASSED
Test Case 4 for the function get_selected_attribute PASSED


In [18]:
# Hand-built 7-row example: 'job' (last column) is the output,
# 'salary' and 'location' are the attributes.
salary = 'tier1,tier2,tier1,tier1,tier2,tier1,tier1'.split(',')
location = 'mum,blr,blr,hyd,mum,hyd,hyd'.split(',')
job = 'yes,yes,no,no,yes,no,no'.split(',')
dataset ={'salary':salary,'location':location,'job':job}
df = pd.DataFrame(dataset,columns=['salary','location','job'])

# Dataset entropy, then entropy and information gain of each attribute.
print(get_entropy_of_dataset(df))
print(get_entropy_of_attribute(df,'salary'))
print(get_entropy_of_attribute(df,'location'))
print(get_information_gain(df,'salary'))
print(get_information_gain(df,'location'))




# Hand-built 10-row example: 'species' (last column) is the output,
# 'toothed', 'breathes' and 'legs' are the attributes.
# Note that `dataset` and `df` are rebound here, replacing the previous example.
toothed='true,true,true,false,true,true,true,true,true,false'.split(',')
breathes='true,true,true,true,true,true,false,true,true,true'.split(',')
legs='true,true,false,true,true,true,false,false,true,true'.split(',')
species='mammal,mammal,reptile,mammal,mammal,mammal,reptile,reptile,mammal,reptile'.split(',')

dataset ={'toothed':toothed,'breathes':breathes,'legs':legs,'species':species}
df = pd.DataFrame(dataset,columns=['toothed','breathes','legs','species'])
# Dataset entropy, then entropy and information gain of each attribute.
print(get_entropy_of_dataset(df))
print(get_entropy_of_attribute(df,'toothed'))
print(get_entropy_of_attribute(df,'breathes'))
print(get_entropy_of_attribute(df,'legs'))
print(get_information_gain(df,'toothed'))
print(get_information_gain(df,'breathes'))
print(get_information_gain(df,'legs'))


# NOTE(review): this frame is never used - `df` is rebound a few lines below
# before anything reads it - but the file must still exist for the cell to run.
df=pd.read_csv(r"Test.csv")



# Edge case: every row has the same output 'Y', so there is only one class.
category='A,A,A,A,A,A,A,A,A,B,B,B'.split(',')
result='Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y,Y'.split(',')



dataset ={'category':category,'result':result}
df = pd.DataFrame(dataset,columns=['category','result'])
print(df)


# Both values are printed as 0.0 in the recorded output of this cell.
print(get_entropy_of_dataset(df))
print(get_entropy_of_attribute(df,'category'))


# Entropy / information-gain report for Test.csv (read relative to the working directory).
# The trailing comments hold the values recorded when the cell was run.
df=pd.read_csv(r"Test.csv")
# The dataset entropy is printed twice: once bare, once with a label.
print(get_entropy_of_dataset(df))
print('Dataset entropy : ',get_entropy_of_dataset(df))  #0.9709505944546686
print('Sky entropy : ',get_entropy_of_attribute(df, 'Sky')) #0.9509775004326937
print('Sky IG : ', get_information_gain(df, 'Sky')) #0.01997309402197489
print('Airtemp entropy : ',get_entropy_of_attribute(df, 'Airtemp')) #0.6490224995673063
print('Airtemp IG : ', get_information_gain(df, 'Airtemp')) #0.3219280948873623
print('Humidity entropy : ',get_entropy_of_attribute(df, 'Humidity')) #0.9509775004326937
print('Humidity IG : ', get_information_gain(df, 'Humidity')) #0.01997309402197489
print('Water entropy : ',get_entropy_of_attribute(df, 'Water')) #0.8
print('Water IG : ', get_information_gain(df, 'Water')) #0.17095059445466854
print('Forecast entropy : ',get_entropy_of_attribute(df, 'Forecast')) #0.9509775004326937
print('Forecast IG : ', get_information_gain(df, 'Forecast')) #0.01997309402197489
# Prints the tuple (information gain of every attribute column, selected column).
print(get_selected_attribute(df)) #Airtemp





# Test1.csv: list the columns, then print the dataset entropy and the entropy
# of four attributes. The column names used below must exist in the file.
df=pd.read_csv(r"Test1.csv")
print(df.columns)
print('Dataset entropy : ',get_entropy_of_dataset(df))
print(get_entropy_of_attribute(df,'Age'))
print(get_entropy_of_attribute(df,'Income'))
print(get_entropy_of_attribute(df,'Student'))
print(get_entropy_of_attribute(df,'Credit_rating'))














# Test6.csv: show the frame, then the dataset entropy, the entropy of three
# attributes and the information gain of 'age'.
df=pd.read_csv(r"Test6.csv")
print(df)
print(get_entropy_of_dataset(df))
print(get_entropy_of_attribute(df,'color'))
print(get_entropy_of_attribute(df,'size'))
print(get_entropy_of_attribute(df,'act'))
print(get_information_gain(df,'age'))

# NOTE(review): same call as the line above, so the same value is printed twice.
print(get_information_gain(df,'age'))




# Test7.csv: show the frame, then the dataset entropy and the entropy of
# three attributes.
df=pd.read_csv(r"Test7.csv")
print(df)
print(get_entropy_of_dataset(df))
print(get_entropy_of_attribute(df,'caprice'))
print(get_entropy_of_attribute(df,'topic'))
print(get_entropy_of_attribute(df,'lmt'))




# Information gain of two further columns of the same file.
print(get_information_gain(df,'lpss'))
print(get_information_gain(df,'pb'))




# Play Tennis variant with extra spellings mixed in ('sunnys', 'coolio',
# 'hotter', 'yess', 'none'), so the attribute and output columns hold more
# distinct values than the clean dataset used in test_case().
outlook = 'overcast,overcast,overcast,overcast,rainy,rainy,rainy,rainy,rainy,sunny,sunnys,sunny,sunny,sunnys'.split(',')
    # outlook = ['yes']*14

temp = 'hot,cool,mild,hot,mild,coolio,cool,mild,mild,hot,hotter,mild,cool,mild'.split(',')
humidity = 'high,normal,high,normal,high,normal,normal,normal,high,high,high,high,normal,normal'.split(',')
windy = 'FALSE,TRUE,TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,TRUE,FALSE,TRUE,FALSE,FALSE,TRUE'.split(',')
play = 'yes,yes,yess,yes,yes,yess,no,yes,no,no,no,none,none,yes'.split(',')

    # play = 'yes,yes,yes,yes,yes,yes,yes,yes,yess,yes,yes,yes,none,no,none'.split(',')
    # play = 'yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes'
    # play = (['no']*3)
    # play.extend(['yes']*11)
    # print(play)
dataset ={'outlook':outlook,'temp':temp,'humidity':humidity,'windy':windy,'play':play}
    # dataset ={'play':play}
df = pd.DataFrame(dataset,columns=['outlook','temp','humidity','windy','play'])

    # df = pd.DataFrame(dataset,columns=['play'])

# Dataset entropy, entropy of two attributes, then information gain of three attributes.
print(get_entropy_of_dataset(df))
print(get_entropy_of_attribute(df,'outlook'))
print(get_entropy_of_attribute(df,'humidity'))
print(get_information_gain(df,'windy'))
print(get_information_gain(df,'humidity'))
print(get_information_gain(df,'temp'))

0.9852281360342515
0.5156629249195445
0.2857142857142857
0.46956521111470706
0.6995138503199658
0.9709505944546686
0.9635472023399719
0.8264662506490406
0.4141709450076292
0.00740339211469665
0.14448434380562802
0.5567796494470394
   category result
0         A      Y
1         A      Y
2         A      Y
3         A      Y
4         A      Y
5         A      Y
6         A      Y
7         A      Y
8         A      Y
9         B      Y
10        B      Y
11        B      Y
0.0
0.0
0.9709505944546686
Dataset entropy :  0.9709505944546686
Sky entropy :  0.9509775004326937
Sky IG :  0.01997309402197489
Airtemp entropy :  0.6490224995673063
Airtemp IG :  0.3219280948873623
Humidity entropy :  0.9509775004326937
Humidity IG :  0.01997309402197489
Water entropy :  0.8
Water IG :  0.17095059445466854
Forecast entropy :  0.9509775004326937
Forecast IG :  0.01997309402197489
({'Sky': 0.01997309402197489, 'Airtemp': 0.3219280948873623, 'Humidity': 0.01997309402197489, 'Wind': 0.3219280948873623,

In [20]:

# Entropy / information-gain report for Test.csv (read relative to the working directory).
# The trailing comments hold the values recorded when the cell was run.
df=pd.read_csv(r"Test.csv")
# The dataset entropy is printed twice: once bare, once with a label.
print(get_entropy_of_dataset(df))
print('Dataset entropy : ',get_entropy_of_dataset(df))  #0.9709505944546686
print('Sky entropy : ',get_entropy_of_attribute(df, 'Sky')) #0.9509775004326937
print('Sky IG : ', get_information_gain(df, 'Sky')) #0.01997309402197489
print('Airtemp entropy : ',get_entropy_of_attribute(df, 'Airtemp')) #0.6490224995673063
print('Airtemp IG : ', get_information_gain(df, 'Airtemp')) #0.3219280948873623
print('Humidity entropy : ',get_entropy_of_attribute(df, 'Humidity')) #0.9509775004326937
print('Humidity IG : ', get_information_gain(df, 'Humidity')) #0.01997309402197489
print('Water entropy : ',get_entropy_of_attribute(df, 'Water')) #0.8
print('Water IG : ', get_information_gain(df, 'Water')) #0.17095059445466854
print('Forecast entropy : ',get_entropy_of_attribute(df, 'Forecast')) #0.9509775004326937
print('Forecast IG : ', get_information_gain(df, 'Forecast')) #0.01997309402197489
# In the recorded output 'Wind' has the same gain as 'Airtemp'; 'Airtemp' is
# selected because it comes first and ties are resolved with a strict '>'.
print(get_selected_attribute(df)) #Airtemp

0.9709505944546686
Dataset entropy :  0.9709505944546686
Sky entropy :  0.9509775004326937
Sky IG :  0.01997309402197489
Airtemp entropy :  0.6490224995673063
Airtemp IG :  0.3219280948873623
Humidity entropy :  0.9509775004326937
Humidity IG :  0.01997309402197489
Water entropy :  0.8
Water IG :  0.17095059445466854
Forecast entropy :  0.9509775004326937
Forecast IG :  0.01997309402197489
({'Sky': 0.01997309402197489, 'Airtemp': 0.3219280948873623, 'Humidity': 0.01997309402197489, 'Wind': 0.3219280948873623, 'Water': 0.17095059445466854, 'Forecast': 0.01997309402197489}, 'Airtemp')


In [21]:
# Edge case: 'category' gives no information about 'result' - both categories
# have 4 'Y' and 2 'N' - so the attribute entropy equals the dataset entropy
# and the information gain is 0.
category='A,A,A,B,A,B,A,B,A,B,B,B'.split(',')
result='Y,Y,N,Y,Y,N,Y,Y,N,Y,Y,N'.split(',')
dataset ={'category':category,'result':result}
df = pd.DataFrame(dataset,columns=['category','result'])

print(get_entropy_of_dataset(df)) #0.9182958340544896
print(get_entropy_of_attribute(df,'category')) #0.9182958340544896
# Splitting on the output column itself leaves only pure groups, hence 0.0.
print(get_entropy_of_attribute(df,'result')) #0.0
print(get_selected_attribute(df)) #({'category': 0.0}, 'category')

0.9182958340544896
0.9182958340544896
0.0
({'category': 0.0}, 'category')
