<h1><center>Information-based Learning Using Decision Trees</center></h1>

In [1]:
#Importing libraries
import math
from collections import Counter
from pprint import pprint

import pandas as pd

<h3>1. Fishing Data</h3>

In [2]:
# Read the raw file; it has no header row, so column names are supplied here.
file1 = pd.read_csv('fishing.data', names = ["Wind", "Water", "Air", "Forecast", "class"])
# Skip the first 8 rows (presumably metadata lines in the .data file - TODO confirm)
# and renumber the remaining rows from 0.  reset_index(drop=True) replaces the
# older reset_index().drop('index', 1) form, whose positional `axis` argument is
# rejected by pandas >= 2.0.
data_1 = file1[8:].reset_index(drop=True)

# Split into training (70 %) and test (remaining rows) sets.
train_df = data_1.sample(frac=0.7, random_state=100)  # random_state is the seed value
test_df = data_1.drop(train_df.index).reset_index(drop=True)

train_df

Unnamed: 0,Wind,Water,Air,Forecast,class
11,Weak,Moderate,Warm,Sunny,Yes
12,Strong,Warm,Cool,Sunny,Yes
5,Weak,Cold,Cool,Rainy,No
1,Weak,Warm,Warm,Sunny,No
9,Strong,Moderate,Cool,Rainy,No
4,Strong,Cold,Cool,Rainy,No
6,Weak,Cold,Cool,Sunny,No
2,Strong,Warm,Warm,Cloudy,Yes
0,Strong,Warm,Warm,Sunny,Yes
10,Weak,Moderate,Cool,Sunny,Yes


In [3]:
# Functions for calculating entropy:
def entropy(vals):
    """Return the Shannon entropy (base 2) of a sequence of probabilities.

    Parameters
    ----------
    vals : iterable of float
        Probabilities of the individual classes.

    Returns
    -------
    float or int
        Sum of ``-p * log2(p)`` over the entries; ``0`` for an empty input.

    Raises
    ------
    ValueError
        If an entry is negative (``math.log`` rejects it).
    """
    # By convention 0 * log(0) contributes 0, so zero entries are skipped
    # instead of being passed to math.log, which raises ValueError for 0.
    return sum([-val*math.log(val, 2) for val in vals if val != 0])

def entropy_of_table(series):
    """Return the entropy of the values contained in ``series``.

    Each distinct value is counted, the counts are turned into relative
    frequencies (count / number of entries) and those frequencies are passed
    to ``entropy``.
    """
    n_rows = len(series)  # total number of entries

    # Tally how often each distinct value occurs.
    label_counts = Counter()
    for label in series:
        label_counts[label] += 1

    proportions = [count / n_rows for count in label_counts.values()]
    return entropy(proportions)
    

In [4]:
# Information gain of a single attribute:
def information_gain(train_df, attr_name, target_attribute_name):
    """Return the information gain of splitting ``train_df`` on ``attr_name``.

    Prints a header line followed by the entropy of the target column within
    each value of ``attr_name``.  The returned value is the entropy of the
    target column over the whole frame minus the sum of the per-value
    entropies, each weighted by the share of rows having that value.
    """
    print("\n Information Gain Calculation of ", attr_name)

    n_rows = len(train_df.index)  # number of observations in the frame

    # Per attribute value: entropy of the target and share of all rows.
    aggregators = [entropy_of_table, lambda grp: len(grp)/n_rows]
    grouped = train_df.groupby(attr_name)
    per_value = grouped.agg({target_attribute_name : aggregators})[target_attribute_name]
    per_value.columns = ['Entropy', 'PropObservations']
    print(per_value['Entropy'])  # entropy for each value of the attribute

    entropy_after_split = sum(per_value['Entropy'] * per_value['PropObservations'])
    entropy_before_split = entropy_of_table(train_df[target_attribute_name])
    return entropy_before_split - entropy_after_split
 
#print('\nInformation Gain for *** is :'+ str( information_gain(train_df, attr, 'class')),"\n\n")
    
# Report the information gain of every predictor on the training split.
# Each pair holds the label text (reproduced verbatim) and the attribute name.
for label, attr in (
    ('\nInformation Gain for Wind is :', 'Wind'),
    ('\nInformation Gain for Water is:', 'Water'),
    ('\nInformation Gain for Air is:', 'Air'),
    ('\nInformation Gain for Forecast is:', 'Forecast'),
):
    print(label + str(information_gain(train_df, attr, 'class')), "\n\n")




 Information Gain Calculation of  Wind
Wind
Strong    0.970951
Weak      0.970951
Name: Entropy, dtype: float64

Information Gain for Wind is :0.02904940554533142 



 Information Gain Calculation of  Water
Water
Cold        0.000000
Moderate    0.918296
Warm        0.811278
Name: Entropy, dtype: float64

Information Gain for Water is:0.4 



 Information Gain Calculation of  Air
Air
Cool    0.918296
Warm    0.811278
Name: Entropy, dtype: float64

Information Gain for Air is:0.12451124978365313 



 Information Gain Calculation of  Forecast
Forecast
Cloudy    0.000000
Rainy     0.000000
Sunny     0.918296
Name: Entropy, dtype: float64

Information Gain for Forecast is:0.44902249956730633 




In [5]:
# Function for ID3 algorithm:
def id3(train_df, target_attribute_name, attribute_names):
    """Recursively build a decision tree with the ID3 algorithm.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain ``target_attribute_name`` and every
        column listed in ``attribute_names``.
    target_attribute_name : str
        Name of the column holding the class label.
    attribute_names : list
        Names of the columns that may still be used for splitting.

    Returns
    -------
    object or dict
        A class label when the node is a leaf, otherwise a nested dictionary
        ``{best_attribute: {attribute_value: subtree, ...}}``.

    Raises
    ------
    ValueError
        If ``train_df`` contains no rows.
    """
    count_class = Counter(x for x in train_df[target_attribute_name])  # occurrences of each class

    # Without rows there is no class to predict; fail with a clear message
    # instead of the opaque error max() raises on an empty sequence.
    if not count_class:
        raise ValueError("id3() needs at least one training row to build a tree")

    # All observations share one class: return it as a leaf.
    if len(count_class) == 1:
        return next(iter(count_class))

    # No attributes left to split on: return the majority class.
    # most_common() ranks by frequency, whereas max() over the Counter would
    # pick the lexicographically largest label regardless of its count.
    if not attribute_names:
        return count_class.most_common(1)[0][0]

    # Information gain of every remaining attribute.
    gainz = [information_gain(train_df, attr, target_attribute_name) for attr in attribute_names]

    # Split on the attribute with the highest gain (first one wins on ties).
    best_attr = attribute_names[gainz.index(max(gainz))]

    # Nested dictionary with the best attribute as the node.
    tree = {best_attr: {}}

    remaining_attribute_names = [i for i in attribute_names if i != best_attr]

    # Recurse into the rows belonging to each value of the best attribute.
    for attr_val, data_subset in train_df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(data_subset,
                                        target_attribute_name,
                                        remaining_attribute_names)
    return tree



In [6]:
# Initial entropy of the 'class' attribute for given dataset.
total_entropy = entropy_of_table(train_df['class'])

print("\nTotal Entropy for the given dataset: ",total_entropy,'\n\n')

attribute_names = list(train_df.columns)
print("List of Attributes:", attribute_names,'\n')
attribute_names.remove('class') # Remove the class attribute
print("Predicting Attributes:", attribute_names, '\n\n')


# Call ID3 recursive Algorithm:
tree = id3(train_df,'class',attribute_names)
print("\n\n Decision Tree :\n")
pprint(tree)  # pretty print gives a more readable view of the nested dictionary

# id3 returns a bare class label when every training row has the same class;
# only a dictionary has a root attribute and branches to report.
if isinstance(tree, dict):
    attribute = next(iter(tree))
    print("Best Attribute :\n",attribute)
    print("Tree Keys:\n",tree[attribute].keys())


Total Entropy for the given dataset:  1.0 


List of Attributes: ['Wind', 'Water', 'Air', 'Forecast', 'class'] 

Predicting Attributes: ['Wind', 'Water', 'Air', 'Forecast'] 



 Information Gain Calculation of  Wind
Wind
Strong    0.970951
Weak      0.970951
Name: Entropy, dtype: float64

 Information Gain Calculation of  Water
Water
Cold        0.000000
Moderate    0.918296
Warm        0.811278
Name: Entropy, dtype: float64

 Information Gain Calculation of  Air
Air
Cool    0.918296
Warm    0.811278
Name: Entropy, dtype: float64

 Information Gain Calculation of  Forecast
Forecast
Cloudy    0.000000
Rainy     0.000000
Sunny     0.918296
Name: Entropy, dtype: float64

 Information Gain Calculation of  Wind
Wind
Strong    0.0
Weak      1.0
Name: Entropy, dtype: float64

 Information Gain Calculation of  Water
Water
Cold        0.000000
Moderate    0.000000
Warm        0.918296
Name: Entropy, dtype: float64

 Information Gain Calculation of  Air
Air
Cool    0.918296
Warm    0.918296
Nam

In [7]:
def classify(data):
    """Predict the fishing class ('Yes' / 'No') for one observation.

    ``data`` is indexed by position: 0 = Wind, 1 = Water, 2 = Air,
    3 = Forecast.  The rules below are a hard-coded decision tree; the Air
    value is read but does not take part in any rule.
    """
    wind, water, _air, forecast = data[0], data[1], data[2], data[3]

    if forecast == 'Rainy':
        return 'No'
    if forecast != 'Sunny':
        # Any forecast other than Sunny or Rainy.
        return 'Yes'

    # Sunny forecast: the decision depends on the water, then on the wind.
    if water == 'Moderate':
        return 'Yes'
    if water == 'Warm' and wind == 'Strong':
        return 'Yes'
    return 'No'

In [8]:
# Predict a label for every test row and count the correct predictions.
# classify() reads the predictors by position, so each row is handed over as a
# plain list in column order.  The column is assigned in a single statement:
# the chained form test_df.iloc[i]['Predicted'] = ... writes to a temporary
# object and leaves test_df unchanged when pandas Copy-on-Write is active.
test_df['Predicted'] = [classify(list(row)) for row in test_df.itertuples(index=False, name=None)]
hit_count = int((test_df['Predicted'] == test_df['class']).sum())
test_df


Unnamed: 0,Wind,Water,Air,Forecast,class,Predicted
0,Strong,Moderate,Warm,Rainy,Yes,No
1,Strong,Moderate,Warm,Sunny,Yes,Yes
2,Strong,Cold,Cool,Sunny,Yes,No
3,Weak,Moderate,Warm,Rainy,No,No


In [9]:
# Accuracy: percentage of test rows whose prediction equals the true class.
total_rows = len(test_df)
Accuracy = 100 * hit_count / total_rows
Accuracy

50.0

<h3>2. Contact-lenses Data</h3>
    

In [10]:
# Read the raw file; it has no header row, so column names are supplied here.
file2 = pd.read_csv('contact-lenses.data', names = ["Age", "Prescription", "Astigmatism", "Tear-rate", "class"])
# Skip the first 8 rows (presumably metadata lines in the .data file - TODO confirm)
# and renumber the remaining rows from 0.  reset_index(drop=True) replaces the
# older reset_index().drop('index', 1) form, whose positional `axis` argument is
# rejected by pandas >= 2.0.
data_2 = file2[8:].reset_index(drop=True)

# Split into training (70 %) and test (remaining rows) sets.
train_df = data_2.sample(frac=0.7, random_state=100)  # random_state is the seed value
test_df = data_2.drop(train_df.index).reset_index(drop=True)
train_df


Unnamed: 0,Age,Prescription,Astigmatism,Tear-rate,class
6,young,hypermetrope,yes,reduced,none
9,pre-presbyopic,myope,no,normal,soft
18,presbyopic,myope,yes,reduced,none
21,presbyopic,hypermetrope,no,normal,soft
13,pre-presbyopic,hypermetrope,no,normal,soft
5,young,hypermetrope,no,normal,soft
12,pre-presbyopic,hypermetrope,no,reduced,none
22,presbyopic,hypermetrope,yes,reduced,none
11,pre-presbyopic,myope,yes,normal,hard
4,young,hypermetrope,no,reduced,none


In [11]:
# Functions for calculating entropy:
# NOTE: `entropy` is also defined in an earlier cell of this notebook; this
# later definition replaces it once the cell runs.
def entropy(vals):
    """Return the Shannon entropy (base 2) of a sequence of probabilities.

    Parameters
    ----------
    vals : iterable of float
        Probabilities of the individual classes.

    Returns
    -------
    float or int
        Sum of ``-p * log2(p)`` over the entries; ``0`` for an empty input.

    Raises
    ------
    ValueError
        If an entry is negative (``math.log`` rejects it).
    """
    # By convention 0 * log(0) contributes 0, so zero entries are skipped
    # instead of being passed to math.log, which raises ValueError for 0.
    return sum([-val*math.log(val, 2) for val in vals if val != 0])

def entropy_of_table(series):
    """Return the entropy of the values contained in ``series``.

    Each distinct value is counted, the counts are turned into relative
    frequencies (count / number of entries) and those frequencies are passed
    to ``entropy``.
    """
    n_rows = len(series)  # total number of entries

    # Tally how often each distinct value occurs.
    label_counts = Counter()
    for label in series:
        label_counts[label] += 1

    proportions = [count / n_rows for count in label_counts.values()]
    return entropy(proportions)
    

In [12]:
# Information gain of a single attribute:
def information_gain(train_df, attr_name, target_attribute_name):
    """Return the information gain of splitting ``train_df`` on ``attr_name``.

    Prints a header line followed by the entropy of the target column within
    each value of ``attr_name``.  The returned value is the entropy of the
    target column over the whole frame minus the sum of the per-value
    entropies, each weighted by the share of rows having that value.
    """
    print("\n Information Gain Calculation of ", attr_name)

    n_rows = len(train_df.index)  # number of observations in the frame

    # Per attribute value: entropy of the target and share of all rows.
    aggregators = [entropy_of_table, lambda grp: len(grp)/n_rows]
    grouped = train_df.groupby(attr_name)
    per_value = grouped.agg({target_attribute_name : aggregators})[target_attribute_name]
    per_value.columns = ['Entropy', 'PropObservations']
    print(per_value['Entropy'])  # entropy for each value of the attribute

    entropy_after_split = sum(per_value['Entropy'] * per_value['PropObservations'])
    entropy_before_split = entropy_of_table(train_df[target_attribute_name])
    return entropy_before_split - entropy_after_split
 
#print('\nInformation Gain for *** is :'+ str( information_gain(train_df, attr, 'class')),"\n\n")
    
# Report the information gain of every predictor on the training split.
# Each pair holds the label text (reproduced verbatim) and the attribute name.
for label, attr in (
    ('\nInformation Gain for Age is :', 'Age'),
    ('\nInformation Gain for Prescription is:', 'Prescription'),
    ('\nInformation Gain for Astigmatism is:', 'Astigmatism'),
    ('\nInformation Gain for Tear-rate is:', 'Tear-rate'),
):
    print(label + str(information_gain(train_df, attr, 'class')), "\n\n")





 Information Gain Calculation of  Age
Age
pre-presbyopic    1.521928
presbyopic        1.148835
young             0.970951
Name: Entropy, dtype: float64

Information Gain for Age is :0.1265711441922801 



 Information Gain Calculation of  Prescription
Prescription
hypermetrope    0.881291
myope           1.556657
Name: Entropy, dtype: float64

Information Gain for Prescription is:0.17343770196462627 



 Information Gain Calculation of  Astigmatism
Astigmatism
no     1.000000
yes    0.863121
Name: Entropy, dtype: float64

Information Gain for Astigmatism is:0.3891825234105245 



 Information Gain Calculation of  Tear-rate
Tear-rate
normal     1.435521
reduced    0.000000
Name: Entropy, dtype: float64

Information Gain for Tear-rate is:0.5728389611412549 




In [13]:
# Function for ID3 algorithm:
# NOTE: `id3` is also defined in an earlier cell of this notebook; this later
# definition replaces it once the cell runs.
def id3(train_df, target_attribute_name, attribute_names):
    """Recursively build a decision tree with the ID3 algorithm.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain ``target_attribute_name`` and every
        column listed in ``attribute_names``.
    target_attribute_name : str
        Name of the column holding the class label.
    attribute_names : list
        Names of the columns that may still be used for splitting.

    Returns
    -------
    object or dict
        A class label when the node is a leaf, otherwise a nested dictionary
        ``{best_attribute: {attribute_value: subtree, ...}}``.

    Raises
    ------
    ValueError
        If ``train_df`` contains no rows.
    """
    count_class = Counter(x for x in train_df[target_attribute_name])  # occurrences of each class

    # Without rows there is no class to predict; fail with a clear message
    # instead of the opaque error max() raises on an empty sequence.
    if not count_class:
        raise ValueError("id3() needs at least one training row to build a tree")

    # All observations share one class: return it as a leaf.
    if len(count_class) == 1:
        return next(iter(count_class))

    # No attributes left to split on: return the majority class.
    # most_common() ranks by frequency, whereas max() over the Counter would
    # pick the lexicographically largest label regardless of its count.
    if not attribute_names:
        return count_class.most_common(1)[0][0]

    # Information gain of every remaining attribute.
    gainz = [information_gain(train_df, attr, target_attribute_name) for attr in attribute_names]

    # Split on the attribute with the highest gain (first one wins on ties).
    best_attr = attribute_names[gainz.index(max(gainz))]

    # Nested dictionary with the best attribute as the node.
    tree = {best_attr: {}}

    remaining_attribute_names = [i for i in attribute_names if i != best_attr]

    # Recurse into the rows belonging to each value of the best attribute.
    for attr_val, data_subset in train_df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(data_subset,
                                        target_attribute_name,
                                        remaining_attribute_names)
    return tree



In [14]:
#print("\n Given dataset:\n\n", train_df)

# Initial entropy of the 'class' attribute for given dataset.
total_entropy = entropy_of_table(train_df['class'])

print("\nTotal Entropy for the given dataset: ",total_entropy,'\n\n')

attribute_names = list(train_df.columns)
print("List of Attributes:", attribute_names,'\n')
attribute_names.remove('class') # Remove the class attribute
print("Predicting Attributes:", attribute_names, '\n\n')


# Call ID3 recursive Algorithm:
tree = id3(train_df,'class',attribute_names)
print("\n\n Decision Tree :\n")
pprint(tree)  # pretty print gives a more readable view of the nested dictionary

# id3 returns a bare class label when every training row has the same class;
# only a dictionary has a root attribute and branches to report.
if isinstance(tree, dict):
    attribute = next(iter(tree))
    print("Best Attribute :\n",attribute)
    print("Tree Keys:\n",tree[attribute].keys())


Total Entropy for the given dataset:  1.3328204045850196 


List of Attributes: ['Age', 'Prescription', 'Astigmatism', 'Tear-rate', 'class'] 

Predicting Attributes: ['Age', 'Prescription', 'Astigmatism', 'Tear-rate'] 



 Information Gain Calculation of  Age
Age
pre-presbyopic    1.521928
presbyopic        1.148835
young             0.970951
Name: Entropy, dtype: float64

 Information Gain Calculation of  Prescription
Prescription
hypermetrope    0.881291
myope           1.556657
Name: Entropy, dtype: float64

 Information Gain Calculation of  Astigmatism
Astigmatism
no     1.000000
yes    0.863121
Name: Entropy, dtype: float64

 Information Gain Calculation of  Tear-rate
Tear-rate
normal     1.435521
reduced    0.000000
Name: Entropy, dtype: float64

 Information Gain Calculation of  Age
Age
pre-presbyopic    0.918296
presbyopic        1.500000
young             0.000000
Name: Entropy, dtype: float64

 Information Gain Calculation of  Prescription
Prescription
hypermetrope    0.8112

In [15]:
# Show the first test row as a plain list to inspect the positional layout of one observation.
first_test_row = test_df.iloc[0]
first_test_row.tolist()

['young', 'myope', 'yes', 'reduced', 'none']

In [16]:
def classify(data):
    """Predict the contact-lens class ('none' / 'soft' / 'hard') for one observation.

    ``data`` is indexed by position: 0 = Age, 1 = Prescription,
    2 = Astigmatism, 3 = Tear-rate.  The rules below are a hard-coded
    decision tree.
    """
    age, prescription, astigmatism, tear_rate = data[0], data[1], data[2], data[3]

    # Any tear rate other than 'normal' ends in 'none'.
    if tear_rate != 'normal':
        return 'none'

    # Normal tear rate with astigmatism: the prescription decides.
    if astigmatism == 'yes':
        return 'none' if prescription == 'hypermetrope' else 'hard'

    # Normal tear rate without astigmatism.
    if age in ('pre-presbyopic', 'young'):
        return 'soft'
    return 'soft' if prescription == 'hypermetrope' else 'none'

In [17]:
# Predict a label for every test row and count the correct predictions.
# classify() reads the predictors by position, so each row is handed over as a
# plain list in column order.  The column is assigned in a single statement:
# the chained form test_df.iloc[i]['Predicted'] = ... writes to a temporary
# object and leaves test_df unchanged when pandas Copy-on-Write is active.
test_df['Predicted'] = [classify(list(row)) for row in test_df.itertuples(index=False, name=None)]
hit_count = int((test_df['Predicted'] == test_df['class']).sum())
test_df

  

Unnamed: 0,Age,Prescription,Astigmatism,Tear-rate,class,Predicted
0,young,myope,yes,reduced,none,none
1,young,myope,yes,normal,hard,hard
2,young,hypermetrope,yes,normal,hard,none
3,pre-presbyopic,myope,no,reduced,none,none
4,pre-presbyopic,myope,yes,reduced,none,none
5,pre-presbyopic,hypermetrope,yes,normal,none,none
6,presbyopic,myope,no,reduced,none,none


In [18]:
# Accuracy: percentage of test rows whose prediction equals the true class.
total_rows = len(test_df)
Accuracy = 100 * hit_count / total_rows
Accuracy

85.71428571428571

<h3>3. Iris Data</h3>