In [66]:
import pandas as pd
import numpy as np
# Load the training table from the CSV file in the working directory.
dataset = pd.read_csv('dataset.csv')

def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Every distinct value with relative frequency ``p`` contributes
    ``-p * log2(p)``; the contributions are added up with ``np.sum``.
    An empty input yields ``0.0``.
    """
    _, frequencies = np.unique(target_col, return_counts=True)
    total = np.sum(frequencies)
    proportions = [freq / total for freq in frequencies]
    return np.sum([-p * np.log2(p) for p in proportions])

def InfoGain(data,split_attribute_name,target_name="answer"):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the ``target_name`` column minus the
    weighted entropy of that column within each group of rows sharing a
    value of ``split_attribute_name``.

    Args:
        data: DataFrame containing both columns.
        split_attribute_name: Column whose distinct values define the groups.
        target_name: Column holding the class labels.

    Returns:
        The entropy of the whole target column minus the weighted sum of
        the per-group entropies.
    """
    total_entropy = entropy(data[target_name])
    split_column = data[split_attribute_name]
    vals, counts = np.unique(split_column, return_counts=True)
    n_rows = np.sum(counts)

    # Select each group's labels with a boolean mask. The previous
    # ``data.where(...).dropna()`` also discarded rows that had a missing
    # value in any *other* column, so the group entropy could be computed on
    # fewer rows than the weight ``count / n_rows`` accounted for.
    weighted_entropy = np.sum([
        (count / n_rows) * entropy(data.loc[split_column == val, target_name])
        for val, count in zip(vals, counts)
    ])
    return total_entropy - weighted_entropy

def _majority_class(labels):
    """Return the most frequent value in ``labels``.

    Ties go to the value that comes first in ``np.unique``'s sorted order.
    """
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def ID3(data,originaldata,features,target_attribute_name="answer",parent_node_class = None):
    """Build a decision tree with the ID3 algorithm.

    Args:
        data: DataFrame of the rows reaching the current node.
        originaldata: The full training DataFrame; its majority class is the
            answer when ``data`` has no rows.
        features: Sequence of column names still available for splitting.
        target_attribute_name: Column holding the class labels.
        parent_node_class: Majority class label of the parent node; returned
            when no features are left to split on.

    Returns:
        Either a class label (leaf) or a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}``.
    """
    # The empty check has to come before the purity check: with no rows
    # there are zero unique labels, and indexing ``[0]`` would raise.
    if len(data) == 0:
        return _majority_class(originaldata[target_attribute_name])

    labels = np.unique(data[target_attribute_name])
    if len(labels) <= 1:
        # Pure node: every row carries the same label.
        return labels[0]

    # ``len(...)`` rather than truthiness: ``features`` may be a pandas Index,
    # whose truth value is ambiguous.
    if len(features) == 0:
        return parent_node_class

    # Keep the actual label (not its position among the unique values) so it
    # can be returned directly as a leaf further down.
    parent_node_class = _majority_class(data[target_attribute_name])

    gains = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [f for f in features if f != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        # A boolean mask keeps rows intact; where().dropna() would also drop
        # rows that have missing values in unrelated columns.
        sub_data = data[data[best_feature] == value]
        # Pass the caller's ``originaldata`` through instead of reaching for a
        # module-level variable.
        tree[best_feature][value] = ID3(
            sub_data,
            originaldata,
            remaining_features,
            target_attribute_name,
            parent_node_class,
        )
    return tree
    
def predict(query,tree,default = 1):
    """Classify ``query`` by walking ``tree`` from the root to a leaf.

    Args:
        query: Mapping of feature name to the instance's value.
        tree: Nested dict of the form ``{feature: {value: subtree_or_label}}``,
            or a bare label.
        default: Value returned when the walk cannot continue: no key of
            ``query`` appears at the current node, or the instance's value
            for the node's feature has no branch.

    Returns:
        The label stored at the reached leaf, or ``default``.
    """
    node = tree
    # Iterating keeps ``default`` in force at every depth; the earlier
    # recursive call dropped it and fell back to the signature default.
    while isinstance(node, dict):
        for feature in query:
            if feature in node:
                break
        else:
            # No feature of the query is tested at this node.
            return default
        try:
            node = node[feature][query[feature]]
        except (KeyError, TypeError):
            # Branch value not in the tree (or not usable as a dict key).
            return default
    return node
            
def test(dataset,tree):
    """Print a per-row prediction report and the overall accuracy.

    Each row of ``dataset`` is turned into a dict and classified with
    ``predict`` (using ``1.0`` as the fallback label). The printed "actual
    label" is the row's last value; accuracy is measured against the
    ``"answer"`` column.

    Args:
        dataset: DataFrame with the feature columns and an ``"answer"`` column.
        tree: Decision tree in the nested-dict form accepted by ``predict``.
    """
    queries = dataset.to_dict(orient = "records")
    predictions = []
    for i, query in enumerate(queries):
        label = predict(query, tree, 1.0)
        predictions.append(label)
        row_values = list(query.values())
        print("\nInstance:",i+1)
        print(row_values)
        print("Actual label:",row_values[-1],end ='\t')
        print('Predicted Label:',label)

    # Compare by position so a non-default DataFrame index (e.g. a held-out
    # slice) cannot break the comparison.
    correct = sum(
        predicted == actual
        for predicted, actual in zip(predictions, dataset["answer"])
    )
    total = len(dataset)
    # With no rows there is nothing to score; report NaN instead of dividing
    # by zero.
    accuracy = (correct / total) * 100 if total else float("nan")
    print('\nThe prediction accuracy is: ',accuracy,'%')
    
# Train on the full table: every column except the last is a candidate feature.
tree = ID3(dataset, dataset, dataset.columns[:-1])

# Show the learned tree, then score it against the same rows it was built from.
print("Decision Tree :", tree, sep="\n")
test(dataset, tree)

Decision Tree :
{'outlook': {'Overcast': 'Yes', 'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}}, 'Sunny': {'humidity': {'High': 'No', 'Normal': 'Yes'}}}}

Instance: 1
['Sunny', 'Hot', 'High', 'Weak', 'No']
Actual label: No	Predicted Label: No

Instance: 2
['Sunny', 'Hot', 'High', 'Strong', 'No']
Actual label: No	Predicted Label: No

Instance: 3
['Overcast', 'Hot', 'High', 'Weak', 'Yes']
Actual label: Yes	Predicted Label: Yes

Instance: 4
['Rain', 'Mild', 'High', 'Weak', 'Yes']
Actual label: Yes	Predicted Label: Yes

Instance: 5
['Rain', 'Cool', 'Normal', 'Weak', 'Yes']
Actual label: Yes	Predicted Label: Yes

Instance: 6
['Rain', 'Cool', 'Normal', 'Strong', 'No']
Actual label: No	Predicted Label: No

Instance: 7
['Overcast', 'Cool', 'Normal', 'Strong', 'Yes']
Actual label: Yes	Predicted Label: Yes

Instance: 8
['Sunny', 'Mild', 'High', 'Weak', 'No']
Actual label: No	Predicted Label: No

Instance: 9
['Sunny', 'Cool', 'Normal', 'Weak', 'Yes']
Actual label: Yes	Predicted Label: Yes

Inst