# AI Lab 6 Q1 Meher Shrishti Nigam 20BRS1193

In [7]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [8]:
import pandas as pd 
import numpy as np 

# Quick value count calculator
from collections import Counter

In [9]:
dataset = pd.read_csv('tennis2.csv',  names=['outlook','temp','humidity','wind','play'])
dataset=dataset.drop('day',axis=0)  

In [10]:
dataset.head(5)

Unnamed: 0,outlook,temp,humidity,wind,play
D1,Sunny,Hot,High,Weak,No
D2,Sunny,Hot,High,Strong,No
D3,Overcast,Hot,High,Weak,Yes
D4,Rain,Mild,High,Weak,Yes
D5,Rain,Cool,Normal,Weak,Yes


In [11]:
def calc_total_entropy(train_data, label, class_list):
    total_row = train_data.shape[0] #the total size of the dataset
    total_entr = 0
    
    for c in class_list: # for each class in the label
        total_class_count = train_data[train_data[label] == c].shape[0] #number of the class
        total_class_entr = - (total_class_count/total_row)*np.log2(total_class_count/total_row) #entropy of the class
        total_entr += total_class_entr #adding the class entropy to the total entropy of the dataset        
    print("Entropy before splitting ")
    print(total_entr)
    return total_entr

In [12]:
def calc_entropy(feature_value_data, label, class_list):
    class_count = feature_value_data.shape[0]
    entropy = 0
    
    for c in class_list:
        label_class_count = feature_value_data[feature_value_data[label] == c].shape[0] #row count of class c 
        
        entropy_class = 0
        if label_class_count != 0:
            probability_class = label_class_count/class_count #probability of the class
            entropy_class = - probability_class * np.log2(probability_class)  #entropy
        entropy += entropy_class
    return entropy

In [13]:
def calc_info_gain(feature_name, train_data, label, class_list):
    feature_value_list = train_data[feature_name].unique() #unqiue values of the feature
    total_row = train_data.shape[0]
    feature_info = 0.0
    
    for feature_value in feature_value_list: #all possible values of a feature
        feature_value_data = train_data[train_data[feature_name] == feature_value] #filtering rows with that feature_value
        feature_value_count = feature_value_data.shape[0]
        feature_value_entropy = calc_entropy(feature_value_data, label, class_list) #calculcating entropy for the feature value
        feature_value_probability = feature_value_count/total_row
        feature_info += feature_value_probability * feature_value_entropy #calculating information of the feature value
        
    return calc_total_entropy(train_data, label, class_list) - feature_info #calculating information gain by subtracting

In [14]:
#find feature with hightest information gain
def find_most_informative_feature(train_data, label, class_list):
    feature_list = train_data.columns.drop(label) # feature names in the dataset
                                            
    max_info_gain = -1
    max_info_feature = None
    
    for feature in feature_list:  
        feature_info_gain = calc_info_gain(feature, train_data, label, class_list)
        if max_info_gain < feature_info_gain: 
            max_info_gain = feature_info_gain
            max_info_feature = feature
            
    return max_info_feature

In [15]:
def generate_sub_tree(feature_name, train_data, label, class_list):
    feature_value_count_dict = train_data[feature_name].value_counts(sort=False) #dictionary of the count of unqiue feature value
    tree = {} #sub tree or node
    
    for feature_value, count in feature_value_count_dict.iteritems():
        feature_value_data = train_data[train_data[feature_name] == feature_value] #dataset with only feature_name = feature_value
        
        assigned_to_node = False #flag for tracking feature_value is pure class or not
        for c in class_list: #for each class
            class_count = feature_value_data[feature_value_data[label] == c].shape[0] #count of class c

            if class_count == count: #count of feature_value = count of class (pure class)
                tree[feature_value] = c #adding node to the tree
                train_data = train_data[train_data[feature_name] != feature_value] #removing rows with feature_value
                assigned_to_node = True
        if not assigned_to_node: #not pure class
            tree[feature_value] = "?" #should extend the node, so the branch is marked with ?
            
    return tree, train_data

In [16]:
def make_tree(root, prev_feature_value, train_data, label, class_list):
    if train_data.shape[0] != 0: #if dataset becomes empty after updating
        max_info_feature = find_most_informative_feature(train_data, label, class_list) #most informative feature
        tree, train_data = generate_sub_tree(max_info_feature, train_data, label, class_list) #getting tree node and updated dataset
        next_root = None
        
        if prev_feature_value != None: #add to intermediate node of the tree
            root[prev_feature_value] = dict()
            root[prev_feature_value][max_info_feature] = tree
            next_root = root[prev_feature_value][max_info_feature]
        else: #add to root of the tree
            root[max_info_feature] = tree
            next_root = root[max_info_feature]
        
        for node, branch in list(next_root.items()): #iterating the tree node
            if branch == "?": #if it is expandable
                feature_value_data = train_data[train_data[max_info_feature] == node] #using the updated dataset
                make_tree(next_root, node, feature_value_data, label, class_list) #recursive call with updated dataset

In [17]:
def id3(train_data_m, label):
    train_data = train_data_m.copy() #getting a copy of the dataset
    tree = {} #tree which will be updated
    class_list = train_data[label].unique() #getting unqiue classes of the label
    make_tree(tree, None, train_data_m, label, class_list) #start calling recursion
    return tree

In [18]:
tree = id3(dataset, 'play')

Entropy before splitting 
0.9402859586706311
Entropy before splitting 
0.9402859586706311
Entropy before splitting 
0.9402859586706311
Entropy before splitting 
0.9402859586706311
Entropy before splitting 
0.9709505944546686
Entropy before splitting 
0.9709505944546686
Entropy before splitting 
0.9709505944546686
Entropy before splitting 
0.9709505944546686
Entropy before splitting 
0.9709505944546686
Entropy before splitting 
0.9709505944546686
Entropy before splitting 
0.9709505944546686
Entropy before splitting 
0.9709505944546686


In [19]:
def predict(tree, instance):
    if not isinstance(tree, dict): #if it is leaf node
        return tree #return the value
    else:
        root_node = next(iter(tree)) #getting first key/feature name of the dictionary
        feature_value = instance[root_node] #value of the feature
        if feature_value in tree[root_node]: #checking the feature value in current tree node
            return predict(tree[root_node][feature_value], instance) #goto next feature
        else:
            return None

In [20]:
def evaluate(tree, test_data_m, label):
    correct_preditct = 0
    wrong_preditct = 0
    for index, row in test_data_m.iterrows(): #for each row in the dataset
        result = predict(tree, test_data_m.iloc[index]) #predict the row
        if result == test_data_m[label].iloc[index]: #predicted value and expected value is same or not
            correct_preditct += 1 
        else:
            wrong_preditct += 1 
    accuracy = correct_preditct / (correct_preditct + wrong_preditct) #calculating accuracy
    return accuracy

In [21]:
test_dataset = pd.read_csv("tennis_test.csv")
accuracy = evaluate(tree, test_dataset, 'play') # evaluating the test dataset
print(accuracy)

0.8


# DecisionTree using sklearn -> 

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [23]:
# load dataset
col_names=['outlook','temp','humidity','wind','play']
dt = pd.read_csv("tennis.csv", header=None, names=col_names)
print(dt)

      outlook  temp humidity    wind play
D1      Sunny   Hot     High    Weak   No
D2      Sunny   Hot     High  Strong   No
D3   Overcast   Hot     High    Weak  Yes
D4       Rain  Mild     High    Weak  Yes
D5       Rain  Cool   Normal    Weak  Yes
D6       Rain  Cool   Normal  Strong   No
D7   Overcast  Cool   Normal  Strong  Yes
D8      Sunny  Mild     High    Weak   No
D9      Sunny  Cool   Normal    Weak  Yes
D10      Rain  Mild   Normal    Weak  Yes
D11     Sunny  Mild   Normal  Strong  Yes
D12  Overcast  Mild     High  Strong  Yes
D13  Overcast   Hot   Normal    Weak  Yes
D14      Rain  Mild     High  Strong   No


In [24]:
# preprocessing - to deal with categorical nominal data 
!pip install scikit-learn
from sklearn.preprocessing import OneHotEncoder

Defaulting to user installation because normal site-packages is not writeable


In [25]:
feature_cols = ['outlook','temp','humidity','wind']
X = dt[feature_cols] # Features
y = dt.play # Target variable

# Using OneHotEncoder
ohe = OneHotEncoder(sparse=False)
feature_array = ohe.fit_transform(dt[['outlook','temp','humidity','wind']])
print(feature_array)

[[0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
 [0. 0. 1. 0. 1. 0. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 1. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 1. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]
 [0. 0. 1. 0. 0. 1. 1. 0. 0. 1.]
 [0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 0. 1. 0. 1. 0. 1.]
 [0. 0. 1. 0. 0. 1. 0. 1. 1. 0.]
 [1. 0. 0. 0. 0. 1. 1. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0. 0. 1. 0. 1.]
 [0. 1. 0. 0. 0. 1. 1. 0. 1. 0.]]


In [26]:
# Getting the labels
feature_labels = ohe.categories_
print(feature_labels)
print("\n")
feature_labels = np.array(feature_labels,dtype=object).ravel()
print(feature_labels)

[array(['Overcast', 'Rain', 'Sunny'], dtype=object), array(['Cool', 'Hot', 'Mild'], dtype=object), array(['High', 'Normal'], dtype=object), array(['Strong', 'Weak'], dtype=object)]


[array(['Overcast', 'Rain', 'Sunny'], dtype=object)
 array(['Cool', 'Hot', 'Mild'], dtype=object)
 array(['High', 'Normal'], dtype=object)
 array(['Strong', 'Weak'], dtype=object)]


In [27]:
cols = ['Overcast', 'Rain', 'Sunny','Cool', 'Hot', 'Mild','High', 'Normal','Strong', 'Weak']

In [28]:
# Creating final dataframe
dt2 = pd.DataFrame(feature_array, columns = cols)
print(dt2)

    Overcast  Rain  Sunny  Cool  Hot  Mild  High  Normal  Strong  Weak
0        0.0   0.0    1.0   0.0  1.0   0.0   1.0     0.0     0.0   1.0
1        0.0   0.0    1.0   0.0  1.0   0.0   1.0     0.0     1.0   0.0
2        1.0   0.0    0.0   0.0  1.0   0.0   1.0     0.0     0.0   1.0
3        0.0   1.0    0.0   0.0  0.0   1.0   1.0     0.0     0.0   1.0
4        0.0   1.0    0.0   1.0  0.0   0.0   0.0     1.0     0.0   1.0
5        0.0   1.0    0.0   1.0  0.0   0.0   0.0     1.0     1.0   0.0
6        1.0   0.0    0.0   1.0  0.0   0.0   0.0     1.0     1.0   0.0
7        0.0   0.0    1.0   0.0  0.0   1.0   1.0     0.0     0.0   1.0
8        0.0   0.0    1.0   1.0  0.0   0.0   0.0     1.0     0.0   1.0
9        0.0   1.0    0.0   0.0  0.0   1.0   0.0     1.0     0.0   1.0
10       0.0   0.0    1.0   0.0  0.0   1.0   0.0     1.0     1.0   0.0
11       1.0   0.0    0.0   0.0  0.0   1.0   1.0     0.0     1.0   0.0
12       1.0   0.0    0.0   0.0  1.0   0.0   0.0     1.0     0.0   1.0
13    

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dt2, y, test_size=0.2, random_state=42)

In [30]:
# print(X, y)
model_dec = DecisionTreeClassifier(max_depth = 3, random_state = 42, criterion = 'entropy')

# Train Decision Tree Classifer
model_dec = model_dec.fit(x_train,y_train)

#Predict the response for test dataset
y_pred = model_dec.predict(x_test)

print(y_pred)
print(y_test)

accuracy=model_dec.score(x_test,y_test)
print(accuracy)

['Yes' 'Yes' 'No']
D10    Yes
D12    Yes
D1      No
Name: play, dtype: object
1.0


### Our accuracy is 100%. y_pred and y_test are also the same. 

In [38]:
from sklearn.tree import export_text
text_representation = export_text(model_dec)
print(text_representation)

|--- feature_0 <= 0.50
|   |--- feature_7 <= 0.50
|   |   |--- feature_1 <= 0.50
|   |   |   |--- class: No
|   |   |--- feature_1 >  0.50
|   |   |   |--- class: No
|   |--- feature_7 >  0.50
|   |   |--- feature_8 <= 0.50
|   |   |   |--- class: Yes
|   |   |--- feature_8 >  0.50
|   |   |   |--- class: No
|--- feature_0 >  0.50
|   |--- class: Yes

