## Decision Stump

### Data Split

In [469]:
import pandas as pd
import numpy as np
import pprint

array = np.array([('sunny','hot','high','weak','no'),
                ('sunny','hot','high','strong','no'),
                ('overcast','hot','high','weak','yes'),
                ('rain','mild','high','weak','yes'),
                ('rain','cool','normal','weak','yes'),
                ('rain','cool','normal','strong','no'),
                ('overcast','cool','normal','strong','yes'),
                ('sunny','mild','high','weak','no'),
                ('sunny','cool','normal','weak','yes'),
                ('rain','mild','normal','weak','yes'),
                ('sunny','mild','normal','strong','no'),
                ('overcast','mild','high','strong','yes'),
                ('overcast','hot','normal','weak','yes'),
                ('rain','mild','high','strong','no'),
                ('sunny','mild','high','strong','no')])
np.random.shuffle(array)
training, test = array[:10,:], array[10:,:]

def test_split(test_data):
    return test_data[:,:4],test_data[:,4]

def train_split(train_data):
    return train_data[:,:4],train_data[:,4]

X_test, y_test = test_split(test)
X_train, y_train = train_split(training)
################################################
#TO HAVE A BETTER UNDERSTANDING OF THE DATASET
################################################
play_dict = {'outlook': [], 'temperature': [], 'humidity': [], 'wind': [], 'playtennis': []}

for row in array:
    play_dict['outlook'].append(row[0])
    play_dict['temperature'].append(row[1])
    play_dict['humidity'].append(row[2])
    play_dict['wind'].append(row[3])
    play_dict['playtennis'].append(row[4])
                 
df = pd.DataFrame(play_dict)
df

Unnamed: 0,outlook,temperature,humidity,wind,playtennis
0,rain,cool,normal,strong,no
1,rain,mild,high,strong,no
2,rain,mild,normal,weak,yes
3,sunny,hot,high,strong,no
4,sunny,mild,normal,strong,no
5,rain,mild,high,weak,yes
6,sunny,cool,normal,weak,yes
7,overcast,hot,normal,weak,yes
8,sunny,hot,high,weak,no
9,overcast,mild,high,strong,yes


### Decision Tree Model

In [484]:
import math
import collections
class DecisionStump:
    '''
    Decision tree with depth=1 that should be able to handle 
    discrete data as seen in the data.
    
    Assumption: All features and target are discrete valued.
    
    The single node must be chosen using infromation gain and 
    entropy.
    '''
    
    def __init__(self):
        '''
        '''
        #print("In init")
    
    def entropy(self, left, right, total_instances):
        # left - no of yeses when split
        # right - no of nos when split
        # n - no_of_splits
        if(left == total_instances or right == total_instances):
            return 0
        elif(left == right):
            return 1
        else:
            prob_left = -left/total_instances
            prob_right = -right/total_instances
            #print(prob_right, prob_left,(math.log(left)/math.log(total_instances)))
            entropy = prob_left*(math.log(left/total_instances))+ prob_right*(math.log(right/total_instances))
            return entropy      
    
    def information_gain(self, X, Y, feature, classes):
        # feature - column no of feature to calculate information gain
        #print("Inside information gain")
        #print(feature,classes)
        dict_of_splits = {}
        #print(X,Y)
        #print(len(X))
        Y = Y.reshape((len(Y),1))
        unique, counts = np.unique(Y, return_counts=True)
        count_yes_no = dict(zip(unique, counts))
        #print(count_yes_no)
        #print(Y)
        data = np.concatenate((X,Y.reshape((len(Y),1))), axis=1)
        #pprint.pprint(data)
        
        ###################################################################
        count_classes_split = {}
        for value in classes:
            dict_of_splits[value] = []
            data_list = []
            for row in data:
                #print(row[feature])
                count_classes_split[value] = 0
                if row[feature]==value:
                    data_list.append(row.tolist())
                    if value in count_classes_split:
                        count_classes_split[value] += 1
                    else:
                        count_classes_split[value] = 1
            dict_of_splits[value].append(data_list)
            
        #pprint.pprint(dict_of_splits)
        #pprint.pprint(count_classes_split)
        
        ##################################################################
        information_gain = []
        entropy_split = {}
        #print("--------------------------------------------------------")
        #print("current entropy:",count_yes_no["yes"],count_yes_no["no"])    
        current_entropy= self.entropy(count_yes_no.get("yes", 0),count_yes_no.get("no", 0),count_yes_no.get("yes", 0)+count_yes_no.get("no", 0))
        #print(current_entropy)
        
        ##################################################################
        classification_feature_split = {}
        decision_stump = {}
        ##################################################################
        for each_class in dict_of_splits.items():
            classification_feature_split = {}
            class_name, data = each_class
            #print(data)
            for list_of_lists in data:
                for list1 in list_of_lists:
                    classification_class = list1[len(list1)-1]
                    if classification_class in classification_feature_split:
                        classification_feature_split[classification_class] += 1
                    else:
                        classification_feature_split[classification_class] = 1
            #print("-------------")
            #print("###################")
            #print(class_name)
            #pprint.pprint(classification_feature_split)
            decision_stump[class_name]=classification_feature_split
            ####################################################
            
        
            ####################################################
            yes_no = []
            '''
            for val in classification_feature_split:
                print(val)
                yes_no.append(classification_feature_split[val])
            print(yes_no)
            print("current entropy:",classification_feature_split["yes"],classification_feature_split["no"],classification_feature_split["yes"]+classification_feature_split["no"]
            
            '''
            e= self.entropy(classification_feature_split.get("yes", 0),classification_feature_split.get("no", 0),classification_feature_split.get("yes", 0)+classification_feature_split.get("no", 0))
                    
            entropy_split[class_name]=e
             
        #print(entropy_split)
        summation = 0
        for value in count_classes_split:
            #print(entropy_split[value],count_classes_split[value])
            summation += entropy_split[value]*(count_classes_split[value]/len(X))
            #print("summation for:",value,":",summation)
        return current_entropy - summation, decision_stump
        
        
    def fit(self, X, Y):
        #print("In fit")
        '''
        The fit method takes in two arguments: X is 2 dimensional numpy array, which is the training 
        instances and Y which is the class labels corresponding to the training instances X.  
        Y will be a one dimensional numpy array. 
        The fit method must take in the training data (X, Y) and build a decision stump.
        '''
        #print("Inside fit")
        if(X is None or Y is None):
            print("X_train or y_train is empty")
        else:
            max_information_gain = 0
            max_ig_feature = 0
            for i in range(X.shape[1]):
                #print(i)
                unique, counts = np.unique(X[:,i], return_counts=True)
                #print(unique)
                ig,decision_stump = self.information_gain(X, Y, i, unique)
                #print("Information gain for feature", i,"is",ig)
                if(ig > max_information_gain):
                    max_information_gain = ig
                    self.max_ig_feature = i
                    self.decision_stump = decision_stump

            #print(max_information_gain,i)
            unique, counts = np.unique(X[:,self.max_ig_feature], return_counts=True)

            #print(unique,self.max_ig_feature)
    
    def predict(self,X_predict):
        #print("In predict",X_predict)
        '''
         The predict method takes in a set of instances X_predict which has the same dimensions
         as the training instances X and will also be a 2-dimensional numpy array. 
         The predict method must output a one dimensional array of the target classes predicted by 
         the decision stump, corresponding to each of the X_predict instances.
        '''
         
        #print(self.decision_stump,self.max_ig_feature)
        decision_stump = {}
        for item in self.decision_stump.items():
            key,value = item
            maxi = 0
            #print(key)
            for key1 in value:
                #print(key1)
                if value[key1]>maxi:
                    maxi = value[key1]
                    maxi_ans = key1
            #print(maxi_ans)
            decision_stump[key]=maxi_ans
            
        y_predict = []
        
        for row in X_predict[:,self.max_ig_feature]:
            y_predict.append(decision_stump[row])
            
        return y_predict 
            

In [485]:
ds = DecisionStump()

In [486]:
ds.fit(X_train,y_train)

In [487]:
ds.predict(X_test)

['yes', 'yes', 'no', 'no', 'no']