In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import scipy.stats as sps
from sklearn import metrics


dataset = pd.read_csv(r'diabetes.csv',header=None)
dataset = dataset.sample(frac=1)
dataset.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age',
           'Outcome']


In [2]:
def entropy(target_col):
    elements,counts = np.unique(target_col,return_counts = True)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy


########################################################################################################### 
###########################################################################################################


def InfoGain(data,split_attribute_name,target_name="target"):
    
    #Calculate the entropy of the total dataset
    total_entropy = entropy(data[target_name])
    
    ##Calculate the entropy of the dataset
    
    #Calculate the values and the corresponding counts for the split attribute 
    vals,counts= np.unique(data[split_attribute_name],return_counts=True)
    
    #Calculate the weighted entropy
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    
    #Calculate the information gain
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain
       
###########################################################################################################
###########################################################################################################


def ID3(data,originaldata,features,target_attribute_name="Outcome",parent_node_class = None):
    #Define the stopping criteria --> If one of this is satisfied, we want to return a leaf node#
    
    #If all target_values have the same value, return this value
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    #If the dataset is empty, return the mode target feature value in the original dataset
    elif len(data)==0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name],return_counts=True)[1])]
    
    #If the feature space is empty, return the mode target feature value of the direct parent node --> Note that
    #the direct parent node is that node which has called the current run of the ID3 algorithm and hence
    #the mode target feature value is stored in the parent_node_class variable.
    
    elif len(features) ==0:
        return parent_node_class
    
    #If none of the above holds true, grow the tree!
    
    else:
        #Set the default value for this node --> The mode target feature value of the current node
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name],return_counts=True)[1])]
        
        
        ################################################################################################################
        ############!!!!!!!!!Implement the subspace sampling. Draw a number of m = sqrt(p) features!!!!!!!!#############
        ###############################################################################################################
        
        
        
        
        features = np.random.choice(features,size=np.int_(np.sqrt(len(features))),replace=False)
        
        #Select the feature which best splits the dataset
        item_values = [InfoGain(data,feature,target_attribute_name) for feature in features] #Return the information gain values for the features in the dataset
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
        #Create the tree structure. The root gets the name of the feature (best_feature) with the maximum information
        #gain in the first run
        tree = {best_feature:{}}
        
        #Remove the feature with the best inforamtion gain from the feature space
        features = [i for i in features if i != best_feature]
        
        
        #Grow a branch under the root node for each possible value of the root node feature
        
        for value in np.unique(data[best_feature]):
            value = value
            #Split the dataset along the value of the feature with the largest information gain and therwith create sub_datasets
            sub_data = data.where(data[best_feature] == value).dropna()
            
            #Call the ID3 algorithm for each of those sub_datasets with the new parameters --> Here the recursion comes in!
            subtree = ID3(sub_data,dataset,features,target_attribute_name,parent_node_class)
            
            #Add the sub tree, grown from the sub_dataset to the tree under the root node
            tree[best_feature][value] = subtree
            
        return(tree)    
    
                
###########################################################################################################
###########################################################################################################

    
def predict(query,tree,default = 'p'):
        
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
                result = tree[key][query[key]] 
            except:
                return default
            result = tree[key][query[key]]
            if isinstance(result,dict):
                return predict(query,result)

            else:
                return result

        
        

###########################################################################################################
###########################################################################################################

def train_test_split(dataset):
    training_data = dataset.iloc[:round(0.75*len(dataset))].reset_index(drop=True)#We drop the index respectively relabel the index
    #starting form 0, because we do not want to run into errors regarding the row labels / indexes
    testing_data = dataset.iloc[round(0.75*len(dataset)):].reset_index(drop=True)
    return training_data,testing_data


training_data = train_test_split(dataset)[0]
testing_data = train_test_split(dataset)[1] 



###########################################################################################################
###########################################################################################################



In [3]:
#######Train the Random Forest model###########

def RandomForest_Train(dataset,number_of_Trees):
    #Create a list in which the single forests are stored
    random_forest_sub_tree = []
    
    #Create a number of n models
    for i in range(number_of_Trees):
        #Create a number of bootstrap sampled datasets from the original dataset 
        bootstrap_sample = dataset.sample(frac=1,replace=True)
        
        #Create a training and a testing datset by calling the train_test_split function
        bootstrap_training_data = train_test_split(bootstrap_sample)[0]
        bootstrap_testing_data = train_test_split(bootstrap_sample)[1] 
        
        
        #Grow a tree model for each of the training data
        #We implement the subspace sampling in the ID3 algorithm itself. Hence take a look at the ID3 algorithm above!
        random_forest_sub_tree.append(ID3(bootstrap_training_data,bootstrap_training_data,bootstrap_training_data.drop(labels=['Outcome'],axis=1).columns))
        
    return random_forest_sub_tree


        
random_forest = RandomForest_Train(dataset,10)

In [4]:
#######Predict a new query instance###########
def RandomForest_Predict(query,random_forest,default='1'):
    predictions = []
    for tree in random_forest:
        predictions.append(predict(query,tree,default))
    return sps.mode(predictions)[0][0]


query = testing_data.iloc[0,:].drop('Outcome').to_dict()
query_target = testing_data.iloc[0,-1]
print('Outcome ',query_target)
prediction = RandomForest_Predict(query,random_forest)
print('prediction: ',prediction)



Outcome  1
prediction:  0


In [5]:
#######Test the model on the testing data and return the accuracy###########
def RandomForest_Test(data,random_forest):
    data['predictions'] = None
    for i in range(len(data)):
        query = data.iloc[i,:].drop('Outcome').to_dict()
        data.loc[i,'predictions'] = RandomForest_Predict(query,random_forest,default='1')
#     print("Predictions for correct",sum(data['predictions']=='1'))
#     print("Prections for wrong",sum(data['predictions']=='0'))
#     print("Actual Outcome for correct",sum(data['Outcome']=='1'))
#     print("Actual Outcome for wrong",sum(data['Outcome']=='0'))
    accuracy = sum(data['predictions'] == data['Outcome'])/len(data)*100
#     print('The prediction accuracy is: ',sum(data['predictions'] == data['target'])/len(data)*100,'%')
    
    print("Confusion Matrix: ")
    print(metrics.confusion_matrix(data['Outcome'],data['predictions'], labels=["0","1"]))
    print()
    
    print("Whole Classification Report:")
    print(metrics.classification_report(data['Outcome'],data['predictions'], labels=["0","1"]))
    print("################################################################################")
    
    
    return accuracy
        
        
        
RandomForest_Test(testing_data,random_forest)

Confusion Matrix: 
[[115   4]
 [ 37  36]]

Whole Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.97      0.85       119
           1       0.90      0.49      0.64        73

    accuracy                           0.79       192
   macro avg       0.83      0.73      0.74       192
weighted avg       0.81      0.79      0.77       192

################################################################################


78.64583333333334

In [6]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')

In [None]:
##############################################################################################################
##########Plot the prediction accuracy with respect to the number of Trees in the random forests#############
##############################################################################################################


accuracy = []

for i in range(1,11,1):
    random_forest = RandomForest_Train(dataset,i)
    print("No. of Trees: ",i)
    accuracy.append(RandomForest_Test(testing_data,random_forest))

No. of Trees:  1
Confusion Matrix: 
[[82 20]
 [ 8 48]]

Whole Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.69      0.78       119
           1       0.71      0.66      0.68        73

   micro avg       0.82      0.68      0.74       192
   macro avg       0.81      0.67      0.73       192
weighted avg       0.83      0.68      0.75       192

################################################################################
No. of Trees:  2
Confusion Matrix: 
[[116   3]
 [ 51  22]]

Whole Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.97      0.81       119
           1       0.88      0.30      0.45        73

    accuracy                           0.72       192
   macro avg       0.79      0.64      0.63       192
weighted avg       0.77      0.72      0.67       192

################################################################################
No. of Tre

In [None]:
accuracy

In [None]:
fig = plt.figure(figsize=(15,10))

ax0 = fig.add_subplot(111)

ax0.plot(np.logspace(0,1,10),accuracy)
ax0.set_yticks(np.linspace(50,100,50))
ax0.set_title("Proposed Accuracy Random Forest Graph")
ax0.set_xscale('log')
ax0.set_xlabel("Trees")
ax0.set_ylabel('Sensitivity(%)')

plt.show()
