<a href="https://colab.research.google.com/github/rposhala/Random-Forest-Algorithm-using-Python/blob/master/Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#loading libraries
import numpy as np
import pandas as pd
from pprint import pprint

# Random Forest algorithm implemented with NumPy (the generated decision trees can be inspected by printing the output of `decisiontree(df)` inside `RandomForest`)

In [0]:
def RandomForest(X_train,Y_train,X_test,n_trees=15):
    """Train a random forest on the training data and label the test samples.

    Each tree is grown on a bootstrap sample of the training rows. At every
    node a random subset of floor(sqrt(n_features)) columns is examined and
    the threshold with the lowest weighted Gini impurity is used to split.
    A test sample receives the label predicted by most trees.

    Parameters
    ----------
    X_train : 2-D array-like of numeric features, one row per sample.
    Y_train : 1-D array-like of labels, one per row of ``X_train``.
    X_test : iterable of samples indexable by column number.
    n_trees : number of bootstrapped trees to grow (default 15).

    Returns
    -------
    list
        One predicted label per test sample. Labels are read back from the
        combined feature/label array, so they carry that array's dtype.

    Raises
    ------
    ValueError
        If ``n_trees`` is smaller than 1 or the training set is empty.
    """
    if n_trees < 1:
        raise ValueError("n_trees must be at least 1")

    # Attach the labels as the last column so that filtering or resampling
    # rows keeps every sample together with its label.
    Y_train = np.asarray(Y_train).reshape(-1, 1)
    df = np.append(X_train, Y_train, axis=1)
    if len(df) == 0:
        raise ValueError("training set is empty")

    ## to calculate gini index
    def gini(df):
        _, x_count = np.unique(df[:, -1], return_counts=True)
        x_prob = x_count / np.sum(x_count)
        return 1 - np.dot(x_prob, x_prob)

    ## function picks the best value to split the real valued attribute
    def pick_the_value(df, n):
        """Return [threshold, loss, rows_above, rows_below, n] or [] if column n cannot be split."""
        column = df[:, n]
        unique = np.unique(column)  # np.unique already returns sorted values
        # Candidate thresholds are the midpoints between consecutive values.
        split_values = (unique[1:] + unique[:-1]) / 2
        if len(split_values) > 100:
            # Too many candidates: fall back to 99 evenly spaced thresholds
            # strictly inside the column's value range.
            steps = np.ptp(column) / 100
            split_values = np.min(column) + np.arange(1, 100) * steps

        n_best = []
        loss = np.inf
        for i in split_values:
            above = df[column > i]
            below = df[column <= i]
            a_len = len(above)
            b_len = len(below)
            total = a_len + b_len
            current_loss = ((a_len / total) * gini(above)) + ((b_len / total) * gini(below))

            if current_loss < loss:
                loss = current_loss
                n_best = [i, loss, above, below, n]
        return n_best

    ## picks the best node(attribute) to split
    def decision_node(df):
        """Return the best split among a random feature subset, or [] if none of them can be split."""
        node_split = []
        par_loss = np.inf
        col_no = df.shape[1] - 1
        par_list = np.random.choice(col_no, int(np.floor(np.sqrt(col_no))), replace=False)
        for i in par_list:
            i_best = pick_the_value(df, i)
            if len(i_best) != 0 and i_best[1] < par_loss:
                par_loss = i_best[1]
                node_split = i_best
        return node_split

    ## builds the decision tree; a node is {"<col> <= <value>": [subtree_if_true, subtree_if_false]}
    ## and a leaf is a bare label
    def decisiontree(df):
        x_unique, x_count = np.unique(df[:, -1], return_counts=True)
        if len(x_unique) == 1:
            return x_unique[0]

        split = decision_node(df)
        if len(split) == 0:
            # None of the sampled features has two distinct values, so the
            # rows cannot be separated: stop here with the majority label.
            return x_unique[np.argmax(x_count)]

        value, _, df_above, df_below, col_no = split
        condition = "{} <= {}".format(col_no, value)
        return {condition: [decisiontree(df_below), decisiontree(df_above)]}

    ## bootstrapping the datasets to generate multiple decision trees
    def bootstrap_decision_tree(df):
        list_bootstrapped_tree = []
        for _ in range(n_trees):
            index = np.random.choice(len(df), len(df), replace=True)
            list_bootstrapped_tree.append(decisiontree(df[index]))
        return list_bootstrapped_tree

    ## test sample is being passed through the decision tree and label is assigned
    def classify(test_sample, tree):
        node = tree
        # A tree may be a bare leaf (its bootstrap sample held one class),
        # so only descend while the current node is an internal dict node.
        while isinstance(node, dict):
            condition = next(iter(node))
            column_number, _, value = condition.split()
            branch = 0 if test_sample[int(column_number)] <= float(value) else 1
            node = node[condition][branch]
        return node

    ## label for a test sample is picked based on majority among the labels
    ## generated from bootstrapped decision trees for that test sample
    def predict(x_test, list_bootstrapped_tree):
        y_pred = []
        for test in x_test:
            label_list = [classify(test, tree) for tree in list_bootstrapped_tree]
            unique_labels, count = np.unique(label_list, return_counts=True)
            # argmax returns the first maximum, so ties go to the smallest label.
            y_pred.append(unique_labels[np.argmax(count)])
        return y_pred

    list_bootstrapped_tree = bootstrap_decision_tree(df)
    return predict(X_test, list_bootstrapped_tree)

# Function defined to calculate Confusion Matrix

In [0]:
def ConfusionMatrix(y_true,y_pred,classes=11):
    """Return the ``classes x classes`` confusion matrix for labels 1..classes.

    Entry ``[i, j]`` counts the samples whose true label is ``i + 1`` and
    whose predicted label is ``j + 1``.

    Parameters
    ----------
    y_true : array-like of true labels.
    y_pred : array-like of predicted labels (plain lists are accepted).
    classes : number of classes; labels are expected to be 1..classes
        (default 11).

    Returns
    -------
    numpy.ndarray of shape ``(classes, classes)`` holding the counts.
    Pairs in which either label lies outside 1..classes are not counted.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Drop out-of-range pairs up front; otherwise a too-large predicted
    # label would wrap around into a cell of the next row.
    in_range = (y_true >= 1) & (y_true <= classes) & (y_pred >= 1) & (y_pred <= classes)

    # Encode each (true, predicted) pair as a single row-major cell index.
    modified_list = ((y_true[in_range] - 1) * classes) + (y_pred[in_range] - 1)
    squared_no_of_classes = classes**2
    confusion_matrix = np.histogram(modified_list, bins=np.arange(squared_no_of_classes + 1))[0]
    return confusion_matrix.reshape(classes, classes)

# Function defined to calculate Accuracy

In [0]:
def Accuracy(y_true,y_pred):
    """Return the fraction of predictions that equal the true label.

    Parameters
    ----------
    y_true : array-like of true labels.
    y_pred : array-like of predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        ``1 - misclassified / total``.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    true_labels = np.asarray(y_true)
    nppred = np.asarray(y_pred)
    if true_labels.shape != nppred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    test_len = len(true_labels)
    if test_len == 0:
        raise ValueError("cannot compute accuracy of an empty label set")

    # Compare element-wise rather than subtracting, so that non-numeric
    # labels (e.g. strings) are supported as well.
    misclass = np.count_nonzero(true_labels != nppred)
    accuracy = 1 - (misclass/test_len)
    return accuracy
    

# Function defined to calculate Recall

In [0]:
def Recall(y_true,y_pred):
    """Return the macro-averaged recall.

    Per-class recall is the diagonal of the confusion matrix divided by the
    row sums (number of true samples of that class). The average is taken
    over the classes that occur in ``y_true`` or ``y_pred``; a class that is
    predicted but never occurs in ``y_true`` contributes a recall of 0.
    When every class of the confusion matrix occurs, this equals the plain
    mean over all classes.

    Parameters
    ----------
    y_true : array-like of true labels.
    y_pred : array-like of predicted labels (plain lists are accepted).

    Returns
    -------
    float
        Macro recall, or NaN when no sample falls inside the confusion matrix.
    """
    # Convert first: the confusion matrix helper does arithmetic on its inputs.
    confusion_matrix = ConfusionMatrix(np.asarray(y_true),np.asarray(y_pred))
    true_positives = np.diag(confusion_matrix).astype(float)
    actual_counts = np.sum(confusion_matrix,axis=1)
    predicted_counts = np.sum(confusion_matrix,axis=0)

    present = (actual_counts + predicted_counts) > 0
    if not np.any(present):
        return float("nan")

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0 from `out`, so absent classes never produce NaN.
    per_class = np.divide(true_positives,actual_counts,out=np.zeros_like(true_positives),where=actual_counts > 0)
    macro_recall = np.sum(per_class[present])/np.count_nonzero(present)
    return macro_recall

# Function defined to calculate Precision

In [0]:

def Precision(y_true,y_pred):
    """Return the macro-averaged precision.

    Per-class precision is the diagonal of the confusion matrix divided by
    the column sums (number of samples predicted as that class). The average
    is taken over the classes that occur in ``y_true`` or ``y_pred``; a class
    that occurs in ``y_true`` but is never predicted contributes a precision
    of 0. When every class of the confusion matrix occurs, this equals the
    plain mean over all classes.

    Parameters
    ----------
    y_true : array-like of true labels.
    y_pred : array-like of predicted labels (plain lists are accepted).

    Returns
    -------
    float
        Macro precision, or NaN when no sample falls inside the confusion matrix.
    """
    # Convert first: the confusion matrix helper does arithmetic on its inputs.
    confusion_matrix = ConfusionMatrix(np.asarray(y_true),np.asarray(y_pred))
    true_positives = np.diag(confusion_matrix).astype(float)
    actual_counts = np.sum(confusion_matrix,axis=1)
    predicted_counts = np.sum(confusion_matrix,axis=0)

    present = (actual_counts + predicted_counts) > 0
    if not np.any(present):
        return float("nan")

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the 0 from `out`, so never-predicted classes do not produce NaN.
    per_class = np.divide(true_positives,predicted_counts,out=np.zeros_like(true_positives),where=predicted_counts > 0)
    macro_precision = np.sum(per_class[present])/np.count_nonzero(present)
    return macro_precision

# Loading a dataset using Pandas to test the Random Forest algorithm

In [0]:
# Uncomment to make the random train/test split reproducible.
# np.random.seed(4563)
# header=None with skiprows=1 drops the first line of the file and gives
# the columns integer names.
data = pd.read_csv("data.csv", header = None , skiprows = 1)
dataframe = pd.DataFrame(data)
# The last column is used as the label; pop() also removes it from the features.
Y = dataframe.pop(dataframe.columns[-1])
# 80% of the row indices, drawn without replacement, form the training set;
# the remaining rows form the test set.
ind = np.random.choice(len(dataframe),len(dataframe)*8//10,replace=False)
df = np.asarray(dataframe)
label = np.asarray(Y)
X_train = df[ind]
X_test = np.delete(df,ind,axis = 0)
Y_train = label[ind]
Y_test = np.delete(label,ind,axis = 0)
# Standardise both splits with the TRAINING statistics so the test set is
# transformed exactly like the data the model was fitted on.
train_mean = np.mean(X_train,axis = 0)[np.newaxis,:]
train_std = np.std(X_train,axis = 0)[np.newaxis,:]
# A constant column has zero spread; divide by 1 instead to avoid NaN/inf.
train_std = np.where(train_std == 0, 1, train_std)
x_train_norm = (X_train - train_mean)/train_std
x_test_norm = (X_test - train_mean)/train_std

# Displaying calculated Accuracy, Precision, Recall

In [0]:
# RandomForest returns a plain list; the metric helpers do array arithmetic
# on the predictions, so convert once up front.
y_pred_rf = np.asarray(RandomForest(X_train,Y_train,X_test))
print("Accuracy :",Accuracy(Y_test,y_pred_rf))
print("Precision :",Precision(Y_test,y_pred_rf))
print("Recall",Recall(Y_test,y_pred_rf))

*************************** The end ***************************

..

..