In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
from random import sample

def text_file_retrieve(file):
    """Read a whitespace-delimited text file into a NumPy array of tokens.

    Each non-blank line is split on whitespace and becomes one row.
    Blank or whitespace-only lines (for example trailing newlines at the
    end of the file) are skipped, because they would otherwise produce an
    empty row and make the rows unequal in length.

    Args:
        file: Path of the text file to read.

    Returns:
        A NumPy array built from the list of token lists. When every kept
        line has the same number of tokens this is a 2-D array of strings.
    """
    with open(file) as text_file:
        rows = [tokens for tokens in (line.split() for line in text_file) if tokens]
    return np.array(rows)

def train_test_data(data, i):
    """Split a sequence of folds into a training set and a held-out fold.

    Args:
        data: Indexable, iterable collection of folds (array-like chunks).
        i: Position of the fold to hold out as the test set.

    Returns:
        A ``(train, test)`` tuple: ``train`` is every fold except fold ``i``
        concatenated along axis 0, and ``test`` is fold ``i`` as an array.
    """
    kept_folds = []
    for fold_position, fold in enumerate(data):
        if fold_position == i:
            continue
        kept_folds.append(fold)
    train = np.array(np.concatenate(kept_folds, axis=0))
    test = np.array(data[i])
    return train, test

def categoricDataCols(features):
    """Encode alphabetic columns as numeric codes, in place.

    A column is treated as categorical when its value in the first row is
    purely alphabetic (``str.isalpha``). Each distinct value is assigned
    the next integer code, and the cell is overwritten with that code
    converted to ``float``. The code table is shared across all columns,
    so codes keep counting up from one categorical column to the next.

    Args:
        features: 2-D array of string values; it is modified in place.

    Returns:
        List of the column indices that were treated as categorical.
    """
    code_table = {}
    categorical_columns = []
    for col_idx in range(features.shape[1]):
        if not features[0][col_idx].isalpha():
            continue
        # Slicing yields a view, so writes below land in `features` itself.
        column_view = features[:, col_idx]
        for row_idx, raw_value in enumerate(column_view):
            # len(code_table) is the next unused code when the value is new.
            code = code_table.setdefault(raw_value, len(code_table))
            column_view[row_idx] = float(code)
        categorical_columns.append(col_idx)
    return categorical_columns

def dataset_split(dataset, value, column):
    """Partition rows by comparing one column against a threshold.

    Args:
        dataset: Iterable of rows, each indexable by ``column``.
        value: Threshold to compare against.
        column: Index of the column to test in each row.

    Returns:
        A ``(left, right)`` tuple of arrays: ``left`` holds the rows for
        which ``value > row[column]``, ``right`` holds all other rows.
    """
    below, not_below = [], []
    for row in dataset:
        bucket = below if value > row[column] else not_below
        bucket.append(row)
    return np.array(below), np.array(not_below)

def leaf_nodes(left, right, no_of_features):
    """Return the majority class label (0 or 1) across two groups of rows.

    The label is read from the last column of each non-empty group.

    Args:
        left: First group of rows; may be empty.
        right: Second group of rows; may be empty.
        no_of_features: Unused by this function.

    Returns:
        ``1`` when labels equal to 1 strictly outnumber labels equal to 0,
        otherwise ``0`` (ties resolve to ``0``).
    """
    zero_total = one_total = 0
    for group in (left, right):
        if len(group) == 0:
            continue
        labels = list(group[:, -1])
        zero_total += labels.count(0)
        one_total += labels.count(1)
    return 1 if one_total > zero_total else 0

def find_final_gini_index(node, final_gini, size):
    """Add one group's weighted Gini impurity to a running total.

    The impurity is ``1 - (p0**2 + p1**2)`` where ``p0`` and ``p1`` are the
    fractions of rows whose last column equals 0 and 1 respectively. It is
    weighted by ``len(node) / size``.

    Args:
        node: Non-empty 2-D array of rows with the class label last.
        final_gini: Running total to which the contribution is added.
        size: Total number of rows used as the weighting denominator.

    Returns:
        ``final_gini`` plus this group's weighted impurity.
    """
    labels = list(node[:, -1])
    row_count = float(len(node))
    share_zero = labels.count(0) / row_count
    share_one = labels.count(1) / row_count
    purity = share_zero * share_zero + share_one * share_one
    return final_gini + (1.0 - purity) * (row_count / size)

def find_gini_index(left, right, size):
    """Compute the size-weighted Gini impurity of a two-way split.

    Args:
        left: Rows on the left side of the split; may be empty.
        right: Rows on the right side of the split; may be empty.
        size: Total number of rows, used to weight each side.

    Returns:
        Sum of the weighted impurities of the non-empty sides, or ``0.0``
        when both sides are empty.
    """
    weighted_total = 0.0
    for side in (left, right):
        if len(side) > 0:
            weighted_total = find_final_gini_index(side, weighted_total, size)
    return weighted_total

def find_node_split(data, no_of_features):
    """Find the lowest-Gini split over a random subset of feature columns.

    ``no_of_features`` columns are sampled without replacement from all
    columns except the last one (the class label). For each sampled column,
    every distinct value occurring in that column is tried as a threshold.

    Args:
        data: 2-D array of rows with the class label in the last column.
        no_of_features: Number of feature columns to sample.

    Returns:
        A dict with keys ``'attr'`` (column index), ``'value'`` (threshold),
        ``'left'`` and ``'right'`` (the two row groups) describing the best
        split, or ``None`` when no candidate split was evaluated.

    Raises:
        ValueError: Raised by ``random.sample`` when ``no_of_features``
            exceeds the number of feature columns.
    """
    feature_columns = list(range(data.shape[1] - 1))
    sampled_columns = sample(feature_columns, no_of_features)
    # dataset_split places every row on one side, so the combined size of
    # both sides always equals len(data); compute it once.
    size = float(len(data))
    best_split = None
    best_gini = float('inf')
    for column in sampled_columns:
        # A repeated threshold yields an identical split and score, which
        # can never beat the strict "<" test below, so it is skipped.
        tried_thresholds = set()
        for row in data:
            threshold = row[column]
            if threshold in tried_thresholds:
                continue
            tried_thresholds.add(threshold)
            left, right = dataset_split(data, threshold, column)
            gini = find_gini_index(left, right, size)
            if gini < best_gini:
                best_gini = gini
                best_split = {'attr': column, 'value': threshold,
                              'left': left, 'right': right}
    return best_split

def decision_tree(node, no_of_features):
    """Recursively grow a decision tree below a split node, in place.

    The row groups stored under ``'left'`` and ``'right'`` are removed from
    ``node`` and replaced by either a class label (leaf) or a nested split
    dict (subtree). The left child is built before the right child.

    Args:
        node: Split dict holding ``'left'`` and ``'right'`` row groups.
        no_of_features: Number of feature columns sampled at each split.

    Returns:
        The same ``node`` dict, with its children filled in.
    """
    left_rows = node.pop('left')
    right_rows = node.pop('right')

    # One side is empty: both children become the same majority-vote leaf.
    if len(left_rows) == 0 or len(right_rows) == 0:
        majority = leaf_nodes(left_rows, right_rows, no_of_features)
        node['left'] = majority
        node['right'] = majority
        return node

    # A side with a single class label becomes a leaf; otherwise recurse.
    if len(set(left_rows[:, -1])) == 1:
        node['left'] = leaf_nodes(left_rows, [], no_of_features)
    else:
        left_split = find_node_split(left_rows, no_of_features)
        node['left'] = decision_tree(left_split, no_of_features)

    if len(set(right_rows[:, -1])) == 1:
        node['right'] = leaf_nodes([], right_rows, no_of_features)
    else:
        right_split = find_node_split(right_rows, no_of_features)
        node['right'] = decision_tree(right_split, no_of_features)

    return node

def cross_validation(data, k=10):
    """Split the data into ``k`` consecutive folds along the first axis.

    ``np.array_split`` yields folds of unequal length whenever the number
    of rows is not divisible by ``k``. Stacking such folds with
    ``np.array`` raises ``ValueError`` on recent NumPy releases, so unequal
    folds are returned in a 1-D object array instead.

    Args:
        data: Array-like of rows to partition.
        k: Number of folds to create. Defaults to 10.

    Returns:
        A regular stacked array when every fold has the same shape,
        otherwise a 1-D object array of length ``k`` holding one fold per
        element. Both forms support ``len()``, indexing and iteration.
    """
    folds = np.array_split(data, k)
    if len({np.shape(fold) for fold in folds}) == 1:
        return np.array(folds)
    # Fill element by element so NumPy does not try to stack ragged folds.
    ragged_folds = np.empty(len(folds), dtype=object)
    for position, fold in enumerate(folds):
        ragged_folds[position] = fold
    return ragged_folds

def findRoot(root_node, train, types):
    """Build a complete decision tree from the training rows.

    Args:
        root_node: Ignored; the root is always computed from ``train``.
        train: Training rows with the class label in the last column.
        types: Forwarded as the number of features to sample per split.

    Returns:
        The root split dict of the fully grown tree.
    """
    initial_split = find_node_split(train, types)
    return decision_tree(initial_split, types)

def predict(node, row):
    """Walk a decision tree and return the leaf value reached by ``row``.

    At each split the left child is taken when
    ``row[node['attr']] < node['value']``, otherwise the right child. The
    walk stops at the first child that is not a ``dict``.

    Args:
        node: Root split dict with ``'attr'``, ``'value'``, ``'left'`` and
            ``'right'`` keys.
        row: Indexable feature values for one sample.

    Returns:
        The non-dict value stored at the leaf that was reached.
    """
    current = node
    while True:
        side = 'left' if row[current['attr']] < current['value'] else 'right'
        child = current[side]
        if type(child) is not dict:
            return child
        current = child

def getPredictedValues(rf_pred, root_node, test):
    """Append one tree's prediction for every test row to the vote table.

    Args:
        rf_pred: Mapping from row position to a list of votes; it is
            updated in place. ``rf_pred[position]`` must resolve to a list
            (for example via a ``defaultdict(list)``).
        root_node: Root split dict of the tree used for prediction.
        test: Sequence of test rows.

    Returns:
        The same ``rf_pred`` mapping.
    """
    for position, sample_row in enumerate(test):
        rf_pred[position].append(predict(root_node, sample_row))
    return rf_pred

def getFinalPredictedValues(rf_pred, finalPredictedValues):
    """Reduce each row's list of votes to its most frequent value.

    Args:
        rf_pred: Mapping whose values are lists of votes; rows are
            processed in the mapping's iteration order.
        finalPredictedValues: List that receives one winning vote per row;
            it is extended in place.

    Returns:
        The same ``finalPredictedValues`` list.
    """
    for votes in rf_pred.values():
        winner = max(set(votes), key=votes.count)
        finalPredictedValues.append(winner)
    return finalPredictedValues

def random_forest(train, test, no_of_features, no_of_trees):
    """Train a forest on bootstrap samples and predict by majority vote.

    Each tree is grown on ``len(train)`` rows drawn from ``train`` with
    replacement, then asked to predict every row of ``test``.

    Args:
        train: 2-D array of training rows with the class label last.
        test: Sequence of rows to classify.
        no_of_features: Number of feature columns sampled at each split.
        no_of_trees: Number of trees to grow.

    Returns:
        List with the most frequent predicted label for each test row.
    """
    votes_by_row = defaultdict(list)
    row_count = len(train)
    for _ in range(no_of_trees):
        bootstrap_rows = np.random.choice(row_count, row_count, replace=True)
        tree = findRoot({}, train[bootstrap_rows, :], no_of_features)
        votes_by_row = getPredictedValues(votes_by_row, tree, test)
    return getFinalPredictedValues(votes_by_row, [])

def performanceMeasures(i, accuracy, precision, recall, true_class, predictions):
    """Add one fold's accuracy, precision and recall to running totals.

    Each metric is computed as a fraction for this fold and multiplied by
    10 before being added to its running total. A metric whose denominator
    is zero leaves its total unchanged, which now also covers accuracy on
    an empty fold.

    Args:
        i: Unused; kept so existing callers keep working.
        accuracy: Running accuracy total.
        precision: Running precision total.
        recall: Running recall total.
        true_class: Sequence of true labels.
        predictions: Sequence of predicted labels, indexed in step with
            ``true_class``.

    Returns:
        The updated ``(accuracy, precision, recall)`` totals.
    """
    tp = fp = tn = fn = 0
    for position, actual in enumerate(true_class):
        predicted = predictions[position]
        if actual == 1 and predicted == 1:
            tp += 1
        elif actual == 0 and predicted == 0:
            tn += 1
        elif actual == 0 and predicted == 1:
            fp += 1
        else:
            # Every remaining combination is counted as a false negative.
            fn += 1
    total = tp + fn + fp + tn
    if total != 0:
        accuracy += (float((tp + tn) / total)) * 10
    if (tp + fn) != 0:
        recall += (float(tp / (tp + fn))) * 10
    if (tp + fp) != 0:
        precision += (float(tp / (tp + fp))) * 10
    return accuracy, precision, recall

def getF1Score(f1_score, precision, recall):
    """Compute the F1 measure from precision and recall.

    The harmonic mean ``2 * P * R / (P + R)`` is multiplied by 0.01.

    Args:
        f1_score: Ignored; the result is always recomputed.
        precision: Precision value.
        recall: Recall value.

    Returns:
        The scaled F1 measure, or ``0.0`` when ``precision + recall`` is
        zero (previously a ``ZeroDivisionError``).
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 0.01 * 2 * (precision * recall) / denominator

def main():
    """Run 10-fold cross-validation of the random forest and print metrics.

    Prompts for the data file name and the number of trees, loads and
    encodes the data, evaluates the forest on each of the 10 folds, and
    prints the accumulated accuracy, precision, recall and F1 measure.
    """
    input_file = input("Enter the filename: ")
    dataset = text_file_retrieve(input_file)
    features = dataset[:, 0:-1]
    # Encodes categorical columns in place. `features` is a view of
    # `dataset`, so the float conversion below sees the encoded values.
    categoricDataCols(features)
    ground_truth = np.array(dataset[:, -1].reshape((len(dataset), 1)), dtype=int)
    features = np.array(features, dtype=float)
    data = np.concatenate((features, ground_truth), axis=1)
    data_split = cross_validation(data)
    accuracy = precision = recall = f1_score = 0.0
    no_of_features = 2                        # please enter number of features here
    no_of_trees = int(input("Enter the number of trees: "))
    for i in range(10):
        train, test = train_test_data(data_split, i)
        predictions = random_forest(train, test, no_of_features, no_of_trees)
        classlabel = list(test[:, -1])
        accuracy, precision, recall = performanceMeasures(
            i, accuracy, precision, recall, classlabel, predictions)

    f1_score = getF1Score(f1_score, precision, recall)
    print("Accuracy: " + str(accuracy))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1 Measure: " + str(f1_score))

# Run the experiment only when this file is executed directly (a notebook
# cell also runs as "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Enter the filename: /content/project3_dataset1 (2).txt
Enter the number of trees: 3
Accuracy: 94.01942355889724
Precision: 95.7648332788493
Recall: 88.78155929236698
F1 Measure: 0.9214107201351905
