In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

def text_file_retrieve(file):
    """Read a whitespace-delimited text file into a 2-D array of string tokens.

    Each non-blank line becomes one row, split on any run of whitespace.
    Blank lines (for example a trailing empty line at the end of the file)
    are skipped; they would otherwise produce an empty row and make the
    result ragged.

    Args:
        file: Path of the text file to read.

    Returns:
        A NumPy array holding the tokens of every non-blank line.
    """
    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open(file) as f:
        rows = [tokens for tokens in (line.split() for line in f) if tokens]
    return np.asarray(rows)

def cross_validation(data):
    """Partition ``data`` row-wise into 10 consecutive folds.

    ``np.array_split`` produces folds whose sizes differ by at most one row
    when the row count is not a multiple of 10. Stacking such unequal folds
    with ``np.array`` fails, so in that case the folds are returned inside a
    1-D object array instead.

    Args:
        data: Array whose first axis indexes the samples.

    Returns:
        An ndarray of length 10 that can be indexed or iterated fold by
        fold: a regular stacked array when all folds share one shape,
        otherwise a 1-D object array whose elements are the fold arrays.
    """
    folds = np.array_split(data, 10)
    if len({fold.shape for fold in folds}) == 1:
        return np.array(folds)
    # Unequal fold sizes: assign element by element so NumPy stores each fold
    # as an object rather than trying to broadcast it.
    ragged = np.empty(len(folds), dtype=object)
    for idx, fold in enumerate(folds):
        ragged[idx] = fold
    return ragged

def categoricDataCols(features):
    """Encode categorical columns in place and report which columns they are.

    A column is treated as categorical when its value in the first row is
    purely alphabetic. Every distinct category string is given an integer
    code (one numbering shared by all columns) and the column is overwritten
    with that code written as a float, so the array can later be converted
    to a float dtype.

    Args:
        features: 2-D NumPy array of strings. It is modified in place.

    Returns:
        List of the indices of the columns that were encoded.
    """
    typelabel = {}
    types = []
    if features.shape[0] == 0:
        return types
    for col_idx in range(features.shape[1]):
        if not features[0][col_idx].isalpha():
            continue
        col = features[:, col_idx]  # a view: writes land in ``features``
        # Separate index names for rows and columns, so that the value
        # appended to ``types`` is the column index and not a row index.
        for row_idx, value in enumerate(col):
            if value not in typelabel:
                typelabel[value] = len(typelabel)
            col[row_idx] = float(typelabel[value])
        types.append(col_idx)
    return types

def dataset_split(dataset, value, column, types):
    """Divide the rows of ``dataset`` into two groups by testing one column.

    If ``column`` is listed in ``types`` the test is equality with ``value``;
    otherwise the test is "strictly less than ``value``". Rows passing the
    test go to the left group, all others to the right group.

    Args:
        dataset: Iterable of rows (indexable by column).
        value: Value to compare against.
        column: Index of the column being tested.
        types: Collection of column indices that use the equality test.

    Returns:
        Tuple ``(left, right)`` of NumPy arrays built from the two row lists.
    """
    use_equality = column in types
    left_rows, right_rows = [], []
    for record in dataset:
        cell = record[column]
        goes_left = (cell == value) if use_equality else (cell < value)
        target = left_rows if goes_left else right_rows
        target.append(record)
    return np.array(left_rows), np.array(right_rows)

def find_gini_index(left, right):
    """Return the size-weighted Gini impurity of a two-way split.

    For each non-empty group the fractions of rows whose last column equals
    0 and 1 are computed; the group's impurity is one minus the sum of the
    squared fractions, weighted by the group's share of all rows.

    Args:
        left: Rows of the left group (2-D array, or anything of length 0).
        right: Rows of the right group (2-D array, or anything of length 0).

    Returns:
        The weighted impurity as a float; 0.0 when both groups are empty.
    """
    combined = float(len(left) + len(right))
    score = 0.0
    for group in (left, right):
        size = len(group)
        if size <= 0:
            continue
        labels = list(group[:, -1])
        share_zero = labels.count(0) / float(size)
        share_one = labels.count(1) / float(size)
        purity = 0.0
        purity += share_zero * share_zero
        purity += share_one * share_one
        score += (1.0 - purity) * (float(size) / combined)
    return score

def find_node_split(data, types):
    """Search every attribute/value pair for the split with the lowest Gini.

    Every column except the last (the class label) is considered, and each
    distinct value in that column is tried as the split value, in row order.
    A repeated value would reproduce an identical split and score, which can
    never beat the strict ``<`` comparison below, so repeats are skipped
    without changing the result.

    Args:
        data: 2-D array of rows; the last column is the class label.
        types: Column indices that ``dataset_split`` tests by equality.

    Returns:
        A dict with keys ``'attr'`` (column index), ``'value'`` (split
        value), ``'categorical'`` (True when the column is in ``types``,
        i.e. the split was an equality test), ``'left'`` and ``'right'``
        (the two row groups), or ``None`` if no candidate was evaluated.
    """
    node_split = None
    error = float('inf')
    for attr in range(data.shape[1] - 1):
        tried = set()
        for row in data:
            candidate = row[attr]
            if candidate in tried:
                continue
            tried.add(candidate)
            left, right = dataset_split(data, candidate, attr, types)
            current_gini_score = find_gini_index(left, right)
            if current_gini_score < error:
                error = current_gini_score
                node_split = {
                    'attr': attr,
                    'value': candidate,
                    # Remember which comparison produced this split so the
                    # same test can be applied to unseen rows later.
                    'categorical': attr in types,
                    'left': left,
                    'right': right,
                }
    return node_split

def leaf_nodes(left, right):
    """Pick the majority class label over the rows of both groups.

    Counts how many rows have 0 and how many have 1 in their last column,
    across both groups; empty groups are ignored.

    Args:
        left: Rows of the left group (2-D array, or anything of length 0).
        right: Rows of the right group (2-D array, or anything of length 0).

    Returns:
        1 if ones strictly outnumber zeroes, otherwise 0 (ties give 0).
    """
    zero_votes = 0
    one_votes = 0
    for group in (left, right):
        if len(group) > 0:
            labels = list(group[:, -1])
            zero_votes += labels.count(0)
            one_votes += labels.count(1)
    return 1 if zero_votes < one_votes else 0

def decision_tree(node, types):
    """Recursively expand a split node into a full subtree, in place.

    The row groups stored under ``'left'`` and ``'right'`` are removed from
    the node and replaced by either a class label (leaf) or a nested node
    dict produced by splitting that group again.

    Args:
        node: Split dict containing ``'left'`` and ``'right'`` row groups.
        types: Column indices passed through to ``find_node_split``.

    Returns:
        The same ``node`` dict, with ``'left'`` and ``'right'`` rewritten.
    """
    left_rows = node.pop('left')
    right_rows = node.pop('right')

    # A split that leaves one side empty separates nothing: both branches
    # become the same majority-vote leaf.
    if len(left_rows) == 0 or len(right_rows) == 0:
        majority = leaf_nodes(left_rows, right_rows)
        node['left'] = majority
        node['right'] = majority
        return node

    # A group with a single distinct label becomes a leaf; otherwise recurse.
    left_is_pure = len(set(left_rows[:, -1])) == 1
    if left_is_pure:
        node['left'] = leaf_nodes(left_rows, [])
    else:
        node['left'] = decision_tree(find_node_split(left_rows, types), types)

    right_is_pure = len(set(right_rows[:, -1])) == 1
    if right_is_pure:
        node['right'] = leaf_nodes([], right_rows)
    else:
        node['right'] = decision_tree(find_node_split(right_rows, types), types)

    return node

def findRoot(root_node, train, types):
    """Build a decision tree for ``train`` and return its root node.

    Args:
        root_node: Ignored; the value passed in is never read.
        train: 2-D array of training rows with the label in the last column.
        types: Column indices passed through to the split search.

    Returns:
        The root node dict of the fully grown tree.
    """
    best_split = find_node_split(train, types)
    return decision_tree(best_split, types)

def train_test_data(k_data, i):
    """Assemble the training and test sets for one cross-validation round.

    Args:
        k_data: Sequence of folds, each an array of rows.
        i: Index of the fold held out as the test set.

    Returns:
        Tuple ``(train, test)``: every fold except fold ``i`` concatenated
        along the first axis, and a copy of fold ``i``.
    """
    kept_folds = [fold for position, fold in enumerate(k_data) if position != i]
    train = np.array(np.concatenate(kept_folds, axis=0))
    test = np.array(k_data[i])
    return train, test

def predict(node, row):
    """Classify one row by walking the decision tree from ``node`` down.

    At each node the row's value in column ``node['attr']`` is compared with
    ``node['value']``. If the node has a truthy ``'categorical'`` entry the
    comparison is equality; otherwise (including when the entry is absent)
    it is "strictly less than". A row that passes the comparison follows
    the left branch, any other row the right branch.

    Args:
        node: Tree node dict with keys ``'attr'``, ``'value'``, ``'left'``,
            ``'right'`` and optionally ``'categorical'``.
        row: Indexable sequence of feature values.

    Returns:
        The value stored at the leaf that is reached (any branch value that
        is not a dict is treated as a leaf).
    """
    cell = row[node['attr']]
    if node.get('categorical', False):
        goes_left = cell == node['value']
    else:
        goes_left = cell < node['value']
    branch = node['left'] if goes_left else node['right']
    if isinstance(branch, dict):
        return predict(branch, row)
    return branch

def getPredictedValues(test,predictions, root_node):
    """Append the tree's prediction for every test row to ``predictions``.

    Args:
        test: Iterable of rows to classify.
        predictions: List that receives one prediction per row (mutated).
        root_node: Root node of the decision tree used for classification.

    Returns:
        The same ``predictions`` list that was passed in.
    """
    predictions.extend(predict(root_node, sample) for sample in test)
    return predictions

def getF1Score(f1score, precision, recall):
    """Compute the F1 score from precision and recall.

    The product is scaled by 0.01, so when precision and recall are given
    as percentages the result comes out as a fraction.

    Args:
        f1score: Ignored; the value passed in is never read.
        precision: Precision value.
        recall: Recall value.

    Returns:
        ``2 * precision * recall * 0.01 / (precision + recall)``, or 0.0
        when ``precision + recall`` is zero (the harmonic mean is undefined
        there, and dividing would raise ZeroDivisionError).
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return ((precision * recall) * 0.01 * 2) / denominator

# True Positive - tp
# True Negative - tn
# False Positive - fp
# False Negative - fn

def performanceMeasures(i, test, accuracy, precision, recall, predictions):
    """Add one fold's accuracy, precision and recall to the running totals.

    The true labels are read from the last column of ``test`` and compared
    position by position with ``predictions``. Each metric is a fraction
    multiplied by 10 before being added to its running total.

    Args:
        i: Fold index; not used in the computation.
        test: 2-D array of test rows with the label in the last column.
        accuracy: Running accuracy total.
        precision: Running precision total.
        recall: Running recall total.
        predictions: Predicted labels, one per test row.

    Returns:
        Tuple ``(accuracy, precision, recall)`` with this fold's
        contribution added. A metric whose denominator is zero (including
        accuracy for an empty fold) is left unchanged.
    """
    tp = tn = fp = fn = 0
    classlabel = list(test[:, -1])
    # A separate index name keeps the ``i`` parameter from being overwritten.
    for idx, actual in enumerate(classlabel):
        predicted = predictions[idx]
        if actual == 1 and predicted == 1:
            tp += 1
        elif actual == 0 and predicted == 0:
            tn += 1
        elif actual == 0 and predicted == 1:
            fp += 1
        else:
            fn += 1
    total = tp + fn + fp + tn
    # An empty fold has nothing to score; skip instead of dividing by zero.
    if total != 0:
        accuracy += (float((tp + tn) / total)) * 10
    if (tp + fp) != 0:
        precision += (float(tp / (tp + fp))) * 10
    if (tp + fn) != 0:
        recall += (float(tp / (tp + fn))) * 10
    return accuracy, precision, recall

def main():
    """Run 10-fold cross-validation of the decision tree on a user-chosen file.

    Prompts for a filename, loads the file, encodes alphabetic feature
    columns, converts features to float and the last column to int labels,
    then trains and evaluates a tree on each of 10 folds and prints the
    accumulated accuracy, precision, recall and the derived F1 score.
    """
    input_file = input("Enter the filename: ")
    raw = text_file_retrieve(input_file)

    # Labels come from the last column; features are every other column.
    labels = np.array(raw[:, -1].reshape((len(raw), 1)), dtype = int)
    types = categoricDataCols(raw[:, 0:-1])
    numeric_features = np.array(raw[:, 0:-1], dtype = float)
    table = np.concatenate((numeric_features, labels), axis = 1)

    folds = cross_validation(table)
    accuracy = 0.0
    precision = 0.0
    recall = 0.0
    f1_score = 0.0
    for fold_index in range(10):
        train, test = train_test_data(folds, fold_index)
        tree = findRoot({}, train, types)
        predictions = getPredictedValues(test, [], tree)
        accuracy, precision, recall = performanceMeasures(
            fold_index, test, accuracy, precision, recall, predictions
        )
    f1_score = getF1Score(f1_score, precision, recall)

    report = (
        ("Accuracy: ", accuracy),
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("F1 score: ", f1_score),
    )
    for caption, figure in report:
        print(caption + str(figure))
    
# Run the interactive pipeline only when executed as a script (or in a
# notebook cell), not when this module is imported.
if __name__ == "__main__":
    main()

