<a href="https://colab.research.google.com/github/pperezg/ST0245-032/blob/master/proyecto/codigo/finalBinaryClassificationTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files
uploaded = files.upload()

Saving data_set_train.csv to data_set_train.csv


In [3]:
from google.colab import files
uploaded = files.upload()

Saving data_set_test.csv to data_set_test.csv


In [12]:
import pandas as pd

trainingData = pd.read_csv('data_set_train.csv', delimiter = ',')
header = []

class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

class Leaf:
    def __init__(self, rows):
        totals = {}
        labels = set([row[-1] for row in rows])
        for label in labels:
            totals[label] = 0
        for row in rows:
            if row[-1]==label:
                totals[label]+=1
        self.predictions = totals

class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

def build_tree(rows):
    gain, question = find_best_split(rows)
    if gain==0:
        return Leaf(rows)
    
    true_rows, false_rows = partition(rows, question)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)

    return Decision_Node(question,true_branch,false_branch)

def classify(row, node):
    if type(node)==Leaf:
        return node.predictions
    
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

def unique_vals(rows, col):
    return set([row[col] for row in rows]) #Sets work just like lists but without element duplicates.

def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
        return
    print(spacing + str(node.question))
    print(spacing + "--> True:")
    print_tree(node.true_branch, spacing + "  ")
    print(spacing + "--> False:")
    print_tree(node.false_branch, spacing + "  ")

def print_leaf(counts):
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + '%'
    return probs


def is_numeric(value):
    if type(value)==int or type(value)==float:
        return True
    else:
        return False

def partition(rows, question):
    truthOnes = []
    falseOnes = []
    for row in rows:
        if question.match(row):
            truthOnes.append(row)
        else:
            falseOnes.append(row)
    return truthOnes, falseOnes

def gini(rows):
    maximo = 1
    totals = {}
    labels = set([row[-1] for row in rows])
    for label in labels:
        totals[label] = 0
        for row in rows:
            if row[-1]==label:
                totals[label]+=1
    for label in totals:
        chances = totals[label]/len(rows)
        chsqrd = chances**2
        maximo = maximo - chsqrd
    return maximo
                

def info_gain(left, right, current_uncertainty):
    inTotal = len(left)+len(right)
    prcntg_left = len(left)/inTotal
    prcntg_right = len(right)/inTotal
    GainLeft = gini(left)
    GainRight = gini(right)
    return (current_uncertainty-(prcntg_left*GainLeft)-(prcntg_right-GainRight))

def find_best_split(rows):
    best_gain = 0
    best_question = None
    uncertainty = gini(rows)
    columns = len(rows[0])

    for column_ in range (0,columns):
        values = set(row[column_] for row in rows)
        for val in values:
            question = Question(column_, val)
            true_rows, false_rows = partition(rows, question)
            if len(true_rows)==0 or len(false_rows)==0:
                continue
            gain = info_gain(true_rows, false_rows, uncertainty)
            if gain>=best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

if __name__ == '__main__':
    header = trainingData.columns.values.tolist()
    listOfTrainingData = trainingData.values.tolist()
    tree = build_tree(listOfTrainingData)

    testingData = pd.read_csv('data_set_test.csv', delimiter = ',')
    listOfTestingData = testingData.values.tolist()
    real_results = []
    for row in listOfTestingData:
      real_results.append(row[-1])
    classified = []
    for row in listOfTestingData:
        for n in print_leaf(classify(row, tree)):
          if "100" in print_leaf(classify(row, tree))[n]:
            classified.append(n)
    counted = len(real_results)
    goodOnes = 0
    for n in range (0,counted):
      if real_results[n]==classified[n]:
        goodOnes += 1
    print('{}% de aciertos.'.format(goodOnes/counted))

0.7966666666666666% de aciertos.
