In [1]:
import numpy as np
import pandas as pd
import csv

In [21]:
myname = "Ritesh-Gupta-"
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

# Finding Unique Element and its count

In [3]:
def unique_count_dict(col):
    unique_elements, counts_elements = np.unique(col, return_counts=True)
    return dict(zip(unique_elements,counts_elements))

# Entropy function e = sum(-p*log2(p))

In [4]:
def entropy(p):
    return -np.sum(np.multiply(p,np.log2(p)))

In [5]:
def impurity(rows):
    count=unique_count_dict(rows[:,-1])
    p = []
    for label in count:
        p.append(count[label]/float(len(rows)))
    #return gini(p)
    return entropy(p)

# Gini Impurity g = 1- sum(pi^2)

In [6]:
def gini(prob):
    impurity=1
    return (1 - np.sum(np.power(prob,2)))

# Information Gain of a column

In [7]:
def info_gain_entropy(current,left,right):
    p =float(len(left))/len(left)+len(right)
    left = np.asarray(left)
    right = np.asarray(right)
    return current-p*impurity(left)-(1-p)*impurity(right)

# Finding Question

In [8]:
class Question:
    def __init__(self,column,value):
        self.column=column
        self.value=value
    def match(self,data):
        value=data[self.column]
        return value>=self.value
    def __repr__(self):
        condition = ">="
        return "Is %s %s %s?" % (features[self.column], condition, str(self.value))

# Partition column based of question

In [9]:
def split(data,val,col):
    true_row,false_row=[],[]
    for row in data:
        if row[col] >= val:
            true_row.append(row)
        else:
            false_row.append(row)
    true_row  = np.asarray(true_row)
    false_row = np.asarray(false_row) 
    return true_row,false_row

# Calculate best gain and Split of dataset

In [10]:
def best_split(rows):
    best_gain=0
    best_question=None
    value = 0
    column = 0
    current=impurity(rows)
    features=len(rows[0])-1
    for col in range(features):
        val = np.average(rows[:,col])
        question = Question(col,val)
        true_rows,false_rows = split(rows,val,col)
        if len(true_rows) == 0 or len(false_rows) == 0:
            continue
        gain=info_gain_entropy(current,true_rows,false_rows)
        if gain>=best_gain:
                best_gain,best_question,value,column=gain,question,val,col
    return best_gain,best_question,value,column

# Class to store decision Node i.e. question of split left and right branch

In [11]:
class DecisionNode:
    def __init__(self,question,true_branch,false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

# Storing Leaf instance with % of occurence of label

In [12]:
class Leaf:
    def __init__(self,rows):
        count = unique_count_dict(rows[:,-1])
        p = {}
        for label in count:
            p[label] = count[label]/float(len(rows))
        self.dict = p

# Building tree recursively

In [13]:
def build_tree(rows):
    gain,ques,val,col=best_split(rows)
    if gain==0:
        return Leaf(rows)
    true_rows, false_rows = split(rows,val,col)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    return DecisionNode(ques,true_branch, false_branch)

# Classify the predicted Node

In [14]:
def classify(row, node):
    if isinstance(node, Leaf):
        return node.dict
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [16]:
def accuracy(result,c):
    pred_label = []
    max1 = 0
    max2 = 0
    val = 0
    for i in range(len(result)):
        d = 0.0
        if len(result[i]) == 2: 
            for k,v in result[i].items():
                #max1 = int(v.replace("%",""))
                max1 = int(v)
                d = k
                if max1 > max2:
                    d = k
        else:
            for k,v in result[i].items():
                d = k
        pred_label.append(d)
    pred_label = np.asarray(pred_label)
    count = 0
    for i in range(len(c)):
        if c[i] == pred_label[i]:
            count +=1
    return (count/len(c)*100)

In [17]:
def run_decision_tree():
    # Load data set
    with open("wine-dataset.csv") as f:
        next(f, None)
        data = [tuple(line) for line in csv.reader(f, delimiter=",")]
    K = 10
    f = open(myname+"result.txt", "w")
    training_set = [x for i, x in enumerate(data) if i % K != 9]
    test_set = [x for i, x in enumerate(data) if i % K == 9]
    training_set = np.asarray(training_set,dtype="float")
    test_set = np.asarray(test_set,dtype="float")
    labels_ground = test_set[:,-1]
    test_set = test_set[:,:11]

    tree = build_tree(training_set)
    result = []
    for i in range(len(test_set)):
        result.append(classify(test_set[i], tree))
    acc = accuracy(result,labels_ground)
    print("accuracy: %.4f" % acc)
    # Writing results to a file (DO NOT CHANGE)
    #f.write("accuracy: %.4f" % acc)
    f.close()


In [22]:
if __name__ == "__main__":
    run_decision_tree()

accuracy: 81.1861


# Decision Tree Cross Validation

In [19]:
def run_decision_tree_CV():
    # Load data set
    with open("wine-dataset.csv") as f:
        next(f, None)
        data = [tuple(line) for line in csv.reader(f, delimiter=",")]
    K = 10
    results = np.zeros(K)
    f = open(myname+"result.txt", "w")
    for itr in range(K):
        print(itr)
        training_set = [x for i, x in enumerate(data) if i % K != itr]
        test_set = [x for i, x in enumerate(data) if i % K == itr]
        training_set = np.asarray(training_set,dtype="float")
        test_set = np.asarray(test_set,dtype="float")
        labels_ground = test_set[:,-1]
        test_set = test_set[:,:11]
        
        tree = build_tree(training_set)
        result = []
        for i in range(len(test_set)):
            result.append(classify(test_set[i], tree))
        acc = accuracy(result,labels_ground)
        results[itr] = acc
        print("accuracy: %.4f" % acc)
    print("final accuracy: %.4f" % np.average(results))
    # Writing results to a file (DO NOT CHANGE)
    #f.write("accuracy: %.4f" % acc)
    f.close()


In [20]:
if __name__ == "__main__":
    run_decision_tree_CV()

0
accuracy: 81.6327
1
accuracy: 83.4694
2
accuracy: 83.8776
3
accuracy: 81.2245
4
accuracy: 85.3061
5
accuracy: 82.8571
6
accuracy: 79.7959
7
accuracy: 81.2245
8
accuracy: 79.7546
9
accuracy: 81.1861
final accuracy: 82.0328
