In [1]:
import numpy as np
import pandas as pd
import csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [2]:
myname = "Ritesh-Gupta-"
#features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']

In [3]:
f = open('dataset/spam_data.txt', 'r')
data = []
for line in f.readlines():
    r = []
    for word in line.split():
        r.append(word)
    data.append(r)
f.close()
data = np.asarray(data)
data = data.astype(float)

In [4]:
length = len(data)
train_data = data[:int(0.7*length),:]
test_data = data[int(0.7*length):,:]
print(train_data.shape)

(3220, 58)


In [5]:
X_test = test_data[:,:-1]
y_test = test_data[:,-1]
print(X_test.shape)
X_tr = train_data[:100,:]
print(X_tr)

(1381, 57)
[[0.000e+00 6.400e-01 6.400e-01 ... 6.100e+01 2.780e+02 1.000e+00]
 [2.100e-01 2.800e-01 5.000e-01 ... 1.010e+02 1.028e+03 1.000e+00]
 [6.000e-02 0.000e+00 7.100e-01 ... 4.850e+02 2.259e+03 1.000e+00]
 ...
 [0.000e+00 4.300e-01 4.300e-01 ... 6.100e+01 2.220e+02 1.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 1.700e+01 1.910e+02 1.000e+00]
 [1.240e+00 4.100e-01 1.240e+00 ... 1.900e+01 1.140e+02 1.000e+00]]


# Finding Unique Element and its count

In [6]:
def unique_count_dict(col):
    unique_elements, counts_elements = np.unique(col, return_counts=True)
    return dict(zip(unique_elements,counts_elements))

# Entropy function e = sum(-p*log2(p))

In [7]:
def entropy(p):
    return -np.sum(np.multiply(p,np.log2(p)))

In [8]:
def impurity(rows):
    count=unique_count_dict(rows[:,-1])
    p = []
    for label in count:
        p.append(count[label]/float(len(rows)))
    #return gini(p)
    return entropy(p)

# Gini Impurity g = 1- sum(pi^2)

In [9]:
def gini(prob):
    impurity=1
    return (1 - np.sum(np.power(prob,2)))

# Information Gain of a column

In [10]:
def info_gain_entropy(current,left,right):
    p =float(len(left))/len(left)+len(right)
    left = np.asarray(left)
    right = np.asarray(right)
    return current-p*impurity(left)-(1-p)*impurity(right)

# Finding Question

In [11]:
class Question:
    def __init__(self,column,value):
        self.column=column
        self.value=value
    def match(self,data):
        value=data[self.column]
        return value>=self.value
    def __repr__(self):
        condition = ">="
        return "Is %s %s %s?" % (features[self.column], condition, str(self.value))

# Partition column based of question

In [12]:
def split(data,val,col):
    true_row,false_row=[],[]
    for row in data:
        if row[col] >= val:
            true_row.append(row)
        else:
            false_row.append(row)
    true_row  = np.asarray(true_row)
    false_row = np.asarray(false_row) 
    return true_row,false_row

# Calculate best gain and Split of dataset

In [13]:
def best_split(rows):
    best_gain=0
    best_question=None
    value = 0
    column = 0
    current=impurity(rows)
    features=len(rows[0])-1
    for col in range(features):
        val = np.average(rows[:,col])
        question = Question(col,val)
        true_rows,false_rows = split(rows,val,col)
        if len(true_rows) == 0 or len(false_rows) == 0:
            continue
        gain=info_gain_entropy(current,true_rows,false_rows)
        if gain>=best_gain:
                best_gain,best_question,value,column=gain,question,val,col
    return best_gain,best_question,value,column

# Class to store decision Node i.e. question of split left and right branch

In [14]:
class DecisionNode:
    def __init__(self,question,true_branch,false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

# Storing Leaf instance with % of occurence of label

In [15]:
class Leaf:
    def __init__(self,rows):
        count = unique_count_dict(rows[:,-1])
        p = {}
        for label in count:
            p[label] = count[label]/float(len(rows))
        self.dict = p

# Building tree recursively

In [16]:
def build_tree(rows):
    gain,ques,val,col=best_split(rows)
    if gain==0:
        return Leaf(rows)
    true_rows, false_rows = split(rows,val,col)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    return DecisionNode(ques,true_branch, false_branch)

# Classify the predicted Node

In [17]:
def classify(row, node):
    if isinstance(node, Leaf):
        return node.dict
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

# Function to calculate accuracy

In [18]:
def accuracy(result,c):
    pred_label = []
    max1 = 0
    max2 = 0
    val = 0
    for i in range(len(result)):
        d = 0.0
        if len(result[i]) == 2: 
            for k,v in result[i].items():
                #max1 = int(v.replace("%",""))
                max1 = int(v)
                d = k
                if max1 > max2:
                    d = k
        else:
            for k,v in result[i].items():
                d = k
        pred_label.append(d)
    pred_label = np.asarray(pred_label)
    count = 0
    for i in range(len(c)):
        if c[i] == pred_label[i]:
            count +=1
    return (count/len(c)*100)

In [19]:
def randomforest():
    total_feature = train_data.shape[1]-1
    no_of_itr = 10
    acc = []
    total_tree = []
    for itr in range(no_of_itr):
        rand = np.arange(total_feature)
        np.random.shuffle(rand)
        features_to_train = 25
        sample = 100
        np.random.shuffle(train_data)
        X_train = []
        X_train = np.asarray(X_train)
        X_train = train_data[:,rand[0]]
        for i in range(1,features_to_train):
            X_train = np.column_stack((X_train,train_data[:,rand[i]]))
        X_train = np.column_stack((X_train,train_data[:,-1]))
        X_tr = X_train[:sample,:]
        tree = build_tree(X_train)
        total_tree.append(tree)
        result = []
        for i in range(len(X_test)):
            result.append(classify(X_test[i], tree))
        res = accuracy(result,y_test)
        print(res)
        acc.append(res)
    acc = np.asarray(acc)
    return np.max(acc)

In [20]:
randomforest()


69.15278783490224
23.533671252715425
52.78783490224475
56.55322230267922
65.38740043446778
51.04996379435192
43.95365677045619
59.81173062997828
33.1643736422882
55.17740767559739


69.15278783490224

NameError: name 'a' is not defined

In [None]:
r.shape

In [None]:
if __name__ == "__main__":
    run_decision_tree_CV()

In [None]:
tree = build_tree(train_data)
result = []
for i in range(len(X_test)):
    result.append(classify(X_test[i], tree))
accuracy(result,y_test)