In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the tab-separated, header-less file and keep it as a NumPy array.
dataset = np.array(pd.read_csv('project3_dataset2.txt', sep="\t", header=None))
# Row and column counts of the loaded table.
nrows, ncol = dataset.shape
print(dataset)

[[132 6.2 6.47 ... 14.14 45 0]
 [123 0.05 4.61 ... 2.78 16 0]
 [128 0.5 3.7 ... 22.73 28 0]
 ...
 [138 4.5 2.85 ... 24.89 56 1]
 [170 7.6 5.5 ... 6.17 54 1]
 [128 0.0 10.58 ... 14.66 48 0]]


In [3]:
def check_termination(data):
    """Tell whether every row of ``data`` carries the same class label.

    The label is taken from the last column of ``data`` itself, so the
    function does not depend on any notebook-level column count.

    Parameters
    ----------
    data : 2-D numpy array whose last column holds the class label.

    Returns
    -------
    bool
        True when exactly one distinct label is present, False otherwise
        (including when ``data`` has no rows).
    """
    return len(np.unique(data[:, -1])) == 1

In [4]:
def classify(data):
    """Return the most frequent class label in ``data``.

    The label is read from the last column of ``data``. When several labels
    share the highest count, the smallest one (first in ``np.unique`` order)
    is returned.

    Parameters
    ----------
    data : 2-D numpy array whose last column holds the class label.

    Raises
    ------
    ValueError
        If ``data`` has no rows, since there is nothing to vote on.
    """
    labels = data[:, -1]
    if len(labels) == 0:
        raise ValueError("classify() needs at least one row to pick a label")
    classes, counts = np.unique(labels, return_counts=True)
    return classes[np.argmax(counts)]

In [5]:
def get_splits(data):
    """Collect the candidate split values for every feature column.

    The number of feature columns is derived from ``data`` itself (all
    columns except the last, which is the label). The kind of each feature
    is looked up in the global ``FEATURE_TYPES`` list.

    Parameters
    ----------
    data : 2-D numpy array; the last column is the label.

    Returns
    -------
    dict
        Maps column index -> candidates. For a "continuous" feature this is
        a list of midpoints between consecutive sorted unique values (empty
        when the column holds a single value). For any other feature type it
        is the array of unique values.
    """
    potential_splits = {}
    n_features = data.shape[1] - 1  # the last column is the label
    for i in range(n_features):
        unique_values = np.unique(data[:, i])
        if FEATURE_TYPES[i] == "continuous":
            # Pair every sorted unique value with its successor.
            potential_splits[i] = [
                (upper + lower) / 2
                for lower, upper in zip(unique_values[:-1], unique_values[1:])
            ]
        else:
            potential_splits[i] = unique_values

    return potential_splits

In [9]:
def split_data(data, column, svalue):
    """Partition the rows of ``data`` around ``svalue`` in one column.

    Returns a pair ``(left, right)``: ``left`` holds the rows whose value in
    ``column`` is ``<= svalue`` and ``right`` the rows whose value is
    ``> svalue``.
    """
    feature = data[:, column]
    at_or_below = feature <= svalue
    above = feature > svalue
    return data[at_or_below], data[above]

In [10]:
def change_feature(dataset):
    """Return a copy of ``dataset`` with categorical columns integer-coded.

    For every column flagged "categorical" in the global ``FEATURE_TYPES``,
    each value is replaced by its position in the sorted list of that
    column's unique values. Other columns are left as they are.

    The input array is NOT modified: encoding happens on a copy, so the
    caller's raw data (and anything derived from it, such as the detected
    feature types) stays valid if the surrounding cell is run again.

    Parameters
    ----------
    dataset : 2-D numpy array.

    Returns
    -------
    numpy array of the same shape with the categorical columns encoded.
    """
    encoded = dataset.copy()
    for i, kind in enumerate(FEATURE_TYPES):
        if kind != "categorical":
            continue
        column = dataset[:, i]
        # One lookup table per column instead of a list search per cell.
        codes = {value: code
                 for code, value in enumerate(np.unique(column).tolist())}
        encoded[:, i] = [codes[value] for value in column.tolist()]
    return encoded

In [11]:
def feature_types(df):
    """Label every feature column as "categorical" or "continuous".

    Only the first row, ``df[0]``, is inspected and its last entry is
    skipped. An entry that is a ``str`` marks its column "categorical";
    any other entry marks it "continuous".

    Returns a list with one label per inspected column, in column order.
    """
    first_row = df[0]
    kinds = []
    for position in range(len(first_row) - 1):
        is_text = isinstance(first_row[position], str)
        kinds.append("categorical" if is_text else "continuous")
    return kinds

In [12]:
def gini(data):
    """Compute the Gini impurity of the labels in ``data``.

    The label is read from the last column of ``data``. The impurity is
    ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of rows carrying
    label ``k``.

    Parameters
    ----------
    data : 2-D numpy array whose last column holds the class label.

    Returns
    -------
    The impurity as a float: 0 when all rows share one label. For an array
    with no rows the sum is empty and the result is 1.
    """
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)
    prob = counts / counts.sum()
    return 1 - np.sum(prob ** 2)


In [13]:
def overall_gini_func(left, right):
    """Return the size-weighted Gini impurity of a two-way split.

    Each side's impurity (from ``gini``) is weighted by that side's share
    of the combined row count, and the two weighted terms are added.
    """
    total_rows = len(left) + len(right)
    left_term = (len(left) / total_rows) * gini(left)
    right_term = (len(right) / total_rows) * gini(right)
    return left_term + right_term

In [14]:
def determine_best_split(data, potential_splits):
    """Pick the (column, value) split with the lowest weighted Gini impurity.

    Every candidate in ``potential_splits`` is tried with ``split_data`` and
    scored with ``overall_gini_func``. On equal scores the candidate seen
    last wins (the comparison is ``<=``).

    Parameters
    ----------
    data : 2-D numpy array; the last column is the label.
    potential_splits : dict mapping column index -> iterable of candidate
        split values, as produced by ``get_splits``.

    Returns
    -------
    tuple ``(best_split_column, best_split_value)``.

    Raises
    ------
    ValueError
        If no candidate could be selected, e.g. every column's candidate
        list is empty. Previously this surfaced as an UnboundLocalError.
    """
    overall_gini = 1
    best_split_column = None
    best_split_value = None
    for i, candidates in potential_splits.items():
        for value in candidates:
            left, right = split_data(data, column=i, svalue=value)
            current_overall_gini = overall_gini_func(left, right)
            if current_overall_gini <= overall_gini:
                overall_gini = current_overall_gini
                best_split_column = i
                best_split_value = value
    if best_split_column is None:
        raise ValueError("determine_best_split(): no candidate split values to choose from")
    return best_split_column, best_split_value

In [None]:
# get_splits() reads the global FEATURE_TYPES, which is otherwise first assigned
# in a later cell; define it here so this cell also runs on a fresh kernel.
FEATURE_TYPES = feature_types(dataset)
print(determine_best_split(dataset, get_splits(dataset)))

In [21]:
def decision_tree(data, depth=0, min_samples=2, max_depth=5):
    """Recursively build a binary decision tree from ``data``.

    Parameters
    ----------
    data : 2-D numpy array; the last column is the label.
    depth : current recursion depth (0 for the root call).
    min_samples : a node with fewer rows than this becomes a leaf.
    max_depth : a node at exactly this depth becomes a leaf.

    Returns
    -------
    Either a leaf (the label returned by ``classify``) or a dict of the form
    ``{"<column> <= <value>": [subtree_if_true, subtree_if_false]}``. When
    both subtrees come out equal the node collapses to that subtree.

    A node also becomes a leaf when nothing can be split: either no feature
    has any candidate split value, or the chosen split would leave one side
    without rows.
    """
    if check_termination(data) or len(data) < min_samples or depth == max_depth:
        return classify(data)

    potential_splits = get_splits(data)
    # Rows may differ in label yet be identical in every feature; then there
    # is no candidate at all and picking a "best" split is impossible.
    if not any(len(candidates) for candidates in potential_splits.values()):
        return classify(data)

    split_column, split_value = determine_best_split(data, potential_splits)
    left, right = split_data(data, split_column, split_value)
    # A split that sends every row to one side separates nothing; recursing
    # into the empty side would fail in classify().
    if len(left) == 0 or len(right) == 0:
        return classify(data)

    condition = "{} <= {}".format(split_column, split_value)
    one = decision_tree(left, depth + 1, min_samples, max_depth)
    zero = decision_tree(right, depth + 1, min_samples, max_depth)
    if one == zero:
        # Both branches agree, so the test adds no information.
        return one
    return {condition: [one, zero]}

In [22]:
FEATURE_TYPES = feature_types(dataset)
COLUMN_NUMBERS = list(range(len(dataset[0])))
# Encode a copy so `dataset` keeps its raw string categories. Encoding it in
# place would make a second run of this cell see numbers where the strings
# were and label those columns "continuous".
dataset1 = change_feature(dataset.copy())
print(decision_tree(dataset1))


{'8 <= 50.5': [{'8 <= 30.5': [{'1 <= 0.51': [{'6 <= 18.48': [{'6 <= 18.175': [0, 1]}, 0]}, {'7 <= 11.105': [0, {'6 <= 25.38': [1, 0]}]}]}, {'5 <= 68.5': [{'5 <= 53.5': [0, {'6 <= 23.240000000000002': [1, 0]}]}, {'6 <= 23.990000000000002': [0, {'0 <= 191.0': [1, 0]}]}]}]}, {'4 <= 0.5': [{'1 <= 7.605': [{'2 <= 10.34': [0, 1]}, {'3 <= 28.955': [1, {'7 <= 7.33': [1, 0]}]}]}, {'2 <= 4.99': [{'3 <= 27.985': [{'0 <= 127.0': [0, 1]}, {'1 <= 4.15': [0, 1]}]}, {'0 <= 121.5': [{'5 <= 61.0': [0, 1]}, {'6 <= 38.875': [1, 0]}]}]}]}]}
