## Decision Tree for Spambase

In [113]:
import numpy as np
import pandas as pd

### Get data

In [114]:
def get_data(column_names):
    data_frame = pd.read_csv('./data/spambase.txt', sep = ',')
    data_frame.columns = column_names
    
    return data_frame

In [115]:
column_names = ['word_freq_make','word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 
               'word_freq_over','word_freq_remove','word_freq_internet','word_freq_order','word_freq_mail',
               'word_freq_receive','word_freq_will','word_freq_people','word_freq_report','word_freq_addresses',
               'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you',
               'word_freq_credit','word_freq_your','word_freq_font','word_freq_000','word_freq_money',
               'word_freq_hp','word_freq_hpl','word_freq_george','word_freq_650','word_freq_lab',
               'word_freq_labs','word_freq_telnet','word_freq_857','word_freq_data','word_freq_415',
               'word_freq_85','word_freq_technology','word_freq_1999','word_freq_parts','word_freq_pm',
               'word_freq_direct','word_freq_cs','word_freq_meeting','word_freq_original','word_freq_project',
               'word_freq_re','word_freq_edu','word_freq_table','word_freq_conference','char_freq_;',
               'char_freq_(','char_freq_[','char_freq_!','char_freq_$','char_freq_#','capital_run_length_average',
               'capital_run_length_longest','capital_run_length_total','spam_label']
dataframe = get_data(column_names)

In [116]:
def normalize(dataset):
    
    maxs = dataset.max()
    mins = dataset.min()
    
    for feature in dataset.columns[:-1]:        
        for i, entry in dataset.iterrows():
            dataset.at[i, feature] = (entry[feature] - mins[feature]) / (maxs[feature] - mins[feature])
            
    return dataset

In [117]:
dataframe = dataframe.sample(frac = 1)
dataframe = normalize(dataframe)

In [118]:
dataframe.describe()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam_label
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,...,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,0.023034,0.01491,0.055015,0.001529,0.031222,0.016313,0.015713,0.00948,0.017127,0.013172,...,0.008799,0.01426,0.004161,0.008281,0.012632,0.002231,0.003806,0.000217,0.000217,0.393913
std,0.067266,0.090385,0.098857,0.032593,0.067259,0.046573,0.053849,0.036104,0.052974,0.035468,...,0.05553,0.027725,0.026809,0.025116,0.040964,0.021655,0.028809,0.014744,0.014744,0.488669
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000534,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.006665,0.0,0.0,0.0,0.0,0.001158,0.0,0.0,0.0
75%,0.0,0.0,0.082353,0.0,0.03825,0.0,0.0,0.0,0.0,0.008801,...,0.0,0.019278,0.0,0.009676,0.008662,0.0,0.002456,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [119]:
def get_thresholds(dataset, feature):    
    ts = []

    for entry in range(len(dataset) - 1):
        ts.append((dataset.iloc[entry][feature] + dataset.iloc[entry+1][feature]) / 2)

    return ts

In [120]:
def get_best_split(dataset):
    
    best_feature = 'word_freq_make'
    best_threshold = 0
    max_info_gain = 0
    
    gini_before = gini(dataset)
    
    for feature in dataset.columns[:-1]:
        
        thresholds = get_thresholds(dataset, feature)
        
        for threshold in thresholds:
            left, right = split_data(dataset, feature, threshold)
            
            if len(left) == 0 or len(right) == 0:
                continue
                
            left_gini = gini(left)
            right_gini = gini(right)
                        
            gini_after = (left_gini + right_gini)
            info_gain = gini_before - gini_after

            if max_info_gain <= info_gain:
                max_info_gain = info_gain
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold, max_info_gain

In [121]:
class Terminal:

    def __init__(self, dataset):
        self.prediction = dataset.spam_label.mode()[0]

    def predict(self):
        return self.prediction

In [122]:
class Node:

    def __init__(self, feature, threshold, left_node, right_node):
        self.feature = feature
        self.threshold = threshold
        self.left_node = left_node
        self.right_node = right_node

In [123]:
def get_value_count(dataset):
    return dataset.spam_label.value_counts()

In [124]:
def gini(dataset):
    counts = get_value_count(dataset)
    
    imp = 1
    for x in counts:
        prob = x / len(dataset)
        imp -= prob**2
        
    return imp

In [125]:
def split_data(dataset, feature, threshold):    
    
    left = dataset[dataset[feature] < threshold]
    right = dataset[dataset[feature] >= threshold]
    
    return left, right

In [128]:
def build_tree(dataset, depth, max_depth = 5):
      
    best_feature, best_threshold, info_gain = get_best_split(dataset)
    
    if info_gain == 0 or depth >= max_depth:
        return Terminal(dataset)
    
    left_data, right_data = split_data(dataset, best_feature, best_threshold)

    left_node = build_tree(left_data, depth+1)
    right_node = build_tree(right_data, depth+1)

    return Node(best_feature, best_threshold, left_node, right_node)

In [129]:
def get_kfold(dataframe, num_folds):
    
    test_data_size = int(20 * len(dataframe) / 100)
    test_data = dataframe[:test_data_size]
    
    dataframe = dataframe[test_data_size:]
    
    fold_size = int(len(dataframe) / num_folds)
    folds = []
    
    start = 0
    
    for fold in range(num_folds):
        folds.append(dataframe[start:start+fold_size])
        start += fold_size
    
    return folds, test_data

In [135]:
# folds, test_data = get_kfold(dataframe, num_folds = 5)
test_data_size = int(20 * len(dataframe) / 100)
test_data = dataframe[:test_data_size]

dataframe = dataframe[test_data_size:]

In [136]:
def break_folds(folds):
    models = {}
    acc = {}
    
    for k in range(len(folds)):
        validate = folds[k]
            
        train = pd.DataFrame()
        for f in folds:
            if not f.equals(validate):
                train = pd.concat([train, f])
        
        model = build_tree(train, 0, 5)
        models[k] = model
        acc[k] = test_model(model, validate)
        
    return models, acc

In [137]:
def predict(root, entry):
    
    if isinstance(root, Terminal):
        return root.predict()
    
    if entry[root.feature] < root.threshold:
        result = predict(root.left_node, entry)
    else:
        result = predict(root.right_node, entry)
        
    return result

In [138]:
def test_model(model, test_data):
    
    predictions = []
    
    for i, entry in test_data.iterrows():
        predictions.append(predict(model, entry))
    
    correct_count = 0
    
    for i,p in enumerate(predictions):
        if test_data.iloc[i]['spam_label'] == p:
            correct_count += 1
        
    return correct_count / len(test_data)

In [139]:
model = build_tree(dataframe, 0, 5)

In [140]:
acc = test_model(model, dataframe)
print('Accuracy: {}'.format(acc))

Accuracy: 0.6345108695652174


In [112]:
models, acc = break_folds(folds)
acc

NameError: name 'test_model' is not defined

In [87]:
acc = test_model(model, dataframe)
print('Accuracy: {}'.format(acc))

<class 'list'>
Accuracy: 0.6060869565217392


0