In [1]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import random
from pprint import pprint

In [2]:
# Column names for the CSV, in file order: 48 token columns, six char_freq
# columns, three cap_run_length columns and finally the class label.
# The data file is read with header=None, so every name is supplied here.
word_labels = [
    # token columns
    "make", "address", "all", "3d", "our", "over", "remove", "internet",
    "order", "mail", "receive", "will", "people", "report", "addresses",
    "free", "business", "email", "you", "credit", "your", "font", "000",
    "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
    "data", "415", "85", "technology", "1999", "parts", "pm", "direct", "cs",
    "meeting", "original", "project", "re", "edu", "table", "conference",
    # character columns
    "char_freq1", "char_freq2", "char_freq3",
    "char_freq4", "char_freq5", "char_freq6",
    # capital-run columns
    "cap_run_length_avg", "cap_run_length_longest", "cap_run_length_total",
    # target
    "label",
]
df = pd.read_csv("spambase/spambase.data", header=None, names=word_labels)

In [3]:
# Preview the first rows to check that the supplied column names line up with the values.
df.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq1,char_freq2,char_freq3,char_freq4,char_freq5,char_freq6,cap_run_length_avg,cap_run_length_longest,cap_run_length_total,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [4]:
# Summarise row count, per-column dtype and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
make                      4601 non-null float64
address                   4601 non-null float64
all                       4601 non-null float64
3d                        4601 non-null float64
our                       4601 non-null float64
over                      4601 non-null float64
remove                    4601 non-null float64
internet                  4601 non-null float64
order                     4601 non-null float64
mail                      4601 non-null float64
receive                   4601 non-null float64
will                      4601 non-null float64
people                    4601 non-null float64
report                    4601 non-null float64
addresses                 4601 non-null float64
free                      4601 non-null float64
business                  4601 non-null float64
email                     4601 non-null float64
you                       460

In [5]:
# Recode the numeric class column into readable names (0 -> 'non-spam', 1 -> 'spam').
# The result is assigned back to the column instead of calling an inplace method
# on `df['label']`: that chained form acts on a temporary Series, which pandas
# warns about and which leaves `df` untouched under copy-on-write.
# `replace` ignores values that are not keys of the mapping, so re-running this
# cell after the labels are already strings is harmless.
df['label'] = df['label'].replace({0: 'non-spam', 1: 'spam'})
df.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq1,char_freq2,char_freq3,char_freq4,char_freq5,char_freq6,cap_run_length_avg,cap_run_length_longest,cap_run_length_total,label
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,spam
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,spam
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,spam
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,spam
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,spam


### Splitting the data into test and train

In [6]:
# Splitting the data set into train and test
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows are selected by index label.
    test_size : float
        Fraction of the rows to place in the test frame. The row count is
        ``round(test_size * len(df))``.

    Returns
    -------
    (train_df, test_df)
        ``test_df`` holds the sampled rows in sampling order; ``train_df`` is
        ``df`` with those rows dropped.

    Notes
    -----
    Sampling uses the module-level ``random`` generator, so call
    ``random.seed(...)`` beforehand for a reproducible split.
    """
    n_test = round(test_size * len(df))
    held_out = random.sample(population=df.index.tolist(), k=n_test)
    return df.drop(held_out), df.loc[held_out]

In [7]:
# Seed Python's `random` module (the sampler used by train_test_split) so the
# split is the same on every run, then hold out 20% of the rows for testing.
random.seed(0)
train_df, test_df = train_test_split(df, test_size=0.20)

In [8]:
# Convert the training DataFrame to a 2-D NumPy array; the helper functions
# below index it positionally and read the class label from the last column.
data = train_df.values

In [9]:
# Peek at the first five rows of the array.
data[:5]

array([[0.0, 0.0, 0.0, 0.0, 0.63, 0.0, 0.31, 0.63, 0.31, 0.63, 0.31,
        0.31, 0.31, 0.0, 0.0, 0.31, 0.0, 0.0, 3.18, 0.0, 0.31, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.13699999999999998, 0.0, 0.13699999999999998, 0.0, 0.0, 3.537,
        40, 191, 'spam'],
       [0.0, 0.0, 0.0, 0.0, 0.63, 0.0, 0.31, 0.63, 0.31, 0.63, 0.31,
        0.31, 0.31, 0.0, 0.0, 0.31, 0.0, 0.0, 3.18, 0.0, 0.31, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.135, 0.0, 0.135, 0.0, 0.0, 3.537, 40, 191, 'spam'],
       [0.0, 0.0, 0.0, 0.0, 1.85, 0.0, 0.0, 1.85, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [10]:
def check_purity(data):
    """Tell whether a bucket of rows is pure, i.e. holds exactly one class.

    ``data`` is a 2-D array whose last column is the class label. Returns
    ``True`` when that column contains exactly one distinct value and
    ``False`` otherwise (including when there are no rows at all).
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [11]:
def data_classifier(data):
    """Return the majority class label of ``data``.

    ``data`` is a 2-D array whose last column is the class label. The label
    that occurs most often is returned; on a tie the label that sorts first
    wins, because ``np.unique`` returns sorted labels and ``argmax`` picks the
    first maximum.
    """
    label_values, label_counts = np.unique(data[:, -1], return_counts=True)
    return label_values[label_counts.argmax()]

In [12]:
def get_potential_splits(data):
    """Collect candidate split thresholds for every feature column.

    ``data`` is a 2-D array whose last column is the class label and is
    therefore skipped. For each remaining column the candidates are the
    midpoints between consecutive sorted unique values.

    Returns a dict mapping column index -> list of thresholds. A column with
    fewer than two distinct values maps to an empty list.
    """
    n_columns = data.shape[1]
    candidates = {}
    for feature_idx in range(n_columns - 1):
        distinct = np.unique(data[:, feature_idx])
        # Pair each unique value with its successor and take the midpoint.
        candidates[feature_idx] = [
            (upper + lower) / 2
            for lower, upper in zip(distinct[:-1], distinct[1:])
        ]
    return candidates
                

In [13]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one feature threshold.

    Returns ``(data_below, data_above)``: the rows whose value in column
    ``split_column`` is ``<= split_value`` and the rows whose value is
    ``> split_value``, each in their original order.
    """
    feature = data[:, split_column]
    at_or_below = feature <= split_value
    strictly_above = feature > split_value
    return data[at_or_below], data[strictly_above]


In [14]:
def calculate_entropy(data):
    """Shannon entropy (in bits) of the class labels in ``data``.

    ``data`` is a 2-D array whose last column is the class label. The result
    is ``sum(p_i * -log2(p_i))`` over the relative frequency ``p_i`` of each
    distinct label.
    """
    counts = np.unique(data[:, -1], return_counts=True)[1]
    class_probs = counts / counts.sum()
    surprisal = -np.log2(class_probs)
    return sum(class_probs * surprisal)

In [15]:
def total_entropy(data_below, data_above):
    """Size-weighted entropy of the two buckets produced by a split.

    Each bucket's entropy (from ``calculate_entropy``) is weighted by the
    share of all rows that fall into that bucket, and the two weighted terms
    are added together.
    """
    n_below, n_above = len(data_below), len(data_above)
    n_total = n_below + n_above
    share_below = n_below / n_total
    share_above = n_above / n_total
    return (share_below * calculate_entropy(data_below)
            + share_above * calculate_entropy(data_above))

In [16]:
#look at the potential splits and determine the split that results in lowest overall entropy
def determine_best_split(data, potential_splits):
    """Pick the (column, threshold) pair with the lowest weighted entropy.

    Parameters
    ----------
    data : 2-D array whose last column is the class label.
    potential_splits : dict
        Maps column index -> list of candidate thresholds, as produced by
        ``get_potential_splits``.

    Returns
    -------
    (best_split_column, best_split_value)
        The candidate whose two buckets have the lowest ``total_entropy``.
        When several candidates tie, the one visited last wins. If there are
        no candidates at all, ``(None, None)`` is returned.
    """
    # Plain locals: the result must never leak in from an earlier call.
    best_split_column = None
    best_split_value = None
    lowest_total_entropy = float("inf")

    for col_ind, thresholds in potential_splits.items():
        for val in thresholds:
            data_below, data_above = split_data(data, split_column=col_ind, split_value=val)
            curr_total_entropy = total_entropy(data_below, data_above)
            # `<=` (not `<`) keeps the last of several equally good candidates.
            if curr_total_entropy <= lowest_total_entropy:
                lowest_total_entropy = curr_total_entropy
                best_split_column = col_ind
                best_split_value = val

    return best_split_column, best_split_value

## Algorithm

In [17]:
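# Tree representation: sub_tree = {decision_factor: [true_ans, false_ans]}
# decision_factor is a question string of the form "<feature> <= <value>";
# true_ans / false_ans are each either a class label (leaf) or a nested sub_tree.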

In [18]:
def decision_tree(df, counter = 0):
    """Recursively grow a decision tree from labelled data.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array
        On the initial call (``counter == 0``) a DataFrame whose last column
        is the class label. The recursive calls made by this function pass
        plain 2-D arrays instead.
    counter : int
        0 marks the initial call; it is incremented before each recursion.

    Returns
    -------
    A class label for a leaf, or a dict
    ``{"<feature> <= <value>": [true_ans, false_ans]}`` where each answer is
    itself a label or a nested dict of the same form.

    Side effects
    ------------
    The initial call stores ``df.columns`` in the module-level name
    ``COLUMN_HEADERS`` so that recursive calls, which only see raw arrays,
    can still name the feature they split on.
    """
    global COLUMN_HEADERS
    if counter == 0:
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df

    # Base case: every row has the same label, so this node is a leaf.
    if check_purity(data):
        return data_classifier(data)

    potential_splits = get_potential_splits(data)

    # Second base case: the labels are mixed but every feature column holds a
    # single value (e.g. identical rows with conflicting labels), so there is
    # no threshold to split on. Fall back to a majority-class leaf rather
    # than recursing on an undefined split.
    if not any(potential_splits.values()):
        return data_classifier(data)

    counter += 1

    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    feature_name = COLUMN_HEADERS[split_column]
    decision_factor = "{} <= {}".format(feature_name, split_value)

    # Rows answering "yes" to the question go first, "no" second.
    true_ans = decision_tree(data_below, counter)
    false_ans = decision_tree(data_above, counter)

    return {decision_factor: [true_ans, false_ans]}

In [None]:
# Grow the tree on the training frame and display the resulting nested dict.
tree = decision_tree(train_df)
tree