In [7]:
import pandas as pd
headers = ["ID","CT","UCSize","UCShape","MA","SECSize","BN","BC","NN","Mitoses","Diagnosis"]
data = pd.read_csv('breast-cancer-wisconsin.data', na_values='?',    
         header=None, index_col=['ID'], names = headers) 
data = data.reset_index(drop=True)
data = data.fillna(0)
data.describe()

Unnamed: 0,CT,UCSize,UCShape,MA,SECSize,BN,BC,NN,Mitoses,Diagnosis
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.463519,3.437768,2.866953,1.589413,2.689557
std,2.815741,3.051459,2.971913,2.855379,2.2143,3.640708,2.438364,3.053634,1.715078,0.951273
min,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [8]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import random
from pprint import pprint

In [9]:
%matplotlib inline
sns.set_style("darkgrid")

In [10]:
cancer = pd.read_csv('breast-cancer-wisconsin.data', na_values='?',    
         header=None, index_col=['ID'], names = headers) ;
cancer.head(20)

Unnamed: 0_level_0,CT,UCSize,UCShape,MA,SECSize,BN,BC,NN,Mitoses,Diagnosis
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1.0,3,1,1,2
1002945,5,4,4,5,7,10.0,3,2,1,2
1015425,3,1,1,1,2,2.0,3,1,1,2
1016277,6,8,8,1,3,4.0,3,7,1,2
1017023,4,1,1,3,2,1.0,3,1,1,2
1017122,8,10,10,8,7,10.0,9,7,1,4
1018099,1,1,1,1,2,10.0,3,1,1,2
1018561,2,1,2,1,2,1.0,3,1,1,2
1033078,2,1,1,1,2,1.0,1,1,5,2
1033078,4,2,1,1,2,1.0,2,1,1,2


In [15]:
def train_test_split(cancer, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(cancer))

    indices = cancer.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_cancer = cancer.loc[test_indices]
    train_cancer = cancer.drop(test_indices)
    
    return train_cancer, test_cancer

In [16]:
random.seed(0)
train_cancer, test_cancer = train_test_split(cancer, test_size=20)

In [17]:
data = train_cancer.values
data[:5]

array([[ 5.,  1.,  1.,  1.,  2.,  1.,  3.,  1.,  1.,  2.],
       [ 5.,  4.,  4.,  5.,  7., 10.,  3.,  2.,  1.,  2.],
       [ 3.,  1.,  1.,  1.,  2.,  2.,  3.,  1.,  1.,  2.],
       [ 6.,  8.,  8.,  1.,  3.,  4.,  3.,  7.,  1.,  2.],
       [ 4.,  1.,  1.,  3.,  2.,  1.,  3.,  1.,  1.,  2.]])

In [18]:
def check_purity(data):
    
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

In [19]:
def classify_data(data):
    
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)

    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    
    return classification

In [21]:
def get_potential_splits(data):
    
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):        # excluding the last column which is the label
        potential_splits[column_index] = []
        values = data[:, column_index]
        unique_values = np.unique(values)

        for index in range(len(unique_values)):
            if index != 0:
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                potential_split = (current_value + previous_value) / 2
                
                potential_splits[column_index].append(potential_split)
    
    return potential_splits

In [22]:
def split_data(data, split_column, split_value):
    
    split_column_values = data[:, split_column]

    data_below = data[split_column_values <= split_value]
    data_above = data[split_column_values >  split_value]
    
    return data_below, data_above

In [23]:
def calculate_entropy(data):
    
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
     
    return entropy

In [24]:
def calculate_overall_entropy(data_below, data_above):
    
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n

    overall_entropy =  (p_data_below * calculate_entropy(data_below) 
                      + p_data_above * calculate_entropy(data_above))
    
    return overall_entropy

In [25]:
def determine_best_split(data, potential_splits):
    
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)

            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value

In [27]:
sub_tree = {"question": ["yes_answer", 
                         "no_answer"]}

In [35]:
def determine_type_of_feature(cancer):
    
    feature_types = []
    n_unique_values_treshold = 15
    for feature in cancer.columns:
        if feature != "label":
            unique_values = cancer[feature].unique()
            example_value = unique_values[0]

            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")
    
    return feature_types

In [36]:
def decision_tree_algorithm(cancer, counter=0):
    
    # data preparations
    if counter == 0:
        data = cancer.values
    else:
        data = cancer           
    
    
    # base cases
    if check_purity(data):
        classification = classify_data(data)
        return classification

    
    # recursive part
    else:    
        counter += 1

        # helper functions 
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)
        
        # instantiate sub-tree
        question = "{} <= {}".format(split_column, split_value)
        sub_tree = {question: []}
        
        # find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter)
        no_answer = decision_tree_algorithm(data_above, counter)
        
        sub_tree[question].append(yes_answer)
        sub_tree[question].append(no_answer)
        
        return sub_tree

In [37]:
tree = decision_tree_algorithm(train_cancer, max_depth=3)
pprint(tree)

TypeError: decision_tree_algorithm() got an unexpected keyword argument 'max_depth'

In [38]:
sub_tree

{'question': ['yes_answer', 'no_answer']}

In [39]:
example = test_cancer.iloc[0]
example

CT           1.0
UCSize       2.0
UCShape      3.0
MA           1.0
SECSize      2.0
BN           1.0
BC           2.0
NN           1.0
Mitoses      1.0
Diagnosis    2.0
Name: 1158405, dtype: float64

In [40]:
def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")

    # ask question
    if comparison_operator == "<=":  # feature is continuous
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)

In [41]:
classify_example(example, tree)

NameError: name 'tree' is not defined

In [42]:
def calculate_accuracy(cancer, tree):

    cancer["classification"] = cancer.apply(classify_example, axis=1, args=(tree,))
    cancer["classification_correct"] = cancer["classification"] == cancer["label"]
    
    accuracy = cancer["classification_correct"].mean()
    
    return accuracy

In [43]:
accuracy = calculate_accuracy(test_cancer, tree)
accuracy

NameError: name 'tree' is not defined