<a href="https://colab.research.google.com/github/jrCruz82/data_minor/blob/Dtree/DecisionTreeProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [302]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
import math


In [303]:
df=pd.read_csv('heart_failure_clinical_records_dataset.csv')
df['DEATH_EVENT'].value_counts()

0    203
1     96
Name: DEATH_EVENT, dtype: int64

In [304]:
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [305]:
X = df[['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time']].values


In [306]:
y = df['DEATH_EVENT'].values


In [307]:
def check_purity(data):
    label_column = data[:,-1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

def classify_data(data):
    label_column = data[:,-1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)

    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    return classification

def get_potential_splits(data):
    potential_splits = {}
    _, n_columns = data.shape
    
    for column_index in range(n_columns - 1):
        potential_splits[column_index] = []
        values = data[:, column_index]
        unique_values = np.unique(values)
        #potential_splits[column_index] = unique_values
        for index in range(len(unique_values)):
            if index != 0:
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                potential_split = (current_value + previous_value) / 2
                
                potential_splits[column_index].append(potential_split)        
    return potential_splits

def split_data(data, split_column, split_value):
    
    split_column_values = data[:,split_column]
    
    data_below = data[split_column_values <= split_value]
    data_above = data[split_column_values > split_value]

    return data_below, data_above

def calculate_entropy(data):
    label_column = data[:,-1]
    _, counts = np.unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
    return entropy

def calculate_overall_entropy(data_below, data_above):
    n_data_points = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n_data_points
    p_data_above = len(data_above) / n_data_points

    overall_entropy = (p_data_below * calculate_entropy(data_below)) + (p_data_above * calculate_entropy(data_above))

    return overall_entropy

def determine_best_split(data, potential_splits):
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below,data_above)

            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
    return best_split_column, best_split_value

def decision_tree_algorithm(df, counter=0, min_samples = 2, max_depth=10):

    if counter == 0:
        global COLUMN_HEADERS
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df

    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        return classification
    else:
        counter +=1
        
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)

        feature_name = COLUMN_HEADERS[split_column]
        question = '{} <= {}'.format(feature_name, split_value)
        sub_tree = {question:[]}

        yes_answer = decision_tree_algorithm(data_below, counter,min_samples,max_depth)
        no_answer = decision_tree_algorithm(data_above, counter,min_samples,max_depth)

        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)

        return sub_tree        

def predict_example(example, tree):
    
    # tree is just a root node
    if not isinstance(tree, dict):
        return tree
    
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return predict_example(example, residual_tree)

def make_predictions(df, tree):
    
    if len(df) != 0:
        predictions = df.apply(predict_example, args=(tree,), axis=1)
    else:
        # "df.apply()"" with empty dataframe returns an empty dataframe,
        # but "predictions" should be a series instead
        predictions = pd.Series()
        
    return predictions

def calculate_accuracy(df, tree):
    predictions = make_predictions(df, tree)
    predictions_correct = predictions == df.DEATH_EVENT
    accuracy = predictions_correct.mean()
    
    return accuracy

In [308]:
kfold = KFold(n_splits=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

Train set: (224, 12) (224,)
Test set: (75, 12) (75,)


In [309]:
dtc = DecisionTreeClassifier(random_state=0)
tree_model = dtc.fit(X_train,y_train)
kfold.split(X_train)
dtc_accuracy_model = []
dtc_model_time = []
mydtc_accuracy_model = []
mydtc_model_time = []
for train_index, test_index in kfold.split(X):
    
    # Split train-test
    X_train, X_test = df.iloc[train_index], df.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    start = int(round(time.time() * 1000))
    # Train the model
    tree_model = dtc.fit(X_train, y_train)
    # Append to accuracy_model the accuracy of the model
    dtc_accuracy_model.append(accuracy_score(y_test, tree_model.predict(X_test), normalize=True)*100)
    time_dtc = int(round(time.time() * 1000)) - start
    dtc_model_time.append(time_dtc)

    
    #X_train = np.insert(X_train,-1,y_train,axis=1)
    #X_train = pd.DataFrame(X_train)
    X_train.columns = df.columns
    start = int(round(time.time() * 1000))
    tree = decision_tree_algorithm(X_train)
    
    mydtc_accuracy_model.append(calculate_accuracy(X_test,tree)*100)
    time_mydtc = int(round(time.time() * 1000)) - start
    mydtc_model_time.append(time_mydtc)

    diff = [(i / j)*100 for i, j in zip(mydtc_model_time,dtc_model_time)] 
results_df = pd.DataFrame()
results_df['dtc_accuracy_model'] = dtc_accuracy_model
results_df['dtc_model_time'] = dtc_model_time
results_df['mydtc_accuracy_model'] = mydtc_accuracy_model
results_df['mydtc_model_time'] = mydtc_model_time
results_df['%diff'] = diff
results_df

Unnamed: 0,dtc_accuracy_model,dtc_model_time,mydtc_accuracy_model,mydtc_model_time,%diff
0,100.0,4,80.0,476,11900.0
1,100.0,3,80.0,421,14033.333333
2,100.0,2,76.666667,386,19300.0
3,100.0,2,73.333333,354,17700.0
4,100.0,4,86.666667,364,9100.0
5,100.0,3,83.333333,367,12233.333333
6,100.0,2,70.0,395,19750.0
7,100.0,3,66.666667,375,12500.0
8,100.0,2,86.666667,403,20150.0
9,100.0,3,82.758621,439,14633.333333


In [310]:
tree

{'time <= 67.5': [{'platelets <= 214500.0': [1.0,
    {'platelets <= 224500.0': [{'time <= 57.0': [0.0, 1.0]},
      {'age <= 66.5': [{'creatinine_phosphokinase <= 85.5': [0.0,
          {'platelets <= 307500.0': [{'time <= 24.5': [1.0,
              {'serum_sodium <= 134.5': [1.0,
                {'serum_sodium <= 142.5': [0.0, 1.0]}]}]},
            1.0]}]},
        1.0]}]}]},
  {'serum_creatinine <= 1.55': [{'ejection_fraction <= 27.5': [{'time <= 78.5': [1.0,
        {'time <= 148.0': [0.0,
          {'time <= 178.0': [1.0,
            {'time <= 210.5': [0.0,
              {'serum_creatinine <= 0.9500000000000001': [0.0, 1.0]}]}]}]}]},
      {'age <= 79.0': [{'creatinine_phosphokinase <= 2307.5': [{'serum_creatinine <= 0.6499999999999999': [{'time <= 123.0': [0.0,
              1.0]},
            {'platelets <= 349500.0': [0.0,
              {'serum_creatinine <= 1.2000000000000002': [0.0,
                {'serum_sodium <= 137.5': [1.0, 0.0]}]}]}]},
          {'diabetes <= 0.5': [0