In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import random
from pprint import pprint
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the dataset file names are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/decision-treesrandom-forests-2020/test.csv
/kaggle/input/decision-treesrandom-forests-2020/sample_submission.csv
/kaggle/input/decision-treesrandom-forests-2020/train.csv


In [2]:
# Location of the labelled training data for this competition.
TRAIN_CSV_PATH = r"/kaggle/input/decision-treesrandom-forests-2020/train.csv"

# Parse the file once. `train_data_df` is kept as an independent copy of the
# untouched training frame instead of re-reading the same CSV from disk.
df = pd.read_csv(TRAIN_CSV_PATH)
train_data_df = df.copy()

In [3]:
# Display the training frame; the last column (`cardio`) is the label that the
# tree-building functions below read via `data[:, -1]`.
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,17363,1,172,89.0,110,70,1,1,0,1,0,0
1,21200,1,168,68.0,110,70,1,1,0,0,1,0
2,17286,1,167,74.0,120,80,1,1,0,0,1,0
3,15822,1,157,61.0,90,70,1,1,0,0,1,0
4,20458,1,156,55.0,110,70,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,23279,1,170,72.0,120,80,1,1,0,0,0,0
3996,20450,1,165,58.0,120,80,1,1,0,0,1,1
3997,22085,2,170,71.0,120,90,1,1,0,0,1,1
3998,23212,1,164,73.0,130,90,1,1,0,0,1,1


In [4]:
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are selected and dropped by index label.
    test_size : float or int
        A float is treated as a proportion of ``len(df)`` and rounded to a
        row count; any other value is used directly as the number of test rows.

    Returns
    -------
    tuple
        ``(train_df, test_df)``: the rows that were not sampled, and the
        sampled rows in sampling order.

    Notes
    -----
    Sampling uses the global ``random`` module state, so call
    ``random.seed(...)`` beforehand for a reproducible split.
    """
    n_test_rows = round(test_size * len(df)) if isinstance(test_size, float) else test_size

    # Draw the held-out index labels without replacement.
    held_out_labels = random.sample(population=df.index.tolist(), k=n_test_rows)

    return df.drop(held_out_labels), df.loc[held_out_labels]

In [5]:
def check_purity(data):
    """Return True when every row of ``data`` carries the same label.

    ``data`` is a 2-D NumPy array whose last column holds the class label.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [6]:
def classify_data(data):
    """Return the most frequent label in ``data`` (majority vote).

    ``data`` is a 2-D NumPy array whose last column holds the class label.
    When several labels are tied, the smallest one wins: ``np.unique`` returns
    labels sorted and ``argmax`` picks the first maximum.
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)
    return labels[np.argmax(label_counts)]

In [7]:
def get_potential_splits(data):
    """Collect the candidate split values for every feature column.

    Returns a dict mapping each feature column index to the sorted unique
    values found in that column. The last column is the label and is skipped.
    """
    _, column_count = data.shape
    feature_columns = range(column_count - 1)  # last column is the label
    return {position: np.unique(data[:, position]) for position in feature_columns}

In [8]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one feature.

    The kind of test depends on the module-level ``FEATURE_TYPES`` entry for
    ``split_column``:

    * ``"continuous"``: rows with value ``<= split_value`` versus rows with
      value ``> split_value``.
    * anything else (categorical): rows equal to ``split_value`` versus rows
      that differ from it.

    Returns
    -------
    tuple
        ``(data_below, data_above)``: the rows answering "yes" to the test,
        then the rows answering "no".
    """
    column_values = data[:, split_column]

    if FEATURE_TYPES[split_column] == "continuous":
        yes_mask = column_values <= split_value
        no_mask = column_values > split_value
    else:
        # Categorical feature: test membership of a single category.
        yes_mask = column_values == split_value
        no_mask = column_values != split_value

    return data[yes_mask], data[no_mask]

In [9]:
def calculate_entropy(data):
    """Return the Shannon entropy (base 2) of the label column of ``data``.

    ``data`` is a 2-D NumPy array whose last column holds the class label.
    A single-class array has entropy 0.
    """
    class_counts = np.unique(data[:, -1], return_counts=True)[1]
    class_probabilities = class_counts / class_counts.sum()

    # Information content of each class, weighted by how often it occurs.
    surprisal = -np.log2(class_probabilities)
    return sum(class_probabilities * surprisal)

In [10]:
def calculate_overall_entropy(data_below, data_above):
    """Return the size-weighted average entropy of the two sides of a split.

    Each partition's entropy (from ``calculate_entropy``) is weighted by the
    fraction of the combined rows that fall into that partition.
    """
    rows_below = len(data_below)
    rows_above = len(data_above)
    total_rows = rows_below + rows_above

    weight_below = rows_below / total_rows
    weight_above = rows_above / total_rows

    return (weight_below * calculate_entropy(data_below)
            + weight_above * calculate_entropy(data_above))

In [11]:
def determine_best_split(data, potential_splits):
    """Find the (column, value) split with the lowest weighted entropy.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose last column holds the class label.
    potential_splits : dict
        Maps a feature column index to its candidate split values, as
        produced by ``get_potential_splits``.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value)``. Among candidates with equal
        entropy, the one visited last wins because the comparison is ``<=``.
    """
    lowest_entropy = 9999  # sentinel above any entropy the candidates can reach
    for column_index, candidate_values in potential_splits.items():
        for candidate in candidate_values:
            below, above = split_data(data, split_column=column_index, split_value=candidate)
            candidate_entropy = calculate_overall_entropy(below, above)

            if candidate_entropy <= lowest_entropy:
                lowest_entropy = candidate_entropy
                best_split_column, best_split_value = column_index, candidate

    return best_split_column, best_split_value

In [12]:
def determine_type_of_feature(df, n_unique_values_threshold=15):
    """Classify every feature column of ``df`` as categorical or continuous.

    The label is identified by position: the last column of ``df`` is the
    label and gets no entry. The other tree-building functions in this file
    locate the label the same way (``data[:, -1]``), so entry ``i`` of the
    result always describes column ``i`` of ``df.values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame with the label in the last column.
    n_unique_values_threshold : int, optional
        A column with at most this many distinct values is treated as
        categorical. Defaults to 15.

    Returns
    -------
    list of str
        One of ``"categorical"`` or ``"continuous"`` per feature column, in
        column order. A column is categorical when its values are strings or
        when it has few distinct values.
    """
    feature_types = []
    n_feature_columns = df.shape[1] - 1  # everything except the trailing label

    for column_position in range(n_feature_columns):
        # Positional access keeps this working even with duplicate column names.
        unique_values = df.iloc[:, column_position].unique()

        # Guard the sample lookup so an empty frame does not raise IndexError.
        holds_text = len(unique_values) > 0 and isinstance(unique_values[0], str)

        if holds_text or len(unique_values) <= n_unique_values_threshold:
            feature_types.append("categorical")
        else:
            feature_types.append("continuous")

    return feature_types

In [13]:
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5):
    """Recursively grow a decision tree from labelled data.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        On the initial call (``counter == 0``) a DataFrame whose last column
        is the label. On recursive calls, the 2-D array of the rows that
        reached this node.
    counter : int, optional
        Current depth; callers leave it at 0.
    min_samples : int, optional
        A node holding fewer rows than this becomes a leaf.
    max_depth : int, optional
        A node at this depth becomes a leaf.

    Returns
    -------
    dict or scalar
        A leaf is the majority label of the rows that reached it. An internal
        node is ``{question: [yes_subtree, no_subtree]}`` where ``question``
        reads ``"<feature> <= <value>"`` for continuous features and
        ``"<feature> = <value>"`` for categorical ones.

    Notes
    -----
    The initial call stores the column names and feature types in the
    module-level ``COLUMN_HEADERS`` and ``FEATURE_TYPES``. ``split_data`` and
    the question formatting below read them.
    """
    global COLUMN_HEADERS, FEATURE_TYPES

    # Data preparation: only the initial call receives a DataFrame.
    if counter == 0:
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df

    # Base cases: pure node, too few rows, or depth limit reached.
    if check_purity(data) or len(data) < min_samples or counter == max_depth:
        return classify_data(data)

    # Recursive part.
    counter += 1

    potential_splits = get_potential_splits(data)
    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    # A split that leaves one side empty separates nothing, so stop here.
    if len(data_below) == 0 or len(data_above) == 0:
        return classify_data(data)

    # Phrase the question according to the feature type.
    feature_name = COLUMN_HEADERS[split_column]
    if FEATURE_TYPES[split_column] == "continuous":
        question_template = "{} <= {}"
    else:
        question_template = "{} = {}"
    question = question_template.format(feature_name, split_value)

    # Grow both branches.
    yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)

    # If both branches give the same answer there is no point in asking the
    # question. This can happen when a branch is classified before it is pure
    # (min_samples or max_depth base case).
    if yes_answer == no_answer:
        return yes_answer

    return {question: [yes_answer, no_answer]}

In [14]:
def _matches_category(observed, value):
    """Return True when ``observed`` equals the category written in a question.

    ``value`` is the category as text, taken from the question string. An exact
    text match always counts. Non-string observations are also compared
    numerically, so an integer ``1`` matches the text ``"1.0"``.
    """
    if str(observed) == value:
        return True
    if isinstance(observed, str):
        # Text categories must match exactly; never reinterpret them as numbers.
        return False
    try:
        return float(observed) == float(value)
    except (TypeError, ValueError):
        return False


def classify_example(example, tree):
    """Classify a single example by walking it down a decision tree.

    Parameters
    ----------
    example : mapping
        Anything indexable by feature name, such as a DataFrame row or a dict.
    tree : dict or scalar
        Tree as returned by ``decision_tree_algorithm``: nested
        ``{question: [yes_subtree, no_subtree]}`` dicts with labels at the
        leaves. A bare label (a tree that is a single leaf) is returned as-is.

    Returns
    -------
    The label stored in the leaf the example ends up in.

    Raises
    ------
    ValueError
        If a question does not consist of exactly three space-separated parts
        (feature name, operator, value).
    """
    node = tree

    # Descend until a leaf, i.e. anything that is not a dict, is reached.
    while isinstance(node, dict):
        question = list(node.keys())[0]
        feature_name, comparison_operator, value = question.split(" ")
        observed = example[feature_name]

        if comparison_operator == "<=":  # feature is continuous
            answered_yes = observed <= float(value)
        else:  # feature is categorical
            answered_yes = _matches_category(observed, value)

        yes_branch, no_branch = node[question][0], node[question][1]
        node = yes_branch if answered_yes else no_branch

    return node

In [15]:
def find_test(df_test, tree):
    """Predict a label for every row of ``df_test`` using ``tree``.

    Each row is passed to ``classify_example``. The predictions are written
    into a ``"cardio"`` column on ``df_test`` itself (the frame is modified in
    place) and the same frame is returned.
    """
    row_predictions = df_test.apply(classify_example, axis=1, args=(tree,))
    df_test["cardio"] = row_predictions
    return df_test

In [16]:
# Load the competition rows to predict. This file has an `id` column and no
# `cardio` label; the label is filled in later by `find_test`.
df_test = pd.read_csv(r"/kaggle/input/decision-treesrandom-forests-2020/test.csv")
df_test

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,16036,1,165,125.0,160,100,1,2,0,0,0
1,1,17004,1,165,120.0,100,75,2,1,0,0,1
2,2,17547,1,151,58.0,110,60,2,1,0,0,1
3,3,20562,1,172,70.0,130,90,1,1,0,0,1
4,4,18759,1,168,64.0,110,80,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,19542,2,165,68.0,150,90,1,1,0,0,1
996,996,21153,1,148,79.0,140,80,2,2,0,0,1
997,997,19673,1,170,82.0,120,80,1,1,0,0,0
998,998,19662,2,170,67.0,90,60,1,1,0,0,0


In [17]:
# Seed Python's `random` module so the random hold-out split is reproducible.
random.seed(0)

# Hold out 30% of the labelled rows and grow the tree on the remainder.
# NOTE(review): `test_df` is not used again in this notebook, so the held-out
# rows are never scored. Confirm whether a validation step is missing.
train_df, test_df = train_test_split(df, test_size=0.3)
tree = decision_tree_algorithm(train_df, min_samples=50, max_depth=8)

# Predict `cardio` for the competition rows (adds the column to `df_test`).
df_test = find_test(df_test, tree)

pprint(tree)

{'ap_hi <= 120.0': [{'age <= 19560.0': [{'cholesterol = 1.0': [{'age <= 15994.0': [0.0,
                                                                                   {'weight <= 98.0': [{'age <= 16034.0': [1.0,
                                                                                                                           {'ap_hi <= 100.0': [{'ap_hi <= 80.0': [1.0,
                                                                                                                                                                  0.0]},
                                                                                                                                               0.0]}]},
                                                                                                       1.0]}]},
                                                               {'weight <= 56.0': [0.0,
                                                                                   {'ap_lo <= 60

In [18]:
# The tree's leaves are floats (0.0 / 1.0 in the printed tree above), so cast
# the predicted labels to integers before building the submission.
df_test['cardio'] = df_test['cardio'].astype(int)
df_test

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,16036,1,165,125.0,160,100,1,2,0,0,0,1
1,1,17004,1,165,120.0,100,75,2,1,0,0,1,1
2,2,17547,1,151,58.0,110,60,2,1,0,0,1,0
3,3,20562,1,172,70.0,130,90,1,1,0,0,1,1
4,4,18759,1,168,64.0,110,80,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,19542,2,165,68.0,150,90,1,1,0,0,1,1
996,996,21153,1,148,79.0,140,80,2,2,0,0,1,1
997,997,19673,1,170,82.0,120,80,1,1,0,0,0,0
998,998,19662,2,170,67.0,90,60,1,1,0,0,0,0


In [19]:
# Keep only the row identifier and the predicted label, in that column order,
# and write them without the DataFrame index.
output = df_test[['id', 'cardio']].copy()
output.to_csv('sample_submission.csv', index=False)