# Implementation of the ID3 Decision Tree Algorithm to Classify the Titanic Dataset

In [None]:
# If the file fails to run, please contact me (jovi.wang@team.telstra.com)
import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# /kaggle/input/titanic/train.csv
# /kaggle/input/titanic/test.csv
# /kaggle/input/titanic/gender_submission.csv      

# About the dataset
Titanic - Machine Learning from Disaster

Predict survival on the Titanic


```python
raw_train.info()
```

RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):

|    | column      | non-null count | Dtype |
| -- | ----------- | -------------- | ----- |
| 0  | PassengerId | 891 non-null   | int64 |
| 1  | Survived    | 891 non-null   | int64 |
| 2  | Pclass      | 891 non-null   | int64 |
| 3  | Name        | 891 non-null   | object |
| 4  | Sex         | 891 non-null   | object |
| 5  | Age         | 714 non-null   | float64 |
| 6  | SibSp       | 891 non-null   | int64 |
| 7  | Parch       | 891 non-null   | int64 |
| 8  | Ticket      | 891 non-null   | object |
| 9  | Fare        | 891 non-null   | float64 |
| 10  | Cabin      | 204 non-null   | object |
| 11  | Embarked   | 889 non-null   | object |

## column variable notes:

Survived: 1 = Yes, 0 = No

Pclass: 1 = Upper, 2 = Middle, 3 = Lower

SibSp: number of siblings and spouses

Parch: number of parents and children



# Preparing dataset 
* remove columns (Name, Ticket and Cabin) that are unlikely to contribute to the model training process
* convert numeric columns (Age, Fare) into categorical groups
* drop rows with missing values
* PassengerId will be used when validating the survival result

In [None]:
def pre_process_data(data):
    """
    Clean a Titanic DataFrame and turn its numeric columns into categories.

    Steps, each applied only when the relevant column is present:
      1. drop the Name, Ticket and Cabin columns;
      2. recode Survived as 'Yes' (value 1) or 'No' (anything else);
      3. add Age_group  ('Children' < 15 <= 'Youth' < 25 <= 'Adults' < 65 <= 'Seniors');
      4. add Fare_group ('Low_fare' < 8.05 <= 'Medium_fare' < 33 <= 'High_fare');
      5. drop the original Age and Fare columns;
      6. drop every row that still contains a missing value.

    A missing Age or Fare yields a missing group label, so the row is removed
    in step 6 instead of being assigned to a bucket.

    :param data: raw DataFrame (train, test or submission layout)
    :return: a new, cleaned DataFrame; the input frame is left untouched
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    data = data.copy()

    # 1. based on the attribute description, remove some less significant columns
    unused = [col for col in ('Name', 'Ticket', 'Cabin') if col in data.columns]
    data = data.drop(columns=unused)

    # 2. rename the survival flag: '1' -> 'Yes', everything else -> 'No'
    if 'Survived' in data.columns:
        survived = data['Survived'].astype(str) == '1'
        data['Survived'] = np.where(survived, 'Yes', 'No')

    # 3. Age_group, based on age range (ref: Age Categories, Life Cycle Groupings).
    # right=False makes every bin left-closed, i.e. [15, 25) is 'Youth'.
    if 'Age' in data.columns:
        data['Age_group'] = pd.cut(
            data['Age'],
            bins=[-np.inf, 15, 25, 65, np.inf],
            labels=['Children', 'Youth', 'Adults', 'Seniors'],
            right=False,
        ).astype(object)  # plain labels, not a Categorical with unused levels

    # 4. Fare_group, split at the 25% (8.05) and 75% (33.0) quartiles taken
    # from raw_train['Fare'].describe():
    #     count    712.000000
    #     mean      34.567251
    #     std       52.938648
    #     min        0.000000
    #     25%        8.050000
    #     50%       15.645850
    #     75%       33.000000
    #     max      512.329200
    if 'Fare' in data.columns:
        data['Fare_group'] = pd.cut(
            data['Fare'],
            bins=[-np.inf, 8.05, 33, np.inf],
            labels=['Low_fare', 'Medium_fare', 'High_fare'],
            right=False,
        ).astype(object)

    # 5. remove Age and Fare, since they are re-categorised into the group columns
    replaced = [col for col in ('Age', 'Fare') if col in data.columns]
    data = data.drop(columns=replaced)

    # 6. remove any rows with null values
    # raw_train - missing 177 Age, 2 Embarked values
    # raw_test - missing 86 Age values
    return data.dropna()

# Read the three Kaggle CSV files: training set, test set and the reference
# submission that is later used to score predictions.
raw_train, raw_test, raw_test_result = map(
    pd.read_csv,
    (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
        "/kaggle/input/titanic/gender_submission.csv",
    ),
)
# raw_test.info()

# Push every raw frame through the same cleaning routine.
df_train, df_test, df_test_result = map(
    pre_process_data,
    (raw_train, raw_test, raw_test_result),
)

# Display the cleaned training frame as the cell output.
df_train

# Core function and class
* utility function to calculate entropy (**compute_entropy**) and information gain (**compute_info_gain**)
* **TreeNode** class to simulate the tree-like model of decisions (root node, leaf node, paths from root to leaf)
* methods of the TreeNode class to train a decision tree model using the ID3 algorithm (**fit**), predict a decision (**predict**) and print the tree model structure (**pretty_print**) 

In [None]:
def compute_entropy(y):
    """
    Compute the Shannon entropy (in bits) of a discrete sample.

    :param y: The data samples of a discrete distribution (a pandas Series,
              or anything pandas.Series() accepts)
    :return: the entropy as a non-negative float; 0.0 for fewer than two samples
    """
    if len(y) < 2:  # a trivial case
        return 0.0
    # value_counts(normalize=True) gives the relative frequency (0..1) of
    # each distinct value.
    freq = pd.Series(y).value_counts(normalize=True).to_numpy()
    # Keep strictly positive frequencies only: a zero frequency (possible with
    # categorical data) contributes nothing to the sum, and dropping it makes
    # log2 safe without adding an epsilon that would bias the result.
    freq = freq[freq > 0]
    # "+ 0.0" turns the -0.0 produced for a pure sample into a plain 0.0.
    return float(-(freq * np.log2(freq)).sum()) + 0.0

    
def compute_info_gain(samples, attr, target, silent = False):
    """
    Compute the information gain obtained by splitting on one attribute.

    The gain is the entropy of ``target`` minus the weighted sum of the target
    entropies inside each subgroup that shares a value of ``attr``.

    :param samples: DataFrame of observations (e.g. X)
    :param attr: name of the column in ``samples`` to evaluate (e.g. 'Age_group')
    :param target: Series of target values indexed like ``samples`` (e.g. y)
    :param silent: when False, print the intermediate values
    :return: target entropy minus the weighted subgroup entropy
    """
    if not silent:
        print(f'evaluate attr: {attr}', target)

    # relative frequency of every value of the attribute, e.g.
    # Adults 0.594101, Youth 0.280899, Children 0.109551, Seniors 0.015449
    values = samples[attr].value_counts(normalize=True)

    # total entropy (entropy of target)
    target_ent = compute_entropy(target)

    # weighted sum of the subgroup entropies
    sum_sub_ent = 0
    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for v, freq in values.items():
        # boolean mask of the samples whose attribute equals v
        index = samples[attr] == v
        # target entropy inside that subgroup, weighted by the subgroup's share
        sub_ent = compute_entropy(target[index])
        sum_sub_ent += freq * sub_ent

    # information gain = total entropy - split entropy
    if not silent:
        print(f'    {attr}: target_ent: {target_ent} sum_sub_ent: {sum_sub_ent}')
        print(f'    infomation gain: {target_ent - sum_sub_ent}')
        print('')
    return target_ent - sum_sub_ent

class TreeNode:
    """
    A recursively defined data structure to store an ID3 decision tree.

    A node is either a leaf (``decision`` is set) or an internal node
    (``split_feat_name`` is set and ``children`` maps each value of that
    feature seen during training to a child TreeNode).
    """
    def __init__(self):
        self.children = {}  # feature value -> child TreeNode
        self.decision = None  # target value for a leaf; None while undecided
        self.split_feat_name = None  # name of the splitting feature

    def pretty_print(self, prefix=''):
        """
        Print one line per leaf, describing the path from this node to it.

        :param prefix: text describing the path taken to reach this node
        """
        if self.split_feat_name is not None:
            for k, v in self.children.items():
                v.pretty_print(f"{prefix} :When {self.split_feat_name} is {k}")
        else:
            print(f"{prefix}:{self.decision}")

    def predict(self, sample):
        """
        Walk the tree for one sample, printing every test that is applied.

        :param sample: mapping from feature name to value (e.g. a DataFrame row)
        :return: the decision of the leaf that is reached, or 'Unknown' when the
                 sample holds a feature value that has no branch in the tree
        """
        # exit condition for recursion
        if self.decision is not None:
            print("Decision:", self.decision)
            return self.decision

        attr_val = sample[self.split_feat_name]
        if attr_val not in self.children:
            # the tree has never seen this value for the splitting feature
            return 'Unknown'

        print("Testing ", self.split_feat_name, "->", attr_val)
        return self.children[attr_val].predict(sample)

    def fit(self, X, y):
        """
        The function accepts a training dataset, from which it builds the tree
        structure to make decisions or to make children nodes (tree branches)
        to do further inquiries

        :param X: [n * p] n observed data samples of p attributes
        :param y: [n] target values
        """
        # Start from a clean node so that re-fitting never mixes the new
        # result with a decision, split or children left by an earlier fit.
        self.children = {}
        self.decision = None
        self.split_feat_name = None

        if len(X) == 0:
            # If the data is empty when this node is arrived,
            # we just make an arbitrary decision
            self.decision = "Yes"
            return

        # if remaining data all have same target value, that value is the decision
        classes = y.unique()
        if len(classes) == 1:
            print("all target values are unique, make decision!\n")
            self.decision = classes[0]
            return

        # Examine each attribute and keep the one with the biggest information
        # gain; the strict ">" keeps the first attribute on ties.
        info_gain_max = 0
        for a in X.columns:
            aig = compute_info_gain(X, a, y, silent = True)
            if aig > info_gain_max:
                info_gain_max = aig
                self.split_feat_name = a

        # when no attribute has a positive information gain the node becomes
        # a leaf whose decision is the simple majority in y
        if self.split_feat_name is None:
            print("after evaluate each attribute, split_feat_name is None and IG_max is 0, make decision!\n")
            self.decision = y.value_counts().index.tolist()[0]
            return

        print(f"Split by {self.split_feat_name}, IG: {info_gain_max:.6f}")
        print()
        # one child per distinct value of the chosen feature, trained on the
        # rows that carry that value
        for v in X[self.split_feat_name].unique():
            print(f'loop through feature {self.split_feat_name} unique values')
            index = X[self.split_feat_name] == v
            self.children[v] = TreeNode()
            self.children[v].fit(X[index], y[index])

# Build tree models and evaluate performance

## Tree_1 build 

In [None]:
# Tree 1: train on every available attribute.
target = df_train["Survived"]
attrs_1 = [
    'Pclass', 'Sex', 'SibSp', 'Parch',
    'Embarked', 'Age_group', 'Fare_group',
]
data_1 = df_train.loc[:, attrs_1]

tree_1 = TreeNode()
tree_1.fit(data_1, target)
# tree_1.pretty_print(prefix = 'tree_1')
# end up getting a big tree with many branches

## Tree_2 build

In [None]:
# Tree 2: train on fewer attributes (SibSp and Parch are left out).
target = df_train["Survived"]
attrs_2 = [
    'Pclass', 'Sex', 'Embarked',
    'Age_group', 'Fare_group',
]
data_2 = df_train.loc[:, attrs_2]

tree_2 = TreeNode()
tree_2.fit(data_2, target)
# tree_2.pretty_print(prefix='tree_2')

In [None]:
# Tree 3: train on the smallest attribute set
# (SibSp, Parch and Embarked are left out).
target = df_train["Survived"]
attrs_3 = [
    'Pclass', 'Sex',
    'Age_group', 'Fare_group',
]
data_3 = df_train.loc[:, attrs_3]

tree_3 = TreeNode()
tree_3.fit(data_3, target)
# tree_3.pretty_print(prefix='tree_3')

# Evaluate model performance using test dataset

In [None]:
# Score the three trees on the test set: a prediction counts as correct when
# it equals the reference label stored for the same PassengerId.
# NOTE(review): the reference labels come from gender_submission.csv; confirm
# that file holds real outcomes rather than a baseline prediction.

# Build the PassengerId -> reference label lookup once, instead of scanning
# df_test_result again for every test row.
expected_by_id = dict(zip(df_test_result['PassengerId'], df_test_result['Survived']))

trees = (tree_1, tree_2, tree_3)
hits = [0] * len(trees)

for _, row in df_test.iterrows():
    # None when the passenger has no reference label; such a row is never
    # counted as correct (an empty comparison used to count as a match).
    expected = expected_by_id.get(row['PassengerId'])
    for slot, tree in enumerate(trees):
        # predicted value is Yes, No or Unknown
        predicted = tree.predict(row)
        if expected is not None and predicted == expected:
            hits[slot] += 1

count_1, count_2, count_3 = hits

print()
print(f'accurate rate for t1 is: {count_1/len(df_test)}')
print(f'accurate rate for t2 is: {count_2/len(df_test)}')
print(f'accurate rate for t3 is: {count_3/len(df_test)}')
# one result is like below
# accurate rate for t1 is: 0.7349397590361446
# accurate rate for t2 is: 0.7530120481927711
# accurate rate for t3 is: 0.8734939759036144