Import Libraries

In [28]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from pprint import pprint
from sklearn.metrics import confusion_matrix

Functions

In [2]:
def preprocess(data, encoder):
  if "Cabin" in data:
    data = data.drop(['Cabin'], axis=1)
  if "Fare" in data:
    data = data.drop(['Fare'], axis=1)
  if "Embarked" in data:
    data = data.drop(['Embarked'], axis=1)
  data['Age'] = data['Age'].fillna(data['Age'].median())
  data['Sex'] = encoder.fit_transform(data['Sex'])
  data = pd.get_dummies(data)
  return data

In [3]:
def Read_Data():
  train = pd.read_csv('titanic-train.csv')
  test = pd.read_csv('titanic-test.csv')
  return train, test

In [4]:
def Entropy(s):
    entropy = 0
    value, counts = np.unique(s, return_counts=True)
    Median = counts.astype('float') / len(s)
    for median in Median:
        if median != 0.0:
            entropy -= median * np.log2(median)
    return entropy

In [5]:
def InfoGain(data, split_attribute_name, target_name="Survived"):
    total_entropy = Entropy(data[target_name])
    vals, counts= np.unique(data[split_attribute_name],return_counts=True)
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*Entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

In [20]:
def ID3(data, originaldata, features, max_depth, i, target_attribute_name="Survived", parent_node_class = None):
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    elif len(data)==0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name],return_counts=True)[1])]  
    elif len(features) ==0:
        return parent_node_class
    else:
      if i != max_depth:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name],return_counts=True)[1])]
        item_values = [InfoGain(data,feature,target_attribute_name) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        tree = {best_feature:{}}
        features = [i for i in features if i != best_feature]
        for value in np.unique(data[best_feature]):
            value = value
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = ID3(sub_data,data,features,max_depth, i+1,target_attribute_name,parent_node_class)
            tree[best_feature][value] = subtree
        return(tree)

In [35]:
def predict(query, tree, default = 1):
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
                result = tree[key][query[key]] 
            except:
                return default
            result = tree[key][query[key]]
            if isinstance(result,dict):
                return predict(query,result)
            else:
                return result

In [37]:
def test_(data,tree):
    queries = data.iloc[:,:-1].to_dict(orient = "records")
    predicted = pd.DataFrame(columns=["predicted"]) 
    for i in range(len(data)):
        predicted.loc[i,"predicted"] = predict(queries[i], tree, 1.0) 
    return (np.sum(predicted["predicted"] == data["Survived"])/len(data))*100

Read Data as Panda dataframe

In [9]:
train, test = Read_Data()

Preprocess Data

In [10]:
encoder = preprocessing.LabelEncoder()
train = preprocess(train, encoder)
test = preprocess(test, encoder)

In [11]:
new_cols = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Survived"]
train = train[new_cols]

In [12]:
train_data = train.iloc[:712,:].reset_index(drop=True)
test_data = train.iloc[712:,:].reset_index(drop=True)

Desicion Tree

In [40]:
Depth = [3, 4, 5]
for i in Depth:
  tree = ID3(train_data,train_data,train_data.columns[:-1], i, 0)
  pprint(tree)
  print(f'Accuracy with Depth {i} is: {test_(test_data, tree)}%')

{'Sex': {0: {'Age': {0.75: 1.0,
                     1.0: 1.0,
                     2.0: {'Parch': {1.0: None, 2.0: 0.0}},
                     3.0: {'Pclass': {2.0: 1.0, 3.0: 0.0}},
                     4.0: 1.0,
                     5.0: 1.0,
                     7.0: 1.0,
                     8.0: {'Pclass': {2.0: 1.0, 3.0: 0.0}},
                     9.0: 0.0,
                     10.0: 0.0,
                     11.0: 0.0,
                     13.0: 1.0,
                     14.0: {'SibSp': {0.0: 0.0, 1.0: 1.0}},
                     14.5: 0.0,
                     15.0: 1.0,
                     16.0: {'SibSp': {0.0: 1.0, 5.0: 0.0}},
                     17.0: {'Pclass': {1.0: 1.0, 2.0: 1.0, 3.0: None}},
                     18.0: {'Pclass': {1.0: 1.0, 2.0: 1.0, 3.0: None}},
                     19.0: 1.0,
                     20.0: 0.0,
                     21.0: {'Pclass': {1.0: 1.0, 2.0: 1.0, 3.0: None}},
                     22.0: {'Pclass': {1.0: 1.0, 2.0: 1.0, 3.0: None}},
 