# Decision Tree Classification using CHAID - Titanic Survival Prediction

In [1]:
import time
# Record the wall-clock start time of the notebook run; the last cell
# subtracts it from the end time to report the elapsed seconds.
start_time = time.time()

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the training data from train.csv (path relative to the working directory).
train = pd.read_csv(r"train.csv")

In [4]:
# Dimensions of the training frame as (rows, columns).
train.shape

(891, 12)

In [5]:
# Preview the last rows of the training frame.
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Number of missing values in each column, most incomplete columns first.
train.isna().sum().sort_values(ascending=False)

Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
dtype: int64

In [8]:
# Load the unlabeled test data from test.csv (path relative to the working directory).
test = pd.read_csv(r"test.csv")

In [9]:
# Dimensions of the test frame as (rows, columns).
test.shape

(418, 11)

In [10]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Data Cleaning

In [11]:
def impute_age(cols):
    """Return a passenger's age, substituting a per-class default when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass). When called through
        ``DataFrame.apply(..., axis=1)`` this is a row Series labelled
        ``['Age', 'Pclass']``.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Read the two values by position. On a label-indexed Series, ``cols[0]``
    # is positional access that newer pandas deprecates, so use ``.iloc`` when
    # it exists and fall back to plain indexing for lists/tuples/arrays.
    if hasattr(cols, "iloc"):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Hard-coded fallback ages per passenger class.
    # NOTE(review): presumably typical ages per class; they are not derived
    # anywhere in this notebook - confirm against the data.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [12]:
# Fill missing ages row by row: impute_age receives each row's (Age, Pclass) pair.
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)

In [13]:
# Replace the missing Embarked entries (2 per the null count above) with 'S'.
train['Embarked'] = train['Embarked'].fillna('S')


In [14]:
# Remove the Cabin column (687 of its values are missing per the null count above).
train = train.drop(columns='Cabin')

In [15]:
# Preview the frame after imputation and dropping Cabin.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [16]:
# Discard any row that still contains a missing value.
train = train.dropna()

# Converting Categorical Features

In [17]:
# Column dtypes and non-null counts, to see which columns are still non-numeric.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [18]:
# One-hot encode Sex and Embarked. drop_first=True drops the first category
# of each column so the remaining indicators are not redundant.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns bool
# columns by default, and mixing bool with numeric columns makes the later
# conversion to a NumPy feature matrix fall back to object dtype.
sex = pd.get_dummies(train['Sex'], drop_first=True, dtype=int)
embark = pd.get_dummies(train['Embarked'], drop_first=True, dtype=int)

In [19]:
# Remove the text columns: Sex and Embarked were one-hot encoded in the
# previous cell; Name and Ticket are dropped without being encoded.
train = train.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'])

In [20]:
# Append the indicator columns (sex, embark) to the frame column-wise.
train = pd.concat([train,sex,embark],axis=1)

In [21]:
# Preview the fully numeric frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


In [22]:
# Build the feature matrix and target vector.
# PassengerId is excluded together with the target: it is a row identifier,
# so thresholds on it only let the tree memorise individual passengers.
X = train.drop(['Survived', 'PassengerId'], axis=1)
y = train['Survived']
# Force a float matrix so every column is numeric; a frame mixing bool and
# numeric columns would otherwise convert to an object-dtype array.
X = X.to_numpy(dtype=float)
y = y.to_numpy()

In [23]:
import numpy as np

class Node:
    """One node of a binary decision tree grown with a chi-squared-style split score.

    A node stores the samples that reached it (``X``, ``y``). After ``split()``
    it is either an internal node (``feature``, ``threshold``, ``left`` and
    ``right`` are set) or a leaf (``value`` is set to the mean of ``y``).
    """

    def __init__(self, X, y):
        """Store the samples routed to this node; no splitting happens here.

        X is indexed as a 2-D array (samples x features) and y as a 1-D array
        of targets aligned with the rows of X.
        """
        self.X = X
        self.y = y
        self.feature = None    # column index used by the split (internal nodes)
        self.threshold = None  # rows with X[:, feature] <= threshold go left
        self.left = None
        self.right = None
        self.value = None      # mean of y, set only when the node is a leaf

    def split(self):
        """Find the best threshold split and create the two child nodes.

        Every distinct value of every feature is tried as a threshold. The
        candidate with the HIGHEST ``chaid_score`` is kept, because a larger
        score means the two branches differ more in their share of positives.
        The node becomes a leaf (``value = y.mean()``) when its targets are all
        identical or when no threshold yields two non-empty branches.
        """
        best_feature = None
        best_threshold = None
        best_score = None

        # A node whose targets are all the same cannot be improved by a split,
        # so skip the search entirely and fall through to the leaf branch.
        is_pure = np.unique(self.y).size <= 1
        n_features = 0 if is_pure else self.X.shape[1]

        for i in range(n_features):
            column = self.X[:, i]
            for t in np.unique(column):
                goes_left = column <= t
                goes_right = column > t

                # A split that leaves one side empty separates nothing.
                if not goes_left.any() or not goes_right.any():
                    continue

                score = self.chaid_score(self.y[goes_left], self.y[goes_right])

                # Keep the most discriminating split; ties keep the first found.
                if best_score is None or score > best_score:
                    best_score = score
                    best_feature = i
                    best_threshold = t

        if best_feature is None:
            self.value = self.y.mean()
        else:
            self.feature = best_feature
            self.threshold = best_threshold
            goes_left = self.X[:, best_feature] <= best_threshold
            goes_right = self.X[:, best_feature] > best_threshold
            self.left = Node(self.X[goes_left], self.y[goes_left])
            self.right = Node(self.X[goes_right], self.y[goes_right])

    def chaid_score(self, left_y, right_y):
        """Score a candidate split; larger means the branches differ more.

        The score is the product of two terms:

        * ``n1 * n2 / n`` - the product of the branch sizes divided by the
          total number of samples, a size factor that accounts for how the
          split divides the samples;
        * ``(p1 - p)**2 / p + (p2 - p)**2 / p`` - a chi-squared-style measure
          of how far each branch's proportion of positive targets (``p1``,
          ``p2``) is from the overall proportion ``p``.

        Assumes binary 0/1 targets, since ``sum()`` is used as the count of
        positives. Returns 0.0 when a branch is empty or there are no
        positives at all, where the formula would otherwise divide by zero.
        """
        n1 = len(left_y)
        n2 = len(right_y)
        n = n1 + n2

        left_pos = left_y.sum()
        right_pos = right_y.sum()
        total_pos = left_pos + right_pos

        # Guard the divisions by n1, n2 and p below.
        if n1 == 0 or n2 == 0 or total_pos == 0:
            return 0.0

        p1 = left_pos / n1
        p2 = right_pos / n2
        p = total_pos / n

        chaid = ((n1 * n2) / n) * (np.square(p1 - p) / p + np.square(p2 - p) / p)
        return chaid

    def predict(self, X):
        """Return the leaf value reached by a single sample.

        ``X`` is one feature vector indexed by feature position. A node without
        children answers directly: with its stored ``value`` if it has one, or
        with the mean of its own targets if it was never split (for example
        because a depth limit stopped the growth).
        """
        if self.left is None or self.right is None:
            return self.value if self.value is not None else self.y.mean()
        if X[self.feature] <= self.threshold:
            return self.left.predict(X)
        return self.right.predict(X)

class DecisionTree:
    """Binary decision tree built from ``Node`` objects.

    Parameters
    ----------
    max_depth : int or None
        Largest depth at which a node may still be split, counting the root
        as depth 1. ``None`` (the default) grows the tree until no node can
        be split further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None  # set by fit()

    def fit(self, X, y):
        """Grow the tree on feature matrix ``X`` and target vector ``y``."""
        self.root = Node(X, y)
        self._split_node(self.root, 1)

    def _split_node(self, node, depth):
        """Recursively split ``node``, which sits at the given ``depth``."""
        if self.max_depth is not None and depth > self.max_depth:
            # Depth budget exhausted: freeze the node as a leaf. Without a
            # value the node would have neither children nor a prediction,
            # and predict() would fail when it reached it.
            node.value = node.y.mean()
            return

        node.split()

        # split() either creates both children or turns the node into a leaf.
        for child in (node.left, node.right):
            if child is not None:
                self._split_node(child, depth + 1)

    def predict(self, X):
        """Predict leaf values for one sample or for a batch of samples.

        A 1-D ``X`` is treated as a single feature vector and the value of the
        leaf it reaches is returned. A 2-D ``X`` (samples x features) returns
        a NumPy array with one leaf value per row. Leaf values are the mean of
        the training targets in the leaf; threshold them (e.g. at 0.5) to
        obtain class labels.
        """
        X = np.asarray(X)
        if X.ndim == 1:
            return self.root.predict(X)
        return np.array([self.root.predict(row) for row in X])


# Split the data into train and test sets

In [24]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Shuffle the samples and split them into a training and a test part.

    Parameters
    ----------
    X : array-like
        Samples along the first axis.
    y : array-like
        Targets aligned with ``X``.
    test_size : float
        Fraction of the samples placed in the test part; the count is
        ``int(test_size * len(X))``, i.e. rounded down.
    random_state : int or None
        Seed for the shuffle. Any integer, including 0, gives a reproducible
        split. ``None`` draws from NumPy's global random state.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same number of samples, got {} and {}".format(len(X), len(y))
        )

    # Compare against None so that a seed of 0 is honoured. A private
    # RandomState yields the same permutation as seeding the global generator
    # with that value, without disturbing other users of np.random.
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    shuffle_indices = rng.permutation(len(X))
    n_test = int(test_size * len(X))
    test_indices = shuffle_indices[:n_test]
    train_indices = shuffle_indices[n_test:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Hold out 20% of the rows for evaluation; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# No max_depth is passed, so it keeps its default of None (no depth limit).
classifier = DecisionTree()

In [26]:
# Grow the tree on the training part only.
classifier.fit(X_train,y_train)

In [27]:
y_pred=classifier.predict(y_test)

In [28]:
accuracy = (y_pred == y).mean()
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.62


In [29]:
# record end time
end_time = time.time()

# elapsed wall-clock time since start_time was recorded in the first cell, so
# it covers the whole notebook run (loading, cleaning, fitting and scoring),
# not only the model fit
time_taken = end_time - start_time

# print the time taken
print("Time taken to run the model: {:.2f} seconds".format(time_taken))

Time taken to run the model: 1.97 seconds
