In [81]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import accuracy_score
from collections import Counter

#Decision Tree

In [82]:
# Decision Tree Implementation
class DecisionTree:
    def __init__(self, criterion='gini', max_depth=5, min_samples_split=2, min_samples_leaf=1):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def fit(self, X, y, sample_weight=None):
        if sample_weight is None:
            sample_weight = np.ones(len(y)) / len(y)
        self.tree = self._grow_tree(X, y, sample_weight, depth=0)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _gini(self, y, sample_weight):
        m = np.sum(sample_weight)
        return 1.0 - sum((np.sum(sample_weight[y == c]) / m) ** 2 for c in np.unique(y))

    def _entropy(self, y, sample_weight):
        m = np.sum(sample_weight)
        probs = np.array([np.sum(sample_weight[y == c]) / m for c in np.unique(y)])
        return -np.sum([p * np.log2(p) for p in probs if p > 0])

    def _information_gain(self, y, y_left, y_right, weight_left, weight_right):
        if self.criterion == 'gini':
            impurity = self._gini
        else:
            impurity = self._entropy
        total_weight = np.sum(weight_left) + np.sum(weight_right)
        return impurity(y, np.concatenate([weight_left, weight_right])) - (
            (np.sum(weight_left) / total_weight) * impurity(y_left, weight_left)
            + (np.sum(weight_right) / total_weight) * impurity(y_right, weight_right)
        )

    def _best_split(self, X, y, sample_weight):
        best_gain, best_split = 0, None
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_mask = X[:, feature] <= threshold
                right_mask = ~left_mask
                if np.sum(left_mask) < self.min_samples_leaf or np.sum(right_mask) < self.min_samples_leaf:
                    continue
                y_left, y_right = y[left_mask], y[right_mask]
                weight_left, weight_right = sample_weight[left_mask], sample_weight[right_mask]
                gain = self._information_gain(y, y_left, y_right, weight_left, weight_right)
                if gain > best_gain:
                    best_gain, best_split = gain, (feature, threshold)
        return best_split

    def _grow_tree(self, X, y, sample_weight, depth):
        if len(set(y)) == 1 or depth == self.max_depth or len(y) < self.min_samples_split:
            return Counter(y).most_common(1)[0][0]
        split = self._best_split(X, y, sample_weight)
        if split is None:
            return Counter(y).most_common(1)[0][0]
        feature, threshold = split
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask
        return {
            'feature': feature,
            'threshold': threshold,
            'left': self._grow_tree(X[left_mask], y[left_mask], sample_weight[left_mask], depth + 1),
            'right': self._grow_tree(X[right_mask], y[right_mask], sample_weight[right_mask], depth + 1)
        }

    def _traverse_tree(self, x, node):
        if isinstance(node, dict):
            return self._traverse_tree(x, node['left'] if x[node['feature']] <= node['threshold'] else node['right'])
        return node


#Random Forest

In [83]:
# Random Forest Implementation
class RandomForest:
    def __init__(self, classifier, num_trees=10, min_features=2):
        self.num_trees = num_trees
        self.min_features = min_features
        self.trees = [classifier() for _ in range(num_trees)]

    def fit(self, X, y):
        self.feature_indices = []
        for tree in self.trees:
            sample_indices = np.random.choice(len(X), len(X), replace=True)
            num_features = np.random.randint(self.min_features, X.shape[1] + 1)
            features = np.random.choice(range(X.shape[1]), num_features, replace=False)
            self.feature_indices.append(features)
            tree.fit(X[sample_indices][:, features], y[sample_indices])

    def predict(self, X):
        predictions = np.array([tree.predict(X[:, features]) for tree, features in zip(self.trees, self.feature_indices)])
        return np.array([Counter(predictions[:, i]).most_common(1)[0][0] for i in range(X.shape[0])])

#AdaBoost

In [84]:
# AdaBoost Implementation
class AdaBoost:
    def __init__(self, weak_learner, num_learners=50, learning_rate=1):
        self.num_learners = num_learners
        self.learning_rate = learning_rate
        self.learners = []
        self.alphas = []
        self.weak_learner = weak_learner

    def fit(self, X, y):
        n = len(y)
        weights = np.ones(n) / n
        for _ in range(self.num_learners):
            learner = self.weak_learner(max_depth=1)
            learner.fit(X, y, sample_weight=weights)
            predictions = learner.predict(X)
            err = np.sum(weights * (predictions != y)) / np.sum(weights)
            if err >= 0.5:
                break
            if err == 0:
                alpha = 1
            else:
                alpha = self.learning_rate * np.log((1 - err) / (err + 1e-10))
            weights *= np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)
            self.learners.append(learner)
            self.alphas.append(alpha)

    def predict(self, X):
        weak_preds = np.array([alpha * learner.predict(X) for alpha, learner in zip(self.alphas, self.learners)])
        return np.sign(np.sum(weak_preds, axis=0))


##Import Titanic dataset

In [85]:
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

# Store our test passenger IDs for easy access
PassengerId = test['PassengerId']

# Showing overview of the train dataset
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


##Preprocessing

In [86]:
original_train = train.copy()

# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings
full_data = [train, test]

# Feature that tells whether a passenger had a cabin on the Titanic
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Remove all NULLS in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Remove all NULLS in the Fare column
for dataset in full_data:
      dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

# Remove all NULLS in the Age column
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # Next line has been improved to avoid warning
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

    # Mapping titles
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] ;

In [87]:
# Feature selection: remove variables no longer containing relevant information
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
test  = test.drop(drop_elements, axis = 1)

In [88]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,1,1,0,0,0,0,2,0,1
1,1,1,0,2,0,3,1,1,2,0,3
2,1,3,0,1,0,1,0,0,1,1,4
3,1,1,0,2,0,3,0,1,2,0,3
4,0,3,1,2,0,1,0,0,1,1,1


In [89]:
test.head()

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,3,1,2,0,0,2,0,1,1,1
1,3,0,2,0,0,0,0,2,0,3
2,2,1,3,0,1,2,0,1,1,1
3,3,1,1,0,1,0,0,1,1,1
4,3,0,1,1,1,0,0,3,0,3


In [90]:
accuracy_results = {}

# Split data
X = train.drop(columns=["Survived"]).values
y = train["Survived"].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Decision Tree": DecisionTree(max_depth=5, min_samples_split=4, min_samples_leaf=2),
    "Random Forest": RandomForest(classifier=DecisionTree, num_trees=10, min_features=2),
    "AdaBoost": AdaBoost(weak_learner=DecisionTree, num_learners=10, learning_rate=0.1)
}

# Train models and printing accuracies
for name, model in models.items():
    model.fit(X_train, y_train)
    train_accuracy = np.mean(model.predict(X_train) == y_train)
    val_accuracy = np.mean(model.predict(X_val) == y_val)

    # Storing accuracy results for table
    accuracy_results[name] = {'Train Accuracy': train_accuracy, 'Validation Accuracy': val_accuracy}

    print('--------------------------------------------------------')
    print(f'{name} Training Accuracy: {train_accuracy:.4f}')
    print(f'{name} Validation Accuracy: {val_accuracy:.4f}')
print('--------------------------------------------------------')

--------------------------------------------------------
Decision Tree Training Accuracy: 0.8427
Decision Tree Validation Accuracy: 0.8212
--------------------------------------------------------
Random Forest Training Accuracy: 0.8553
Random Forest Validation Accuracy: 0.8045
--------------------------------------------------------
AdaBoost Training Accuracy: 0.7823
AdaBoost Validation Accuracy: 0.7821
--------------------------------------------------------


In [91]:
# Impurity metrics for Decision Tree
print("------------------------------------------------------------------------------------------------------")
print("Gini Impurity, Entropy, and Information Gain:")
print("Using Gini as it gives better results, but showing Entropy and information gain for reference\n")

sample_weight = np.ones(len(y_train)) / len(y_train)  # Equal weights
first_tree = models["Decision Tree"]  # Use Decision Tree model for calculation
split = first_tree._best_split(X_train, y_train, sample_weight)

if split is not None:
    feature, threshold = split
    left_mask = X_train[:, feature] <= threshold
    y_left, y_right = y_train[left_mask], y_train[~left_mask]
    weight_left, weight_right = sample_weight[left_mask], sample_weight[~left_mask]

    gini_impurity = first_tree._gini(y_train, sample_weight)
    entropy_impurity = first_tree._entropy(y_train, sample_weight)
    information_gain = first_tree._information_gain(y_train, y_left, y_right, weight_left, weight_right)

    print(f"  Gini Impurity: {gini_impurity:.4f}")
    print(f"  Entropy Impurity: {entropy_impurity:.4f}")
    print(f"  Information Gain: {information_gain:.4f}\n")
else:
    print("No valid split found for impurity calculations.\n")
print("------------------------------------------------------------------------------------------------------")

------------------------------------------------------------------------------------------------------
Gini Impurity, Entropy, and Information Gain:
Using Gini as it gives better results, but showing Entropy and information gain for reference

  Gini Impurity: 0.4694
  Entropy Impurity: 0.9555
  Information Gain: 0.1400

------------------------------------------------------------------------------------------------------


##Test Data Predictions

Importing the actual values from gender_submission.csv file in-order to compare our predictions

In [92]:
# Predict on Test Data
at = pd.read_csv('/content/gender_submission.csv')
actual = at["Survived"].values

In [93]:
f_test = test.to_numpy()
test_predictions = {}
for name, model in models.items():
    y_test_pred = model.predict(f_test)  # Predict on test data
    test_accuracy = np.mean(y_test_pred == actual)  # Check the accuracy based on actual data
    print(f'{name} Test Accuracy: {test_accuracy:.4f}')
    accuracy_results[name]['Test Accuracy'] = test_accuracy
    test_predictions[name] = y_test_pred

Decision Tree Test Accuracy: 0.8565
Random Forest Test Accuracy: 0.9234
AdaBoost Test Accuracy: 0.9378


In [94]:
# creating dataframe to side by side compare
for name, prediction in test_predictions.items():
    print('-----------------------')
    print(f'{name}:')
    print('-----------------------')
    df = pd.DataFrame({'Actual': actual, 'Predicted': prediction})
    print(df.head())  # check first 5 values
    print('-----------------------')
    print('\n')

-----------------------
Decision Tree:
-----------------------
   Actual  Predicted
0       0          0
1       1          0
2       0          0
3       0          0
4       1          0
-----------------------


-----------------------
Random Forest:
-----------------------
   Actual  Predicted
0       0          0
1       1          1
2       0          0
3       0          0
4       1          1
-----------------------


-----------------------
AdaBoost:
-----------------------
   Actual  Predicted
0       0        0.0
1       1        1.0
2       0        0.0
3       0        0.0
4       1        1.0
-----------------------




##Tabular comparision

In [95]:
from tabulate import tabulate

table_data = [[model, acc.get('Train Accuracy', 'N/A'), acc.get('Validation Accuracy', 'N/A'), acc.get('Test Accuracy', 'N/A')]
              for model, acc in accuracy_results.items()]
headers = ["Model", "Train Accuracy", "Validation Accuracy", "Test Accuracy"]
print(tabulate(table_data, headers=headers, tablefmt="grid"))


+---------------+------------------+-----------------------+-----------------+
| Model         |   Train Accuracy |   Validation Accuracy |   Test Accuracy |
| Decision Tree |         0.842697 |              0.821229 |        0.856459 |
+---------------+------------------+-----------------------+-----------------+
| Random Forest |         0.855337 |              0.804469 |        0.923445 |
+---------------+------------------+-----------------------+-----------------+
| AdaBoost      |         0.782303 |              0.782123 |        0.937799 |
+---------------+------------------+-----------------------+-----------------+


Based on optimal hyper-parameters Ada-boost gives best prediction. Whereas Random-forest performs best in the training and validation.