In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:

from DecisionTree import DecisionTree
from RandomForest import RandomForest
from AdaBoost import AdaBoost


## Cleaning Titanic dataset

Copied from Kaggle: https://www.kaggle.com/code/dmilla/introduction-to-decision-trees-titanic-dataset

In [3]:
# Load the two Titanic CSV files from the notebook's working directory.
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")


# Snapshot of the raw training frame taken before any feature engineering
# (not referenced again in the visible cells).
original_train = train.copy()

# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings
# The list holds references (not copies), so column assignments made on
# `dataset` inside the loops below modify `train` and `test` themselves.
full_data = [train, test]

# Seed for the random Age imputation below. Without a fixed seed every
# Restart & Run All produces different imputed ages, hence different features
# and different accuracies further down the notebook.
AGE_IMPUTE_SEED = 42
age_rng = np.random.default_rng(AGE_IMPUTE_SEED)

# Missing fares in BOTH frames are filled with the training-set median.
# Computed once up front: it is loop-invariant, and filling NaNs with the
# median does not change the median.
train_fare_median = train["Fare"].median()

for dataset in full_data:
    # Feature that tells whether a passenger had a cabin on the Titanic
    # (1 when the Cabin field is populated, 0 when it is missing).
    dataset["Has_Cabin"] = dataset["Cabin"].notna().astype(int)

    # Create new feature FamilySize as a combination of SibSp and Parch
    # (+1 counts the passenger themself).
    dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1

    # Create new feature IsAlone from FamilySize
    dataset["IsAlone"] = 0
    dataset.loc[dataset["FamilySize"] == 1, "IsAlone"] = 1

    # Remove all NULLS in the Embarked column
    dataset["Embarked"] = dataset["Embarked"].fillna("S")

    # Remove all NULLS in the Fare column
    dataset["Fare"] = dataset["Fare"].fillna(train_fare_median)

    # Remove all NULLS in the Age column: draw a random integer age from
    # [mean - std, mean + std) of this frame's observed ages.
    age_avg = dataset["Age"].mean()
    age_std = dataset["Age"].std()
    age_missing = dataset["Age"].isna()
    if age_missing.any():
        # Bounds are truncated to int explicitly instead of handing floats
        # to the integer sampler.
        age_low = int(age_avg - age_std)
        age_high = int(age_avg + age_std)
        dataset.loc[age_missing, "Age"] = age_rng.integers(
            age_low, age_high, size=int(age_missing.sum())
        )
    dataset["Age"] = dataset["Age"].astype(int)


# Define function to extract titles from passenger names
def get_title(name):
    """Return the honorific found in a passenger name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Braund, Mr. Owen" -> "Mr".
    """
    found = re.search(r" ([A-Za-z]+)\.", name)
    return found.group(1) if found else ""


# Titles too infrequent to model individually are pooled into "Rare".
rare_titles = ["Lady", "Countess", "Capt", "Col", "Don", "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"]
# Spelling variants folded into their common equivalents.
title_aliases = {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}

for frame in full_data:
    # Pull the honorific out of each passenger name ...
    extracted = frame["Name"].apply(get_title)
    # ... group all non-common titles into one single grouping "Rare" ...
    extracted = extracted.replace(rare_titles, "Rare")
    # ... and normalise the variant spellings.
    frame["Title"] = extracted.replace(title_aliases)

# Ordinal encodings applied identically to both frames. Defined once, outside
# the loop, because they do not depend on the frame being processed.
sex_mapping = {"female": 0, "male": 1}
title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
embarked_mapping = {"S": 0, "C": 1, "Q": 2}

# Right-inclusive bin edges: a value v lands in bin i when
# edges[i] < v <= edges[i + 1]. These reproduce the original thresholds
# (Fare: <=7.91, <=14.454, <=31, >31; Age: <=16, <=32, <=48, <=64, >64).
fare_bin_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
age_bin_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for dataset in full_data:
    # Mapping Sex
    dataset["Sex"] = dataset["Sex"].map(sex_mapping).astype(int)

    # Mapping titles; any title outside title_mapping becomes 0.
    dataset["Title"] = dataset["Title"].map(title_mapping).fillna(0).astype(int)

    # Mapping Embarked
    dataset["Embarked"] = dataset["Embarked"].map(embarked_mapping).astype(int)

    # Mapping Fare -> ordinal bins 0..3
    dataset["Fare"] = pd.cut(dataset["Fare"], bins=fare_bin_edges, labels=False).astype(int)

    # Mapping Age -> ordinal bins 0..4.
    # The previous in-place version ended with a bare
    # `dataset.loc[dataset["Age"] > 64, "Age"]` (no assignment), so passengers
    # older than 64 kept their raw age instead of being coded as bin 4.
    dataset["Age"] = pd.cut(dataset["Age"], bins=age_bin_edges, labels=False).astype(int)


# Feature selection: remove variables no longer containing relevant information
# Identifier / free-text columns, plus SibSp, which the engineered features replace.
drop_elements = ["PassengerId", "Name", "Ticket", "Cabin", "SibSp"]
# Rebind both names to new frames without those columns.
train, test = (frame.drop(columns=drop_elements) for frame in (train, test))




In [4]:
# Preview the engineered training frame; the rows shown below are all integer-coded.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,1,1,0,0,0,0,2,0,1
1,1,1,0,2,0,3,1,1,2,0,3
2,1,3,0,1,0,1,0,0,1,1,4
3,1,1,0,2,0,3,0,1,2,0,3
4,0,3,1,2,0,1,0,0,1,1,1


### Creating train and test sets

In [5]:
# Copy first so the engineered DataFrames stay untouched by later cells.
# Note: despite its name, train_np remains a DataFrame; only test_np, X and y
# end up as NumPy arrays.
train_np = train.copy()
test_np = test.copy().to_numpy()

# Target vector and feature matrix (every column except the label).
y = train_np["Survived"].to_numpy()
X = train_np.drop(columns="Survived").to_numpy()

In [6]:
# Sanity check that the DataFrame-to-NumPy conversion above produced plain ndarrays.
print(type(X), type(y), type(test_np))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [7]:
# Hold out 10% of the labelled rows for evaluation; the fixed random_state
# keeps the split identical across runs.
# NOTE(review): X_test / y_test come from train.csv and are unrelated to
# test.csv / test_np, which is not used by any model in the visible cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Decision Trees

### Create a decision tree model

In [32]:

# DecisionTree comes from the DecisionTree module imported in cell 2, not from scikit-learn.
# NOTE(review): the argument names mirror sklearn's DecisionTreeClassifier; confirm
# they carry the same meaning in that class.
dt = DecisionTree(max_depth=3, criterion="gini", min_samples_split=3, min_samples_leaf=2)

### Fit the model

In [33]:
# Train on the 90% split only; y_train holds the original 0/1 Survived labels.
dt.fit(X_train, y_train)

### Predict 

In [34]:
# Predictions for the held-out split from train_test_split (not for test.csv).
y_pred_dt = dt.predict(X_test)


In [11]:
# y_pred_dt

In [12]:
# print(y_test)

### Accuracy on test set

In [35]:
# accuracy_score returns the fraction of correct predictions; scale it to a percentage.
accuracy_dt = accuracy_score(y_test, y_pred_dt) * 100

In [36]:
# Report the held-out accuracy as a percentage with three decimals.
print(f"Accuracy on test set: {accuracy_dt:.3f}%")

Accuracy on test set: 84.444%


## Random Forest

### Create random forest model

In [55]:
# Ensemble of 15 trees built from the DecisionTree class passed in as the base learner.
# The fit log below shows each tree receiving between 5 and 10 feature indices, so
# min_features is presumably the lower bound of a random subset size; confirm in the
# RandomForest module.
# NOTE(review): no seed is passed here, so the feature subsets (and the reported
# accuracy) are likely to differ from run to run.
rf = RandomForest(DecisionTree, num_trees=15, min_features=5)

### Fit model

In [56]:
# Fitting logs one "Selected feature indices" line per tree (15 lines in the output below).
rf.fit(X_train, y_train)



Selected feature indices [4 2 1 5 0 6 7]
Selected feature indices [0 2 3 1 7]
Selected feature indices [2 3 4 8 6 0 5]
Selected feature indices [9 8 7 2 0 3]
Selected feature indices [9 3 1 6 0 7]
Selected feature indices [7 0 6 3 1]
Selected feature indices [8 6 2 1 4 5 3 9]
Selected feature indices [2 8 5 6 3 9 7]
Selected feature indices [5 4 3 2 6 9 1]
Selected feature indices [4 6 1 9 5 7 3 0 2 8]
Selected feature indices [1 3 2 7 4 8 6 5 0]
Selected feature indices [4 3 8 5 0 9 2 7]
Selected feature indices [1 4 8 3 7]
Selected feature indices [3 4 0 2 7 5 8 1 6 9]
Selected feature indices [3 8 4 6 7]


### Predict

In [57]:
y_pred_rf = rf.predict(X_test)

### Accuracy on test set

In [58]:
accuracy_rf = accuracy_score(y_test, y_pred_rf) * 100

In [59]:
print(f"Accuracy on test set: {accuracy_rf:.3f}%")

Accuracy on test set: 85.556%


## AdaBoost

### Convert y labels to {-1,1}

In [20]:
# Start from copies so the original 0/1 label arrays (y_train, y_test) stay
# available for the decision-tree and random-forest cells.
y_train_ada = y_train.copy()
y_test_ada = y_test.copy()

In [21]:
# Recode labels for AdaBoost as stated in the section heading: class 0 -> -1,
# everything else -> 1.
# The earlier chained form (`y_train_ada = y = ...`) also rebound `y`, silently
# replacing the full label vector built before the train/test split with the
# training labels and then the test labels. Only the *_ada arrays are assigned now.
y_train_ada = np.where(y_train_ada == 0, -1, 1)
y_test_ada = np.where(y_test_ada == 0, -1, 1)

In [22]:
# y_train_ada

### Create AdaBoost model

In [23]:

# Boosted ensemble of 55 learners using the same DecisionTree class as the base learner.
# NOTE(review): how AdaBoost configures each learner and applies learning_rate is
# not visible in this notebook; confirm in the AdaBoost module.
ab = AdaBoost(DecisionTree, num_learners=55, learning_rate=1)
# Fit on the {-1, 1} recoded training labels rather than the original 0/1 ones.
ab.fit(X_train, y_train_ada)

### Predict

In [24]:
y_pred_ab = ab.predict(X_test)

In [25]:
# print(y_pred_ab)

In [26]:
# y_test_ada

### Accuracy on test set

In [27]:
# Score against the {-1, 1} recoded labels. ab.predict presumably returns values in
# the same encoding; verify against the AdaBoost module. Scaled to a percentage.
accuracy_ada = accuracy_score(y_test_ada, y_pred_ab) * 100

In [28]:
print(f"Accuracy on test set: {accuracy_ada:.3f}%")

Accuracy on test set: 86.667%


## Results

In [60]:
# One row per model: display name and held-out accuracy in percent.
scores = pd.DataFrame(
    [
        ("Decision Tree", accuracy_dt),
        ("Random Forest", accuracy_rf),
        ("AdaBoost", accuracy_ada),
    ],
    columns=["Models", "Test Accuracy"],
)

In [61]:
# Round each accuracy to three decimals for display.
scores["Test Accuracy"] = scores["Test Accuracy"].map(lambda pct: round(pct, 3))

In [62]:
scores

Unnamed: 0,Models,Test Accuracy
0,Decision Tree,84.444
1,Random Forest,85.556
2,AdaBoost,86.667
