In [3]:
#DECISIONTREE

import numpy as np

class DecisionTree:
    def __init__(self, criterion=None, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def fit(self, X, y, sample_weight=None):
        self.tree = self._build_tree(X, y, depth=0)

    def predict(self, X):
        if self.tree is None:
            raise Exception("The tree has not been trained yet. Please fit the model first.")
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _build_tree(self, X, y, depth):
        num_samples, num_features = X.shape

        if depth >= self.max_depth or num_samples < self.min_samples_split or len(np.unique(y)) == 1:
            return {'leaf': True, 'prediction': self._calculate_prediction(y)}

        if self.criterion == 'gini':
            best_split = self._get_best_split_gini(X, y)
        elif self.criterion == 'entropy':
            best_split = self._get_best_split_entropy(X, y)
        elif self.criterion == 'misclassification':
            best_split = self._get_best_split_misclassification(X, y)
        else:
            raise ValueError("Invalid criterion. Supported values are 'gini', 'entropy', and 'misclassification'.")

        if best_split['left_X'].shape[0] < self.min_samples_leaf or best_split['right_X'].shape[0] < self.min_samples_leaf:
            return {'leaf': True, 'prediction': self._calculate_prediction(y)}

        left_tree = self._build_tree(best_split['left_X'], best_split['left_y'], depth + 1)
        right_tree = self._build_tree(best_split['right_X'], best_split['right_y'], depth + 1)

        return {'leaf': False, 'split_feature': best_split['split_feature'], 'split_value': best_split['split_value'],
                'left_tree': left_tree, 'right_tree': right_tree}

    def _get_best_split_gini(self, X, y):
        num_samples, num_features = X.shape
        best_gini = float('inf')
        best_split = None


        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            for value in unique_values:
                left_mask = X[:, feature] <= value
        
                right_mask = ~left_mask

                left_y = y[left_mask]
                right_y = y[right_mask]

                gini = (len(left_y) / num_samples) * self._calculate_gini(left_y) + \
                       (len(right_y) / num_samples) * self._calculate_gini(right_y)

                if gini < best_gini:
                    best_gini = gini
                    best_split = {'split_feature': feature, 'split_value': value,
                                  'left_X': X[left_mask], 'left_y': left_y,
                                  'right_X': X[right_mask], 'right_y': right_y}
        return best_split

    def _get_best_split_entropy(self, X, y):
        num_samples, num_features = X.shape
        best_entropy = float('inf')
        best_split = None

        
                
        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            for value in unique_values:
                left_mask = X[:, feature] <= value 
    
                right_mask = ~left_mask

                left_y = y[left_mask]
                right_y = y[right_mask]

                entropy = (len(left_y) / num_samples) * self._calculate_entropy(left_y) + \
                          (len(right_y) / num_samples) * self._calculate_entropy(right_y)

                if entropy < best_entropy:
                    best_entropy = entropy
                    best_split = {'split_feature': feature, 'split_value': value,
                                  'left_X': X[left_mask], 'left_y': left_y,
                                  'right_X': X[right_mask], 'right_y': right_y}
        return best_split

    def _get_best_split_misclassification(self, X, y):
        num_samples, num_features = X.shape
        best_misclassification = float('inf')
        best_split = None
        
        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            for value in unique_values:
                left_mask = X[:, feature] <= value 

                right_mask = ~left_mask

                left_y = y[left_mask]
                right_y = y[right_mask]

                misclassification = (len(left_y) / num_samples) * self._calculate_misclassification(left_y) + \
                                    (len(right_y) / num_samples) * self._calculate_misclassification(right_y)

                if misclassification < best_misclassification:
                    best_misclassification = misclassification
                    best_split = {'split_feature': feature, 'split_value': value,
                                  'left_X': X[left_mask], 'left_y': left_y,
                                  'right_X': X[right_mask], 'right_y': right_y}
        return best_split

    def _calculate_gini(self, y):
        if len(y) == 0:
            return 0
        p = np.bincount(y) / len(y)
        return 1 - np.sum(p ** 2)

    def _calculate_entropy(self, y):
        if len(y) == 0:
            return 0
    
        p = np.bincount(y) / len(y)
        p = p[p > 0]  # Exclude zero probabilities to avoid divide by zero and log(0) issues
        return -np.sum(p * np.log2(p))

    def _calculate_misclassification(self, y):
        if len(y) == 0:
            return 0
        p = np.bincount(y) / len(y)
        return 1 - np.max(p)

    def _calculate_prediction(self, y):
        # Calculate the class prediction based on the most frequent class in y
        return np.argmax(np.bincount(y))

    def _traverse_tree(self, x, node):
        if node['leaf']:
            return node['prediction']

    
        if x[node['split_feature']] <= node['split_value']:
            return self._traverse_tree(x, node['left_tree'])
        else:
            return self._traverse_tree(x, node['right_tree'])


In [4]:
#RANDOM FOREST

import numpy as np

class RandomForest:
    def __init__(self, classifier, num_trees, min_features):
        self.classifier = classifier
        self.num_trees = num_trees
        self.min_features = min_features
        self.forest = []

    def fit(self, X, y):
        self.forest = []

        for _ in range(self.num_trees):
            # Sample with replacement
            sampled_indices = np.random.choice(len(X), len(X), replace=True)
            X_sampled = X[sampled_indices]
            y_sampled = y[sampled_indices]

            # Select random subset of features
            selected_features = np.random.choice(X.shape[1], self.min_features, replace=False)

            # Use the provided DecisionTree instance directly
            tree = self.classifier
            tree.fit(X_sampled[:, selected_features], y_sampled)

            self.forest.append({'tree': tree, 'selected_features': selected_features})

    def predict(self, X):
        if not self.forest:
            raise Exception("The random forest has not been trained yet. Please fit the model first.")

        predictions = []

        for tree_info in self.forest:
            tree = tree_info['tree']
            selected_features = tree_info['selected_features']
            prediction = tree.predict(X[:, selected_features])
            predictions.append(prediction)

        return np.array([np.argmax(np.bincount(p)) for p in np.array(predictions).T])
    
 

In [5]:
#BOOSTING

import numpy as np

class AdaBoost:
    def __init__(self, weak_learner, num_learners, learning_rate):
        self.weak_learner = weak_learner
        self.num_learners = num_learners
        self.learning_rate = learning_rate
        self.learners = []
        self.alphas = []

    def fit(self, X, y):
        num_samples = X.shape[0]
        weights = np.ones(num_samples) / num_samples 

        for m in range(self.num_learners):
            learner = self.weak_learner(criterion='gini', max_depth=4)  
            learner.fit(X, y, sample_weight=weights)

            predictions = learner.predict(X)
            incorrect = predictions != y
            error = np.sum(weights[incorrect])

            if error == 0:
                self.learners.append(learner)
                self.alphas.append(1.0)
                break

            alpha = 0.5 * np.log((1 - error) / error)
            self.learners.append(learner)
            self.alphas.append(alpha)

            weights *= np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)

    def predict(self, X):
        if not self.learners or not self.alphas:
            raise Exception("error.")

        learner_predictions = np.array([learner.predict(X) for learner in self.learners])
        weighted_sum = np.dot(self.alphas, learner_predictions)
        return np.sign(weighted_sum).astype(int)


In [12]:
#TITANIC DATASET
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import re

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

PassengerId = test['PassengerId']


original_train = train.copy() # Using 'copy()' allows to clone the dataset, creating a different object with the same values

# Feature engineering steps taken from Sina and Anisotropic, with minor changes to avoid warnings
full_data = [train, test]

# Feature that tells whether a passenger had a cabin on the Titanic
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Remove all NULLS in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Remove all NULLS in the Fare column
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

# Remove all NULLS in the Age column
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # Next line has been improved to avoid warning
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] ;
# Feature selection: remove variables no longer containing relevant information
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
test  = test.drop(drop_elements, axis = 1)

X = train.drop('Survived', axis=1)
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# DECISION TREE
dt_model = DecisionTree(criterion='entropy', max_depth=3)
dt_model.fit(X_train.values, y_train.values)
dt_predictions = dt_model.predict(X_test.values)
dt_accuracy = accuracy_score(y_test, dt_predictions)

print(f"Decision Tree Accuracy: {dt_accuracy}")

# RandomForest
dt_classifier = DecisionTree(criterion='entropy', max_depth=4, min_samples_split=2, min_samples_leaf=1)
random_forest = RandomForest(classifier=dt_classifier, num_trees=11, min_features=5)
random_forest.fit(X_train.values, y_train.values)
predictions = random_forest.predict(X_test.values)
accuracy = accuracy_score(y_test, predictions)

print(f"Random Forest Accuracy: {accuracy}")

# AdaBoost
adaboost_classifier = AdaBoost(weak_learner=DecisionTree, num_learners=100, learning_rate=1.0)
adaboost_classifier.fit(X_train.values, y_train.values)
predictions = adaboost_classifier.predict(X_test.values)
accuracy = accuracy_score(y_test, predictions)
print(f"Boosting Accuracy: {accuracy}")


Decision Tree Accuracy: 0.8212290502793296
Random Forest Accuracy: 0.6033519553072626
Boosting Accuracy: 0.8156424581005587
