Sarthak Wadhawan
Student ID: 1002028186
For: CSE-6363, Machine Learning

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn import tree
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from tabulate import tabulate

In [2]:
train = pd.read_csv('/Users/sarthakwadhawan/Documents/Subjects/UTA/ML-UTA/titanic/train.csv')
test = pd.read_csv('/Users/sarthakwadhawan/Documents/Subjects/UTA/ML-UTA/titanic/test.csv')

In [3]:
PassengerId = test['PassengerId']
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
original_train = train.copy() 
full_data = [train, test]

In [5]:
# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    #IsAlone is 1 where FamilySize is 1

# Removing all NULLS from the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

# treating NULLS for the Fare column
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
    
# treating NULLS in Age 
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # Next line has been improved to avoid warning
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# extracting the titles
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

# Grouping all non-common titles into "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # Mapping titles
    title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] 

drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
test  = test.drop(drop_elements, axis = 1)

Title VS Sex

In [6]:
train[['Title', 'Survived']].groupby(['Title'], as_index=False).agg(['mean', 'count', 'sum'])

Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,mean,count,sum
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0.156673,517,81
2,0.575,40,23
3,0.793651,126,100
4,0.702703,185,130
5,0.347826,23,8


In [7]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).agg(['mean', 'count', 'sum'])

Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,mean,count,sum
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0.742038,314,233
1,0.188908,577,109


In [8]:
title_and_sex = original_train.copy()[['Name', 'Sex']]

# Create 'Title' feature
title_and_sex['Title'] = title_and_sex['Name'].apply(get_title)

# Mapping 'Sex' as binary feature
title_and_sex['Sex'] = title_and_sex['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

# Table with 'Sex' distribution grouped by 'Title'
title_and_sex[['Title', 'Sex']].groupby(['Title'], as_index=False).agg(['mean', 'count', 'sum'])
# Here: 
# MEAN: percentage of men
# COUNT: total observations
# SUM: number of men


Unnamed: 0_level_0,Sex,Sex,Sex
Unnamed: 0_level_1,mean,count,sum
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Capt,1.0,1,1
Col,1.0,2,2
Countess,0.0,1,0
Don,1.0,1,1
Dr,0.857143,7,6
Jonkheer,1.0,1,1
Lady,0.0,1,0
Major,1.0,2,2
Master,1.0,40,40
Miss,0.0,182,0


getting the x and y train and test variables ready

In [9]:
y_train = train['Survived']
x_train = train.drop(['Survived'], axis=1).values 
x_test = test.values

In [10]:
adaboost_x_train = x_train
adaboost_x_test = x_train
adaboost_y_train = y_train

In [11]:
def compute(data, k):
    return np.sum(data == k) / data.shape[0]

def misclass_rate(preds, target):
    pi_target = compute(preds, target)
    return 1 - pi_target

def entropy(targets, n_classes):
    result = 0
    for i in range(n_classes):
        pi_c = compute(targets, i)
        if pi_c > 0:
            result -= pi_c * np.log2(pi_c)

    return result

def gini_index(targets, n_classes):
    sum = 0
    for i in range(n_classes):
        pi_c = compute(targets, i)
        sum += pi_c**2
    return 1 - sum


In [12]:
class Node:
    def __init__(self, feature_indices=None, split=None, predicted_class=None, num_samples=None):
        self.feature_indices = feature_indices
        self.split = split
        self.predicted_class = predicted_class
        self.left = None
        self.right = None
        self.is_leaf = False
        self.num_samples = num_samples

    def set_para(self, feature_indices, split):
        print(feature_indices, split)
        self.feature_indices = feature_indices
        self.split = split
    
    def set_children(self, left, right):
        self.left = left
        self.right = right


In [13]:
class DecisionTree:
  def __init__(self, criterion, max_depth,min_samples_split,min_samples_leaf):
    
    # criterion - Either misclassification rate, Gini impurity, or entropy.
    # max_depth - The maximum depth the tree should grow.
    # min_samples_split - The minimum number of samples required to split.
    # min_samples_leaf - The minimum number of samples required for a leaf node.
    
    self.criterion = criterion
    self.max_depth= max_depth
    self.min_samples_split = min_samples_split
    self.min_samples_leaf = min_samples_leaf
    self.root = None

  def fit(self,X,y):
    #This method take two parameters:
    # X - The data of size (n_samples, n_features).
    # y - The labels of size (n_samples).

    self.n_classes_ = len(np.unique(y)) 
    self.classes_ = np.unique(y) 
    self.n_samples, self.n_features_ = X.shape
    self.root = self.grow_decision_tree(X, y)
    return self

  def grow_decision_tree(self,X,y,depth = 0):
    num_samples_per_class = [sum(y == i) for i in range(self.n_classes_)] 
    predicted_class = np.argmax(num_samples_per_class) 
    node = Node(predicted_class=predicted_class, num_samples=y.size) 
    stopping_condition = depth < self.max_depth and y.size >= self.min_samples_split 
    if stopping_condition:
      best_gain = 0.0 
      best_criteria = None
      best_sets = None
      for feature_indices in range(self.n_features_): 
        feature_values = X[:, feature_indices]
        possible_splits = self.split_the_data(feature_values) 
        for split in possible_splits: 
          left_indices = feature_values <= split 
          right_indices = feature_values > split 
          if np.sum(left_indices) > 0 and np.sum(right_indices) > 0: 
            y_left = y[left_indices]
            y_right = y[right_indices]
            gain = self._criterion_gain(y, y_left, y_right) 
            if gain > best_gain:  
              best_gain = gain
              best_criteria = (feature_indices, split)
              best_sets = (left_indices, right_indices)
      if best_gain > 0.0:
        left = self.grow_decision_tree(X[best_sets[0]], y[best_sets[0]], depth + 1) 
        right = self.grow_decision_tree(X[best_sets[1]], y[best_sets[1]], depth + 1) 
        node.set_para(best_criteria[0],best_criteria[1]) 
        node.set_children(left,right) 
        node.num_samples = None
      else:
        node.is_leaf = True 
        node.set_children(None,None)
    return node

  def split_the_data(self, feature_array):    
    unique_vals = np.unique(feature_array)
    if len(unique_vals) == 1:
        return []
    mp = (unique_vals[:-1] + unique_vals[1:]) / 2
    return mp

  def _criterion_gain(self, y, y_left, y_right):
    parent_score = self.criterion_function(y)
    left_score = self.criterion_function(y_left)
    right_score = self.criterion_function(y_right)
    fl = y_left.size / y.size
    fr = y_right.size / y.size
    gain = parent_score - (fl * left_score + fr * right_score)
    return gain

  def criterion_function(self, y):

    if self.criterion == 'gini':
      _, l = np.unique(y, return_counts=True)
      probs = l / y.size
      cal_score = 1.0 - np.sum(probs ** 2)
    elif self.criterion == 'entropy':
      _, l = np.unique(y, return_counts=True)
      probs = l / y.size
      cal_score = -np.sum(probs * np.log2(probs))
    elif self.criterion == 'misclassification error':
      _, l = np.unique(y, return_counts=True)
      probs = l / y.size
      cal_score = 1.0 - np.max(probs)
    else:
        raise ValueError("Invalid criterion. Allowed values: 'gini', 'entropy', 'misclassification error'.")
    return cal_score
  
  def predict(self, X):

    n_samples = X.shape[0]
    y_pred = np.zeros((n_samples,))
    for i in range(n_samples):
      y_pred[i] = self.traversing_all_nodes(X[i], self.root)
    return y_pred

  def traversing_all_nodes(self, x, node):
      if node.is_leaf:
        return node.predicted_class
      if node.split is None:
          return node.predicted_class
      if x[node.feature_indices] <= node.split:
        return self.traversing_all_nodes(x, node.left)
      else:
        return self.traversing_all_nodes(x, node.right)


In [14]:
X = np.array([[1, 2], [2, 1], [3, 4], [4, 3]])
y = np.array([0, 0, 1, 1])

# create a decision tree and fit it to the data
tree = DecisionTree(criterion='entropy', max_depth=2, min_samples_split=2, min_samples_leaf=1)
tree.fit(X, y)

# predict the class labels for the test data
y_pred = tree.predict(x_test)

print('Predicted class labels:', y_pred)


0 2.5
Predicted class labels: [1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0.
 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 1.
 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1.
 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 0.
 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1.
 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0.
 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0.
 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1.
 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1.
 1. 1. 0. 1. 0. 0. 1.

In [15]:
# generating a sample dataset
M, n = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
X_train, X_test, y_tr, y_test = train_test_split(M, n, test_size=0.2, random_state=42)
dt = DecisionTree(criterion='gini', max_depth=2, min_samples_split=2, min_samples_leaf=1)
dt.fit(X_train, y_tr)
y_pred = dt.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print('Accuracy for sample dataset:', accuracy*100, "%")
# print(dt.tree)

8 -1.9120663861844729
2 -0.03711037515664706
6 -0.34836910090870504
Accuracy for sample dataset: 85.0 %


In [16]:
dt = DecisionTree(criterion='gini', max_depth=2, min_samples_split=2, min_samples_leaf=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print('Accuracy for Titanic test data:', accuracy*100, "%")

0 2.0
Accuracy for Titanic test data: 100.0 %


In [17]:
class RandomForest:
    def __init__(self, classifier, num_trees, min_features):
        self.classifier = classifier
        self.num_trees = num_trees
        self.min_features = min_features
        self.trees = []
        self.feature_indices = None

    def fit(self, X, y):
        num_samples = X.shape[0]
        num_features = X.shape[1]

        for i in range(self.num_trees):
            # Sample data with replacement
            sample_indices = np.random.choice(num_samples, num_samples, replace=True)

            # Select a random subset of features
            num_selected_features = np.random.randint(self.min_features, num_features + 1)
            self.feature_indices = np.random.choice(num_features, num_selected_features, replace=False)

            # Extract the selected samples and features from the dataset
            X_subset = X[sample_indices][:, self.feature_indices]
            y_subset = y[sample_indices]

            # Create a new classifier object
            clf = self.classifier

            # Fit the classifier on the selected subset of data
            clf.fit(X_subset, y_subset)

            # Add the trained classifier to the list of trees
            self.trees.append(clf)

    def predict(self, X):
        # Predict the class label for each sample using each tree in the forest
        predictions = np.array([tree.predict(X[:, self.feature_indices]) for tree in self.trees]).T
        
        # Convert predictions to integer format
        predictions = predictions.astype(int)
        
        # Return the prediction with the most votes
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=predictions)



In [18]:
dt = DecisionTree(criterion='gini', max_depth=7, min_samples_split=2, min_samples_leaf=1)
rf = RandomForest(classifier=dt, num_trees=10, min_features=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_train)
accuracy_rf = np.mean(y_pred == y_train)
print('Accuracy for Titanic with Random Forrest:', accuracy_rf*100, "%")
y_pred = rf.predict(x_test)
# print("Predicted Class Label:", y_pred)

0 2.5
0 2.5
0 2.5
0 3.0
0 2.5
0 2.5
0 2.5
Accuracy for Titanic with Random Forrest: 100.0 %


In [19]:
y_pred_test = rf.predict(x_test)
accuracy_test = np.mean(y_pred_test == y_test)
print('Test Accuracy:', accuracy_test*100,"%")

Test Accuracy: 100.0 %


In [20]:
class AdaBoost:
    def __init__(self, weak_learner, num_learners, learning_rate):
        self.weak_learner = weak_learner
        self.num_learners = num_learners
        self.learning_rate = learning_rate
        self.learners = []
        self.weights = []

    def fit(self, X, y):
        # Initialize weights
        n_samples = X.shape[0]
        self.weights = np.full(n_samples, 1/n_samples)

        for i in range(self.num_learners):
            # Weight training data by sample weights
            # print(f"x={X.shape},w={self.weights.shape}")
            weighted_X = np.multiply(X, np.array(self.weights).reshape(-1, 1))
            # print(f"y={y.shape},wx={weighted_X.shape}")

            # Train weak learner on weighted data
            learner = self.weak_learner.fit(weighted_X, y)

            # Predict on training data and calculate error
            y_pred = learner.predict(X)
            error = np.sum(self.weights * (y_pred != y)) / np.sum(self.weights)

            # Calculate learner weight and update sample weights
            learner_weight = self.learning_rate * np.log((1 - error) / error)
            self.weights *= np.exp(-learner_weight * y * y_pred)

            # Add learner and weight to ensemble
            self.learners.append(learner)
            self.weights /= np.sum(self.weights)

            # Check if perfect fit is achieved and stop early
            if error == 0:
                break

    def predict(self, X):
        # Predict using weighted ensemble of learners
        # print(f"x={X.shape},w={self.weights.shape}")
        learner_preds = [learner.predict(X) for learner in self.learners]

        weights = np.tile(np.array(self.weights)[:, np.newaxis], (1, len(self.learners)))
        # print(f"x={np.array(learner_preds).shape},w={weights.shape}")

        ensemble_pred = np.sign(np.sum(weights * np.array(learner_preds).T, axis=1))
        return ensemble_pred

In [21]:
# Create AdaBoost classifier with DecisionTree weak learner
adaboost = AdaBoost(weak_learner=DecisionTree(criterion='entropy', max_depth=18, min_samples_split=2, min_samples_leaf=1),
                    num_learners=50,
                    learning_rate=0.1)

# Train AdaBoost classifier on training data
adaboost.fit(adaboost_x_train, adaboost_y_train)
y_pred = adaboost.predict(adaboost_x_train)
accuracy_train = np.mean(y_pred == adaboost_y_train)
print('Training Accuracy:', accuracy_train*100,"%")

5 0.0005611672278338945
5 0.0005611672278338945
2 0.0016835016835016836
3 0.0005611672278338945
5 0.0005611672278338945
5 0.0005611672278338945
2 0.0016835016835016836
2 0.0016835016835016836
3 0.0005611672278338945
6 0.0028058361391694727
2 0.0016835016835016836
5 0.0005611672278338945
6 0.003928170594837262
6 0.0056116722783389455
6 0.0016835016835016836
4 0.0028058361391694727
4 0.001122334455667789
4 0.0028058361391694727
6 0.0028058361391694727
3 0.0005611672278338945
3 0.0016835016835016836
4 0.0028058361391694727
5 0.0005611672278338945
2 0.038159371492704826
2 0.08473625140291807
2 0.0028058361391694727
2 0.0005611672278338945
6 0.0016835016835016836
3 0.0005611672278338945
6 0.0016835016835016836
4 0.0016835016835016836
5 0.0005611672278338945
4 0.0005611672278338945
4 0.0028058361391694727
6 0.0016835016835016836
5 0.0016835016835016836
5 0.0005611672278338945
2 0.0005611672278338945
2 0.0005611672278338945
5 0.0005611672278338945
6 0.0016835016835016836
4 0.00056116722783389

In [22]:
y_pred = adaboost.predict(adaboost_x_test)
# print("Predicted Class Label:", y_pred)
adaboost.fit(adaboost_x_test, adaboost_y_train)
y_pred_test = adaboost.predict(adaboost_x_test)
accuracy_test = np.mean(y_pred_test == adaboost_y_train)
print('Testing Accuracy:', accuracy_test*100,"%")

5 0.0005611672278338945
5 0.0005611672278338945
2 0.0016835016835016836
3 0.0005611672278338945
5 0.0005611672278338945
5 0.0005611672278338945
2 0.0016835016835016836
2 0.0016835016835016836
3 0.0005611672278338945
6 0.0028058361391694727
2 0.0016835016835016836
5 0.0005611672278338945
6 0.003928170594837262
6 0.0056116722783389455
6 0.0016835016835016836
4 0.0028058361391694727
4 0.001122334455667789
4 0.0028058361391694727
6 0.0028058361391694727
3 0.0005611672278338945
3 0.0016835016835016836
4 0.0028058361391694727
5 0.0005611672278338945
2 0.038159371492704826
2 0.08473625140291807
2 0.0028058361391694727
2 0.0005611672278338945
6 0.0016835016835016836
3 0.0005611672278338945
6 0.0016835016835016836
4 0.0016835016835016836
5 0.0005611672278338945
4 0.0005611672278338945
4 0.0028058361391694727
6 0.0016835016835016836
5 0.0016835016835016836
5 0.0005611672278338945
2 0.0005611672278338945
2 0.0005611672278338945
5 0.0005611672278338945
6 0.0016835016835016836
4 0.00056116722783389

In [23]:
headers = ["Model", "Accuracy"]
rows = [
    ["Decision Tree", f"{accuracy:.2%}"],
    ["Random Forest", f"{accuracy_rf:.2%}"],
    ["AdaBoost", f"{accuracy_test:.2%}"],
]
print(tabulate(rows, headers=headers))

Model          Accuracy
-------------  ----------
Decision Tree  100.00%
Random Forest  100.00%
AdaBoost       61.62%
