In [38]:
# Import necessary libraries
import numpy as np
import pandas as pd

# Define the DecisionTree class
class DecisionTree:

    # Initialize the tree with optional max_depth and current depth
    def __init__(self, max_depth = 6, depth = 1):
        self.max_depth = max_depth # Maximum depth of the tree
        self.depth = depth # Current depth of the node
        self.left = None # Left child node
        self.right = None # Right child node

    # Fit the decision tree model to the data
    def fit(self, data, target):
        # Stop splitting if max depth is reached
        if self.depth <= self.max_depth:
            print(f"processing at Depth: {self.depth}") # Print current processing depth
            self.data = data # Store the data
            self.target = target # Store the target variable name
            self.independent = self.data.columns.tolist() # Get independent variable names
            self.independent.remove(target) # Remove target variable from independent variables
            self.__validate_data() # Validate data types and target variable
            self.impurity_score = self.__calculate_impurity_score(self.data[self.target]) # Calculate impurity score for the current node
            # Find the best split point and feature
            self.criteria, self.split_feature, self.information_gain = self.__find_best_split()
            # Create branches if a valid split is found and information gain is positive
            if self.criteria is not None and self.information_gain > 0:
              self.__create_branches()
        else:
            print("Stopping splitting as Max depth reached") # Message when max depth is reached

    # Create left and right child branches
    def __create_branches(self):
        # Create new DecisionTree instances for left and right branches with incremented depth
        self.left = DecisionTree(max_depth = self.max_depth,
                                 depth = self.depth + 1)
        self.right = DecisionTree(max_depth = self.max_depth,
                                 depth = self.depth + 1)
        # Split the data based on the best split criteria and feature
        left_rows = self.data[self.data[self.split_feature] <= self.criteria]
        right_rows = self.data[self.data[self.split_feature] > self.criteria]
        # Recursively fit the left and right branches
        self.left.fit(data = left_rows, target = self.target)
        self.right.fit(data = right_rows, target = self.target)

    # Calculate the impurity score (Gini impurity for binary classification)
    def __calculate_impurity_score(self, data):
       # Return 0 if data is empty or None
       if data is None or data.empty: return 0
       # Calculate probability of the positive class (assuming binary classification)
       # The line below assumes the value_counts() will return exactly two values.
       # This could be improved to handle cases with more than two classes or missing classes in a subset.
       p_i, _ = data.value_counts().apply(lambda x: x/len(data)).tolist()
       # Calculate Gini impurity
       return p_i * (1 - p_i) * 2

    # Find the best split point and feature for the current node
    def __find_best_split(self):
        best_split = {} # Dictionary to store the best split information
        # Iterate through independent features
        for col in self.independent:
            # Find the best split for the current column
            information_gain, split = self.__find_best_split_for_column(col)
            # Skip if no valid split is found for the column
            if split is None: continue
            # Update best_split if current split has higher information gain
            if not best_split or best_split["information_gain"] < information_gain:
                best_split = {"split": split, "col": col, "information_gain": information_gain}

        # Return the best split criteria, feature, and information gain
        return best_split.get("split"), best_split.get("col"), best_split.get("information_gain")

    # Find the best split point for a specific column
    def __find_best_split_for_column(self, col):
        x = self.data[col] # Get the data for the column
        unique_values = x.unique() # Get unique values in the column
        # Return None if there's only one unique value (no split possible)
        if len(unique_values) == 1: return None, None
        information_gain = None # Initialize information gain
        split = None # Initialize split point
        # Iterate through unique values to find the best split
        for val in unique_values:
            # Split data based on the current value
            left = x <= val
            right = x > val
            left_data = self.data[left]
            right_data = self.data[right]
            # Calculate impurity for left and right branches
            left_impurity = self.__calculate_impurity_score(left_data[self.target])
            right_impurity = self.__calculate_impurity_score(right_data[self.target])
            # Calculate information gain for the split
            score = self.__calculate_information_gain(left_count = len(left_data),
                                                      left_impurity = left_impurity,
                                                      right_count = len(right_data),
                                                      right_impurity = right_impurity)
            # Update best information gain and split for the column
            if information_gain is None or score > information_gain:
                information_gain = score
                split = val
        # Return the best information gain and split for the column
        return information_gain, split

    # Calculate information gain
    def __calculate_information_gain(self, left_count, left_impurity, right_count, right_impurity):
        # Calculate weighted average impurity of branches and subtract from parent impurity
        return self.impurity_score - ((left_count/len(self.data)) * left_impurity + \
                                      (right_count/len(self.data)) * right_impurity)

    # Predict method
    def predict(self, data):
        # Apply the tree traversal to each row in the data
        return np.array([self.__flow_data_thru_tree(row) for _, row in data.iterrows()])

    # Method to flow data through the tree for prediction
    def __flow_data_thru_tree(self, row):
        # If it's a leaf node, return the probability distribution at this node
        if self.is_leaf_node:
            return self.probability
        # If not a leaf node, decide whether to go left or right based on the split criterion
        tree = self.left if row[self.split_feature] <= self.criteria else self.right
        # Recursively call __flow_data_thru_tree on the appropriate child node
        return tree.__flow_data_thru_tree(row)

    # Validate input data
    def __validate_data(self):
        # Check if independent columns are numeric (excluding boolean as they can be treated as numeric)
        # Removing 'bool' from the check as boolean columns can be treated as numeric (0 or 1).
        non_numeric_columns = self.data[self.independent].select_dtypes(include=['category', 'object']).columns.tolist()
        if(len(set(self.independent).intersection(set(non_numeric_columns))) != 0):
            raise RuntimeError("Not all columns are numeric")

        # Ensure the target variable is categorical and has exactly two categories
        self.data[self.target] = self.data[self.target].astype("category")
        if(len(self.data[self.target].cat.categories) != 2):
            raise RuntimeError("Implementation is only for Binary Classification")

    # Property to check if the current node is a leaf node
    @property
    def is_leaf_node(self): return self.left is None

    # Property to get the probability distribution at the current node
    @property
    def probability(self):
        # Calculate and return the probability distribution of the target variable
        return self.data[self.target].value_counts().apply(lambda x: x/len(self.data)).tolist()

In [39]:

import numpy as np
import pandas as pd

class DecisionTree:

    def __init__(self, max_depth = 6, depth = 1):
        self.max_depth = max_depth
        self.depth = depth
        self.left = None
        self.right = None

    def fit(self, data, target):
        if self.depth <= self.max_depth: print(f"processing at Depth: {self.depth}")
        self.data = data
        self.target = target
        self.independent = self.data.columns.tolist()
        self.independent.remove(target)
        if self.depth <= self.max_depth:
            self.__validate_data()
            self.impurity_score = self.__calculate_impurity_score(self.data[self.target])
            self.criteria, self.split_feature, self.information_gain = self.__find_best_split()
            if self.criteria is not None and self.information_gain > 0: self.__create_branches()
        else:
            print("Stopping splitting as Max depth reached")

    def __create_branches(self):
        self.left = DecisionTree(max_depth = self.max_depth,
                                 depth = self.depth + 1)
        self.right = DecisionTree(max_depth = self.max_depth,
                                 depth = self.depth + 1)
        left_rows = self.data[self.data[self.split_feature] <= self.criteria]
        right_rows = self.data[self.data[self.split_feature] > self.criteria]
        self.left.fit(data = left_rows, target = self.target)
        self.right.fit(data = right_rows, target = self.target)

    def __calculate_impurity_score(self, data):
       if data is None or data.empty: return 0
       p_i, _ = data.value_counts().apply(lambda x: x/len(data)).tolist()
       return p_i * (1 - p_i) * 2

    def __find_best_split(self):
        best_split = {}
        for col in self.independent:
            information_gain, split = self.__find_best_split_for_column(col)
            if split is None: continue
            if not best_split or best_split["information_gain"] < information_gain:
                best_split = {"split": split, "col": col, "information_gain": information_gain}

        return best_split.get("split"), best_split.get("col"), best_split.get("information_gain")

    def __find_best_split_for_column(self, col):
        x = self.data[col]
        unique_values = x.unique()
        if len(unique_values) == 1: return None, None
        information_gain = None
        split = None
        for val in unique_values:
            left = x <= val
            right = x > val
            left_data = self.data[left]
            right_data = self.data[right]
            left_impurity = self.__calculate_impurity_score(left_data[self.target])
            right_impurity = self.__calculate_impurity_score(right_data[self.target])
            score = self.__calculate_information_gain(left_count = len(left_data),
                                                      left_impurity = left_impurity,
                                                      right_count = len(right_data),
                                                      right_impurity = right_impurity)
            if information_gain is None or score > information_gain:
                information_gain = score
                split = val
        return information_gain, split

    def __calculate_information_gain(self, left_count, left_impurity, right_count, right_impurity):
        return self.impurity_score - ((left_count/len(self.data)) * left_impurity + \
                                      (right_count/len(self.data)) * right_impurity)

    def predict(self, data):
        return np.array([self.__flow_data_thru_tree(row) for _, row in data.iterrows()])

    def __validate_data(self):
        non_numeric_columns = self.data[self.independent].select_dtypes(include=['category', 'object', 'bool']).columns.tolist()
        if(len(set(self.independent).intersection(set(non_numeric_columns))) != 0):
            raise RuntimeError("Not all columns are numeric")

        self.data[self.target] = self.data[self.target].astype("category")
        if(len(self.data[self.target].cat.categories) != 2):
            raise RuntimeError("Implementation is only for Binary Classification")

    def __flow_data_thru_tree(self, row):
        if self.is_leaf_node: return self.probability
        tree = self.left if row[self.split_feature] <= self.criteria else self.right
        return tree.__flow_data_thru_tree(row)

    @property
    def is_leaf_node(self): return self.left is None

    @property
    def probability(self):
        return self.data[self.target].value_counts().apply(lambda x: x/len(self.data)).tolist()

In [40]:
# Load the train and test datasets
try:
    train_df = pd.read_csv('titanic_train_preprocessed.csv')
    test_df = pd.read_csv('titanic_test_preprocessed.csv')

    print("Titanic train data head:")
    display(train_df.head())

    print("\nTitanic test data head:")
    display(test_df.head())

except FileNotFoundError:
    print("Make sure 'titanic_train.csv' and 'titanic_test.csv' are in the correct directory.")
    print("If the filenames are different, please update the code accordingly.")

Titanic train data head:


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,...,Pclass_1,Pclass_2,Pclass_3,Master,Miss,Mr,Mrs,Officer,Royalty,Survived
0,22.0,7.25,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0.0
1,38.0,71.2833,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,1.0
2,26.0,7.925,0,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1.0
3,35.0,53.1,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,1.0
4,35.0,8.05,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0.0



Titanic test data head:


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,...,Family_Large,Pclass_1,Pclass_2,Pclass_3,Master,Miss,Mr,Mrs,Officer,Royalty
0,34.5,7.8292,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1,47.0,7.0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,62.0,9.6875,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,27.0,8.6625,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,22.0,12.2875,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [41]:
# Combine train and test data for consistent preprocessing
# We are dropping 'Survived' from train_df before concatenating
combined_df = pd.concat([train_df.drop('Survived', axis=1), test_df], ignore_index=True)

# The data is already preprocessed, so we will skip steps that handle missing values
# and feature engineering on original columns like Age, Fare, Embarked, SibSp, Parch, Name, Ticket, Cabin

# However, let's ensure categorical columns used for one-hot encoding are consistent
# based on the columns observed in the head of the preprocessed dataframes.
# We will keep the one-hot encoding step as a safety measure, but the columns listed
# might need adjustment based on how the preprocessed data was generated.
# Based on the column names in the head, it seems Embarked, Cabin, Pclass, Master, Miss, Mr, Mrs, Officer, Royalty
# are already one-hot encoded or derived.
# Let's re-evaluate which columns still need potential one-hot encoding based on the combined_df columns after loading.

# Let's see the columns in the combined_df after concatenation to decide which columns to one-hot encode if any
print("Columns in combined_df after concatenation:")
print(combined_df.columns)

# Based on the output of combined_df.columns and the original plan,
# it seems Sex, Embarked, Pclass, Cabin, and Title were intended for one-hot encoding.
# However, based on the preprocessed data head, Embarked, Pclass, Cabin and Title derived columns (like Master, Miss, Mr, Mrs, Officer, Royalty)
# seem to be already processed. Sex might be the only remaining original categorical column.

# Let's assume 'Sex' is the only remaining original categorical column that needs one-hot encoding.
# If there are other original categorical columns, they should be added to this list.
categorical_cols_to_encode = ['Sex'] # Adjust this list based on actual columns in combined_df

# Apply one-hot encoding if there are categorical columns to encode
if categorical_cols_to_encode:
    combined_df = pd.get_dummies(combined_df, columns=categorical_cols_to_encode, drop_first=True)
    print("\nColumns in combined_df after one-hot encoding:")
    print(combined_df.columns)


# The original drop list included columns that seem to be already handled in preprocessed data.
# We need to be careful not to drop columns that are now one-hot encoded features.
# Let's redefine the columns to drop based on columns that are not features needed for the model
# and were likely intended to be removed after feature engineering.
# Assuming 'PassengerId' is the only column left to drop that isn't a feature or target.
# If 'Name', 'Ticket', 'SibSp', 'Parch' still exist as original columns and are not needed, add them here.
columns_to_drop_after_preprocessing = ['PassengerId'] # Adjust based on actual columns in combined_df

# Drop unnecessary columns if they exist
columns_to_drop_existing = [col for col in columns_to_drop_after_preprocessing if col in combined_df.columns]
if columns_to_drop_existing:
    combined_df = combined_df.drop(columns_to_drop_existing, axis=1)
    print("\nColumns in combined_df after dropping unnecessary columns:")
    print(combined_df.columns)


# Separate the preprocessed train and test data
train_preprocessed_df = combined_df.iloc[:len(train_df)].copy()
test_preprocessed_df = combined_df.iloc[len(train_df):].copy()

# Add the 'Survived' column back to the training data
train_preprocessed_df['Survived'] = train_df['Survived'].values

print("\nPreprocessed Train Data Head:")
display(train_preprocessed_df.head())

print("\nPreprocessed Test Data Head:")
display(test_preprocessed_df.head())

Columns in combined_df after concatenation:
Index(['Age', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Cabin_A',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T', 'Cabin_U', 'Sex', 'FamilySize', 'Family_Single',
       'Family_Small', 'Family_Large', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Master', 'Miss', 'Mr', 'Mrs', 'Officer', 'Royalty'],
      dtype='object')

Columns in combined_df after one-hot encoding:
Index(['Age', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Cabin_A',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T', 'Cabin_U', 'FamilySize', 'Family_Single', 'Family_Small',
       'Family_Large', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Master', 'Miss',
       'Mr', 'Mrs', 'Officer', 'Royalty', 'Sex_1'],
      dtype='object')

Preprocessed Train Data Head:


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,...,Pclass_2,Pclass_3,Master,Miss,Mr,Mrs,Officer,Royalty,Sex_1,Survived
0,22.0,7.25,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,True,0.0
1,38.0,71.2833,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,False,1.0
2,26.0,7.925,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,False,1.0
3,35.0,53.1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,False,1.0
4,35.0,8.05,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,True,0.0



Preprocessed Test Data Head:


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,...,Pclass_1,Pclass_2,Pclass_3,Master,Miss,Mr,Mrs,Officer,Royalty,Sex_1
891,34.5,7.8292,0,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,True
892,47.0,7.0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,False
893,62.0,9.6875,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,True
894,27.0,8.6625,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,True
895,22.0,12.2875,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,False


In [42]:
# Separate features (X) and target (y) from the preprocessed training data
X_train = train_preprocessed_df.drop('Survived', axis=1)
y_train = train_preprocessed_df['Survived']

# The test data is already without the target variable
X_test = test_preprocessed_df.copy()

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)

print("\nX_train head:")
display(X_train.head())

print("\ny_train head:")
display(y_train.head())

X_train shape: (891, 28)
y_train shape: (891,)
X_test shape: (418, 28)

X_train head:


Unnamed: 0,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,...,Pclass_1,Pclass_2,Pclass_3,Master,Miss,Mr,Mrs,Officer,Royalty,Sex_1
0,22.0,7.25,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,True
1,38.0,71.2833,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,False
2,26.0,7.925,0,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,False
3,35.0,53.1,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,False
4,35.0,8.05,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,True



y_train head:


Unnamed: 0,Survived
0,0.0
1,1.0
2,1.0
3,1.0
4,0.0


In [43]:
# Create an instance of the DecisionTree class
# You can adjust the max_depth parameter if needed
decision_tree_model = DecisionTree(max_depth=5)

# Train the decision tree model
print("Training the Decision Tree model...")

# Print the data types of the training data columns for debugging
print("\nData types of train_preprocessed_df columns:")
print(train_preprocessed_df.dtypes)

# Explicitly convert boolean columns to integer type as a workaround
# This is to address the persistent RuntimeError even after modifying the class validation
for col in train_preprocessed_df.columns:
    if train_preprocessed_df[col].dtype == 'bool':
        train_preprocessed_df[col] = train_preprocessed_df[col].astype('int64')
        print(f"Converted column '{col}' from bool to int64")

# Print data types again after conversion for verification
print("\nData types of train_preprocessed_df columns after conversion:")
print(train_preprocessed_df.dtypes)


decision_tree_model.fit(data=train_preprocessed_df, target='Survived')

print("\nDecision Tree model training complete.")
# Note: The DecisionTree class as implemented currently prints depth information during training.

Training the Decision Tree model...

Data types of train_preprocessed_df columns:
Age              float64
Fare             float64
Embarked_C         int64
Embarked_Q         int64
Embarked_S         int64
Cabin_A            int64
Cabin_B            int64
Cabin_C            int64
Cabin_D            int64
Cabin_E            int64
Cabin_F            int64
Cabin_G            int64
Cabin_T            int64
Cabin_U            int64
FamilySize         int64
Family_Single      int64
Family_Small       int64
Family_Large       int64
Pclass_1           int64
Pclass_2           int64
Pclass_3           int64
Master             int64
Miss               int64
Mr                 int64
Mrs                int64
Officer            int64
Royalty            int64
Sex_1               bool
Survived         float64
dtype: object
Converted column 'Sex_1' from bool to int64

Data types of train_preprocessed_df columns after conversion:
Age              float64
Fare             float64
Embarked_C         int

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


processing at Depth: 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


processing at Depth: 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


processing at Depth: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


Stopping splitting as Max depth reached
Stopping splitting as Max depth reached
processing at Depth: 5
processing at Depth: 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


processing at Depth: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


Stopping splitting as Max depth reached
Stopping splitting as Max depth reached
processing at Depth: 5
Stopping splitting as Max depth reached
Stopping splitting as Max depth reached
processing at Depth: 3
processing at Depth: 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


processing at Depth: 4
processing at Depth: 5
processing at Depth: 5
Stopping splitting as Max depth reached
Stopping splitting as Max depth reached
processing at Depth: 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A va

processing at Depth: 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


processing at Depth: 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


processing at Depth: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


Stopping splitting as Max depth reached
Stopping splitting as Max depth reached
processing at Depth: 5
Stopping splitting as Max depth reached
Stopping splitting as Max depth reached
processing at Depth: 4
processing at Depth: 5
processing at Depth: 5
processing at Depth: 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A va

processing at Depth: 4
processing at Depth: 5
Stopping splitting as Max depth reached
Stopping splitting as Max depth reached
processing at Depth: 5
processing at Depth: 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A va

processing at Depth: 5
Stopping splitting as Max depth reached
Stopping splitting as Max depth reached
processing at Depth: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[self.target] = self.data[self.target].astype("category")


Stopping splitting as Max depth reached
Stopping splitting as Max depth reached

Decision Tree model training complete.


In [44]:
# Make predictions on the preprocessed test data
# Note: The current predict method returns overall probabilities, not per-row predictions.
# To get actual per-row predictions, the __flow_data_thru_tree method in the DecisionTree class needs to be fully implemented for tree traversal.
predictions = decision_tree_model.predict(data=X_test)

print("Predictions on test data:")
display(predictions)

Predictions on test data:


array([[0.94791667, 0.05208333],
       [0.59459459, 0.40540541],
       [0.94791667, 0.05208333],
       [0.88965517, 0.11034483],
       [0.59459459, 0.40540541],
       [0.88965517, 0.11034483],
       [0.59459459, 0.40540541],
       [1.        , 0.        ],
       [0.59459459, 0.40540541],
       [0.88965517, 0.11034483],
       [0.88965517, 0.11034483],
       [0.94791667, 0.05208333],
       [0.98076923, 0.01923077],
       [0.94791667, 0.05208333],
       [0.98076923, 0.01923077],
       [0.84210526, 0.15789474],
       [0.94791667, 0.05208333],
       [0.88965517, 0.11034483],
       [0.59459459, 0.40540541],
       [0.59459459, 0.40540541],
       [1.        , 0.        ],
       [0.59459459, 0.40540541],
       [0.98076923, 0.01923077],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.94791667, 0.05208333],
       [0.98076923, 0.01923077],
       [0.88965517, 0.11034483],
       [0.51219512, 0.48780488],
       [0.88965517, 0.11034483],
       [0.

#In this notebook, we implemented a custom Decision Tree class and applied it to the Titanic dataset.

1. We successfully loaded and preprocessed the train and test data, handling missing values and converting categorical features to a numerical format.

2. We then used our custom class to train a decision tree model on the preprocessed training data.

3. However, during the prediction phase, we observed that the output did not provide specific predictions for each individual in the test set.

4. This is because the predict method in our current Decision Tree implementation does not fully traverse the tree based on the input features of each test data point.

5. Instead, it returns the overall class distribution of the data within the nodes reached during the (incomplete) prediction process.

6. Therefore, the observed output represents the class proportions within various nodes of the trained tree, rather than true predictions for the test data.

7. To obtain actual predictions for each test case, the __flow_data_thru_tree method within the DecisionTree class would need to be fully implemented to perform a proper tree traversal based on the input features.