In [None]:
'''
# File: custom_decisiontree.ipynb
# Author: Ryan Tso
# Created: Thur Jan 8, 2026
# Last Updated: Jan 12, 2026

Description: Building a custom decision tree in Python so that morphological features are considered
first, followed by IHC features, for the classification of RCC subtypes
    Approach - 2 stage gated model
'''

'\n# File: custom_decisiontree.ipynb\n# Author: Ryan Tso\n# Created: Thur Jan 8, 2026\n# Last Updated: Jan 8, 2026\n\nDescription: Building custom decision tree python script so I can prioritize the consideration of\nmorphological features first then IHC features for the classification of RCC subtypes\n    Approach - 2 stage gated model\n'

In [7]:
import numpy as np

In [None]:
class DecisionTree():
    '''
    Decision Tree Classifier
        Training : use "train" function with train set features and labels
        Predicting : Use "predict" function with test set features

    Status: only the entropy helpers (_class_probabilities, _entropy,
    _data_entropy) are implemented. Every other method is still a
    placeholder that returns None.
    '''

    def __init__(self, max_depth=4, min_samples_leaf=1,
                min_information_gain=0.0, numb_of_features_splitting=None,
                verbose=False):
        '''
        Store the hyperparameters on the new instance.

        The four tree hyperparameters are saved as-is; none of the methods
        implemented so far reads them yet.

        verbose : when True, the entropy helpers print their intermediate
                  values (sample count, labels, counts, probabilities and
                  running entropy). Defaults to False so repeated calls
                  do not flood the notebook output.
        '''
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.min_information_gain = min_information_gain
        self.numb_of_features_splitting = numb_of_features_splitting
        self.verbose = verbose

    def _log(self, message: str) -> None:
        '''
        Print a debug message only when verbose mode is switched on.
        '''
        # getattr keeps this safe on instances created without __init__
        if getattr(self, "verbose", False):
            print(message)

    def _class_probabilities(self, labels: list) -> np.ndarray:
        '''
        Calculate class / predicting label probability
        Helps with calculating entropy and information gain
            Ex: labels = [0,0,1,1,1]
            probability = [0.4,0.6] --> 40% class 0, 60% class 1

        Returns a numpy array with one probability per unique label, in the
        sorted label order produced by np.unique. Empty input gives an
        empty array.
        '''
        labels = np.asarray(labels)
        # np.unique counts over the flattened array, so the denominator must
        # be the total element count rather than the length of the first axis.
        total_count = labels.size

        self._log(f"Total number of Samples: {total_count}")

        if total_count == 0:
            # No samples -> no classes; avoids dividing by zero below.
            return np.array([], dtype=float)

        unique_labels, counts = np.unique(labels, return_counts=True)

        self._log(f"Unique Labels: {unique_labels}")
        self._log(f"Counts for each Label: {counts}")

        class_probability = counts / total_count

        self._log(f"Probability of Each Class: {class_probability}")
        return class_probability

    def _entropy(self, class_probabilities: list) -> float:
        '''
        Calculate Shannon entropy (in bits) of a probability distribution.

        class_probabilities : iterable of probabilities, one per class.
        Returns the sum of -p * log2(p) over all p > 0 as a Python float;
        an empty input yields 0.0.
        '''
        entropy = 0.0                       # accumulator variable

        for p in class_probabilities:
            if p > 0:                               # avoids log2(0) -> undefined
                contribution = -p * np.log2(p)      # Shannon Entropy
                self._log(f"Contribution: {contribution}")
                entropy += contribution
                self._log(f"Entropy: {entropy}")

        self._log(f"Final Entropy: {entropy}")

        return float(entropy)

    def _data_entropy(self, labels: list) -> float:
        '''
        Entropy calculated from raw labels using the _class_probabilities function
        Combines the preprocessing step to calculate probabilities and entropy calculation
        '''
        return self._entropy(self._class_probabilities(labels))

    # ------------------------------------------------------------------
    # Placeholders: everything below is not implemented yet and returns None.
    # ------------------------------------------------------------------

    def _information_gain(self, X, y, thresh):
        # TODO: not implemented
        return None

    def _partition_entropy(self, subsets: list) -> float:
        # TODO: not implemented
        return None

    def _split(self, data: np.ndarray, feature_idx: int, feature_val: float) -> tuple:
        # TODO: not implemented
        return None

    def _select_features_to_use(self, data: np.ndarray) -> list:
        # TODO: not implemented
        return None

    def _find_best_split(self, data: np.ndarray) -> tuple:
        # TODO: not implemented
        return None

    def _find_label_probs(self, data: np.ndarray) -> np.ndarray:
        # TODO: not implemented
        return None

    # TreeNode is not defined or imported in this notebook yet, so it is
    # written as a quoted forward reference; a bare name here would raise
    # NameError as soon as this class definition is executed.
    def _create_tree(self, data: np.ndarray, current_depth: int) -> "TreeNode":
        # TODO: not implemented
        return None

    def _predict_one_sample(self, X: np.ndarray) -> np.ndarray:
        # TODO: not implemented
        return None

    def train(self, X_train: np.ndarray, Y_train: np.ndarray) -> None:
        # TODO: not implemented
        return None

    def predict_proba(self, X_set: np.ndarray) -> np.ndarray:
        # TODO: not implemented
        return None

    def predict(self, X_set: np.ndarray) -> np.ndarray:
        # TODO: not implemented
        return None

    # Tree Node requires an external import - see if we can replace...
    def _print_recursive(self, node: "TreeNode", level=0) -> None:
        # TODO: not implemented
        return None

    def print_tree(self) -> None:
        # TODO: not implemented
        return None

    def _calculate_feature_importance(self, node):
        # TODO: not implemented
        return None


In [None]:
# Scratch cell: prototype of the class-probability and Shannon-entropy steps
# on a toy label list, before the logic is moved into the main class.
import numpy as np

# Toy labels as a numpy array so the counting below is vectorized.
labels = np.array([0,0,1,1,1])
total_count = len(labels)

print(f"Total number of Samples: {total_count}")

# Distinct labels (sorted) together with how often each one occurs.
unique_labels, counts = np.unique(labels, return_counts=True)

print(f"Unique Labels: {unique_labels}")
print(f"Counts for each Label: {counts}")

# Relative frequency of each class.
class_probability = counts / total_count

print(f"Probability of Each Class: {class_probability}")

# --- Shannon entropy -------------------------------------------------
# Running total; each class with non-zero probability adds -p * log2(p).
entropy = 0

for probability in class_probability:
    if not probability > 0:
        # log2(0) is undefined, so zero-probability classes are skipped.
        continue
    p = probability
    contribution = -p * np.log2(p)
    print(f"Contribution: {contribution}")
    entropy += contribution
    print(f"Entropy: {entropy}")

# Compact equivalent of the loop above, without the step-by-step prints:
# entropy = sum(-p * np.log2(p) for p in class_probability if p > 0)

print(f"Final Entropy: {entropy}")





Total number of Samples: 5
Unique Labels: [0 1]
Counts for each Label: [2 3]
Probability of Each Class: [0.4 0.6]
Contribution: 0.5287712379549449
Entropy: 0.5287712379549449
Contribution: 0.44217935649972373
Entropy: 0.9709505944546686
Final Entropy: 0.9709505944546686
