## Import dependencies and libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

## Loading the dataset

In [2]:
# Read the raw breast-cancer records into a DataFrame.
df = pd.read_csv("Breast_Cancer.csv")

# Per-column count of missing entries; shown as the cell's output.
df.isna().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
Differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Regional Node Positive    0
Survival Months           0
Status                    0
dtype: int64

## Encode the Categorical columns and represent them in Numerical values

In [3]:
# List the distinct values of every column except 'Age', to see which
# columns hold string categories and which hold numbers.
for col in df.columns:
    if col != 'Age':
        print(f"{col}: {df[col].unique()}")


Race: ['White' 'Black' 'Other']
Marital Status: ['Married' 'Divorced' 'Single ' 'Widowed' 'Separated']
T Stage : ['T1' 'T2' 'T3' 'T4']
N Stage: ['N1' 'N2' 'N3']
6th Stage: ['IIA' 'IIIA' 'IIIC' 'IIB' 'IIIB']
Differentiate: ['Poorly differentiated' 'Moderately differentiated' 'Well differentiated'
 'Undifferentiated']
Grade: ['3' '2' '1' ' anaplastic; Grade IV']
A Stage: ['Regional' 'Distant']
Tumor Size: [  4  35  63  18  41  20   8  30 103  32  13  59  15  19  46  24  25  29
  40  70  22  50  17  21  10  27  23   5  51   9  55 120  77   2  11  12
  26  75 130  34  80   3  60  14  16  45  36  76  38  49   7  72 100  43
  62  37  68  52  85  57  39  28  48 110  65   6 105 140  42  31  90 108
  98  47  54  61  74  33   1  87  81  58 117  44 123 133  95 107  92  69
  56  82  66  78  97  88  53  83 101  84 115  73 125 104  94  86  64  96
  79  67]
Estrogen Status: ['Positive' 'Negative']
Progesterone Status: ['Positive' 'Negative']
Regional Node Examined: [24 14  2  3 18 11  9 20 21 13 23 1

## Categorical columns
- Race: ['White' 'Black' 'Other'] <br>
- Marital Status: ['Married' 'Divorced' 'Single ' 'Widowed' 'Separated'] <br>
- T Stage : ['T1' 'T2' 'T3' 'T4'] <br>
- N Stage: ['N1' 'N2' 'N3'] <br>
- 6th Stage: ['IIA' 'IIIA' 'IIIC' 'IIB' 'IIIB'] <br>
- Differentiate: ['Poorly differentiated' 'Moderately differentiated' 'Well differentiated' 'Undifferentiated'] <br>
- Grade: ['3' '2' '1' ' anaplastic; Grade IV'] <br>
- A Stage: ['Regional' 'Distant'] <br>
- Estrogen Status: ['Positive' 'Negative'] <br>
- Progesterone Status: ['Positive' 'Negative'] <br>
- Status: ['Alive' 'Dead']

## Numerical Columns
- Age
- Tumor Size
- Regional Node Examined
- Regional Node Positive
- Survival Months

In [4]:
# Reload from disk so that re-running this cell always encodes the raw
# string values rather than integers produced by a previous run.
df = pd.read_csv('Breast_Cancer.csv')

# Some headers in the raw file carry stray whitespace (e.g. 'T Stage ').
df.columns = df.columns.str.strip()

# Every string-valued column. 'Grade' is included: its raw values are the
# strings '1', '2', '3' and ' anaplastic; Grade IV', so without encoding it
# would be the only feature left as text.
cat_cols = ['Race', 'Marital Status', 'T Stage', 'N Stage', '6th Stage',
            'Differentiate', 'Grade', 'A Stage', 'Estrogen Status',
            'Progesterone Status', 'Status']

# Keep each fitted encoder so integer codes can later be mapped back to the
# original labels with `label_encoders[col].inverse_transform(...)`.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head(10)


Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,Differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Regional Node Positive,Survival Months,Status
0,68,2,1,0,0,0,1,3,1,4,1,1,24,1,60,0
1,50,2,1,1,1,2,0,2,1,35,1,1,14,5,62,0
2,58,2,0,2,2,4,0,2,1,63,1,1,14,7,75,0
3,58,2,1,0,0,0,1,3,1,18,1,1,2,1,84,0
4,47,2,1,1,0,1,1,3,1,41,1,1,3,1,50,0
5,51,2,3,0,0,0,0,2,1,20,1,1,18,2,89,0
6,51,2,1,0,0,0,3,1,1,8,1,1,11,1,54,0
7,40,2,1,1,0,1,0,2,1,30,1,1,9,1,14,1
8,40,2,0,3,2,4,1,3,1,103,1,1,20,18,70,0
9,69,2,1,3,2,4,3,1,0,32,1,1,21,12,92,0


### Split dataset into FEATURES and TARGETS

In [5]:
# Target: the 'Status' column. Features: every remaining column.
y = df['Status']
X = df.drop('Status', axis='columns')

## Calculation of gini impurity for Categorical features

In [10]:
def gini_impurity(column, target):
    """Weighted Gini impurity of `target` after splitting on `column`.

    The rows are partitioned by each distinct value found in `column`. For
    every partition the Gini impurity of its `target` labels
    (1 - sum of squared class proportions) is weighted by the share of rows
    that fall into the partition, and the weighted values are summed.

    Parameters
    ----------
    column : pandas.Series
        Feature values; each distinct value defines one partition.
    target : pandas.Series
        Class labels, indexed consistently with `column`.

    Returns
    -------
    The summed weighted impurity; the integer 0 when `column` has no values.
    """
    n_total = len(target)

    def _weighted_branch_impurity(level):
        # Labels of the rows whose feature value equals `level`.
        branch = target[column == level]
        shares = branch.value_counts(normalize=True)
        return len(branch) / n_total * (1 - sum(shares ** 2))

    return sum(_weighted_branch_impurity(level) for level in column.unique())

## Traversing through all feature columns and calculating their Gini impurity

In [11]:
# Score every feature column by the weighted Gini impurity of splitting on it.
gini_scores = {name: gini_impurity(X[name], y) for name in X.columns}

# Rank the features from lowest impurity to highest.
sorted_gini_scores = sorted(gini_scores.items(), key=lambda pair: pair[1])

# Report the full ranking.
print("Gini impurity for each feature (sorted in ascending order):")
for feature, gini in sorted_gini_scores:
    print(f"{feature}: {gini:.3f}")

# The first entry of the ranking is the split chosen for the ROOT NODE.
best_feature = sorted_gini_scores[0][0]
print(f"\nBest feature for the ROOT NODE: {best_feature}")


Gini impurity for each feature (sorted in ascending order):
Survival Months: 0.159
Regional Node Positive: 0.237
6th Stage: 0.241
N Stage: 0.242
Tumor Size: 0.247
Estrogen Status: 0.250
Progesterone Status: 0.251
Differentiate: 0.252
Grade: 0.252
T Stage: 0.253
Age: 0.254
Regional Node Examined: 0.255
A Stage: 0.257
Marital Status: 0.257
Race: 0.257

Best feature for the ROOT NODE: Survival Months


### DecisionTree = This class is designed to implement a custom decision tree algorithm <br>
### build_tree = Constructs the decision tree recursively <br>
### fit = Used to train the decision tree model. <br>
### predict_single = Predicts the label for a single row by walking the fitted tree from the root to a leaf. <br>
### predict = Applies the predict_single method across all rows of the dataset using the trained decision tree model

In [8]:
class DecisionTree:
    """Multi-way decision tree that branches on exact feature values.

    At every node the feature with the lowest weighted Gini impurity (as
    returned by ``gini_impurity``) is selected, one branch is created for each
    distinct value of that feature, and the feature is removed from the
    columns available further down that branch.

    Attributes
    ----------
    tree
        ``None`` until ``fit`` is called. Afterwards either a bare class label
        (when the training labels were all identical) or nested dicts of the
        form ``{feature: {value: subtree_or_label}}``.
    default_class
        Most frequent label in the training target passed to ``fit``; returned
        by ``predict_single`` when a row carries a feature value for which the
        tree has no branch. ``None`` before ``fit`` or if the target was empty.
    """

    def __init__(self, features=None, target=None):
        # `features` and `target` are accepted for backward compatibility but
        # are not used; training data is supplied through `fit`.
        self.tree = None
        self.default_class = None

    def build_tree(self, X, y):
        """Recursively build the (sub)tree for feature frame `X` and labels `y`.

        Returns a class label when `y` holds a single class or when no feature
        columns remain (then the most frequent label); otherwise a dict
        ``{best_feature: {value: subtree}}``.
        """
        # Pure node: every remaining row has the same label.
        if len(y.unique()) == 1:
            return y.unique()[0]
        # No features left to split on: fall back to the majority label.
        if X.shape[1] == 0:
            return y.mode()[0]

        gini_scores = {
            feature: gini_impurity(X[feature], y) for feature in X.columns
        }
        best_feature = min(gini_scores, key=gini_scores.get)

        branches = {}
        for val in X[best_feature].unique():
            mask = X[best_feature] == val
            # The chosen feature is dropped so it is not reused below this node.
            subset_X = X[mask].drop(columns=[best_feature])
            subset_y = y[mask]
            branches[val] = self.build_tree(subset_X, subset_y)

        return {best_feature: branches}

    def fit(self, X, y):
        """Train the tree on features `X` and labels `y`.

        Also records the majority training label as the fallback prediction,
        so that prediction never depends on variables outside this object.
        """
        modes = y.mode()
        self.default_class = modes.iloc[0] if len(modes) else None
        self.tree = self.build_tree(X, y)

    def predict_single(self, row, tree):
        """Predict the label of one `row` by descending `tree`.

        `tree` is either a leaf label or a ``{feature: {value: subtree}}``
        dict. If the row's value for the node's feature has no branch, the
        majority training label stored by `fit` is returned.
        """
        # Anything that is not a dict is a leaf label.
        if not isinstance(tree, dict):
            return tree

        feature = next(iter(tree))
        branches = tree[feature]
        feature_value = row[feature]

        if feature_value in branches:
            return self.predict_single(row, branches[feature_value])

        # Value not seen on this path during training.
        return self.default_class

    def predict(self, X):
        """Return the predicted label for every row of `X`."""
        return X.apply(lambda row: self.predict_single(row, self.tree), axis=1)

## Splitting the dataset into Training and Testing
## Making predictions and calculating accuracy based on the Testing dataset

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Grow the tree on the training partition only.
custom_tree = DecisionTree()
custom_tree.fit(X_train, y_train)

# Score the held-out rows against their true labels.
y_pred = custom_tree.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy of the custom decision tree model: {accuracy * 100:.2f}%")

Accuracy of the custom decision tree model: 83.73%


## Accuracy = 83.73%