In [None]:
from src.data_preprocessor import DataProcessor
from data_configs.configs import *
from models.decision_tree import DecisionTree
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation

# Dataset configuration shared by every later cell. `breast_cancer_config` is not
# defined in this notebook; presumably it arrives through the wildcard import
# from `data_configs.configs` — confirm.
config = breast_cancer_config
# Both helpers are built from the same configuration object.
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)

### Data Preprocessing ###

In [None]:
# Load the dataset through the configured processor.
raw_data = data_processor.load_data()

# Fill in missing values. The imputation strategy lives inside DataProcessor
# and is not visible from this notebook.
data_1 = data_processor.impute_missing_values(raw_data)

# Drop the 'Sample code number' column before any splitting or modelling
# (presumably a record identifier rather than a predictive feature — confirm).
data_2 = data_1.drop(columns=['Sample code number'])

In [None]:
# Display the preprocessed frame (after imputation, with 'Sample code number' removed).
data_2

### Example Splitting Calculations ###

In [20]:
# Split the preprocessed data into two partitions. `data_train` feeds the example
# calculations and the cross-validation loop below; `data_val` is passed to
# `prune()` in that loop. random_state=42 is presumably the seed that makes the
# split reproducible — confirm against CrossValidation.random_partition.
data_train, data_val = cross_validator.random_partition(data_2, random_state=42)

In [21]:
# Tree instance for the example calculations in this section.
decision_tree = DecisionTree(config, data_2)

# Name of the label column, read once from the dataset configuration.
target_col = config['target_column']

# Separate the training partition into the label column and the remaining predictors.
labels = data_train[target_col]
features = data_train.drop(columns=target_col)

# NOTE(review): entropy is computed on the feature frame here, whereas the
# gain-ratio calls below take `labels` first. Confirm whether the entropy of
# `labels` was the intended demonstration.
decision_tree.calc_entropy(features)

7.955941732755594

In [22]:
# Gain ratio per feature on the training partition. The recorded output is a dict
# keyed by feature name, each entry holding a 'gain_ratio' and a 'threshold'.
decision_tree.calc_gain_ratio(labels,features)

{'Clump Thickness': {'gain_ratio': 0.4858306701440357, 'threshold': 6.5},
 'Uniformity of Cell Size': {'gain_ratio': 0.6152259553720826,
  'threshold': 3.5},
 'Uniformity of Cell Shape': {'gain_ratio': 0.6030369782062516,
  'threshold': 3.5},
 'Marginal Adhesion': {'gain_ratio': 0.4174219176453562, 'threshold': 3.5},
 'Single Epithelial Cell Size': {'gain_ratio': 0.5043631591386675,
  'threshold': 2.5},
 'Bare Nuclei': {'gain_ratio': 0.5676568165704948, 'threshold': 5.5},
 'Bland Chromatin': {'gain_ratio': 0.5539295244737114, 'threshold': 3.5},
 'Normal Nucleoli': {'gain_ratio': 0.5224618816828783, 'threshold': 2.5},
 'Mitoses': {'gain_ratio': 0.29726114483048427, 'threshold': 1.5}}

In [23]:
# Choose the split feature. The recorded output is a (feature, gain_ratio, threshold)
# tuple matching the highest gain ratio in the per-feature table shown above.
decision_tree.select_feature_gain_ratio(labels,features)

('Uniformity of Cell Size', 0.6152259553720826, 3.5)

### Cross Validation Example ###

In [24]:
from copy import deepcopy

# Name of the label column, read once instead of four times per fold.
target_col = config['target_column']

# 2-split, single-repeat cross-validation over the training partition, with
# stratify=True and a fixed random_state. The fold index was never used, so the
# folds are iterated directly rather than through enumerate().
for train_set, test_set in cross_validator.cross_validation(
    data_train, n_splits=2, n_repeats=1, random_state=42, stratify=True
):
    # Separate predictors from the label for both halves of the fold.
    train_data = train_set.drop(columns=target_col)
    train_target = train_set[target_col]
    test_features = test_set.drop(columns=target_col)
    test_true_vals = test_set[target_col]

    # Decision Tree Model for classification: build an unpruned tree on this
    # fold's training rows and score it on the fold's held-out rows.
    decision_tree = DecisionTree(config, data_2)
    decision_tree.root = decision_tree.build_classification_tree(train_data, train_target)
    predictions = decision_tree.predict(test_features)

    score = Evaluation().zero_one_loss(test_true_vals, predictions)
    print("DT Score:", score)

    # Pruning the Decision Tree: work on a deep copy so the unpruned tree stays
    # available for comparison. Pruning uses data_val (the partition set aside
    # by random_partition), not the fold's held-out rows.
    pruned_decision_tree = deepcopy(decision_tree)
    pruned_decision_tree.prune(pruned_decision_tree.root, data_val)
    pruned_predictions = pruned_decision_tree.predict(test_features)

    pruned_score = Evaluation().zero_one_loss(test_true_vals, pruned_predictions)
    print("Pruned DT Score:", pruned_score)

# After the loop, train_set, test_set, test_features, decision_tree and
# pruned_decision_tree still refer to the final fold; later cells inspect them.



DT Score: 0.075
Pruned DT Score: 0.08571428571428572
DT Score: 0.06810035842293907
Pruned DT Score: 0.03942652329749104


In [25]:
# `test_set` is the loop variable left bound by the final iteration of the
# cross-validation loop, so this shows the last fold's held-out rows.
test_set

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
51,5,3,3,4,2,4.000000,3,4,1,4
544,2,1,3,2,2,1.000000,2,1,1,2
302,10,10,10,7,9,10.000000,7,10,10,4
235,3,1,4,1,2,3.544656,3,1,1,2
18,10,7,7,6,4,10.000000,4,1,2,4
...,...,...,...,...,...,...,...,...,...,...
214,10,10,10,10,3,10.000000,10,6,1,4
466,10,6,6,2,4,10.000000,9,7,1,4
121,4,2,1,1,2,2.000000,3,1,1,2
614,2,1,1,1,1,1.000000,2,1,1,2


In [26]:
# `train_set` is likewise left over from the final iteration of the
# cross-validation loop: the last fold's training rows.
train_set

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
82,5,2,1,1,2,1.0,3,1,1,2
220,1,1,1,2,2,1.0,3,1,1,2
559,5,1,1,1,2,1.0,2,1,1,2
552,3,2,2,2,2,1.0,4,2,1,2
215,8,7,8,7,5,5.0,5,10,2,4
...,...,...,...,...,...,...,...,...,...,...
20,7,3,2,10,5,10.0,5,4,4,4
71,6,10,2,8,10,2.0,7,8,10,4
106,10,10,10,8,2,10.0,4,1,1,4
435,10,8,10,1,3,10.0,5,1,1,4


In [27]:
# Print the unpruned tree. `decision_tree` was rebound inside the cross-validation
# loop, so this is the tree built on the last fold, not the example instance
# created in the splitting-calculations section.
decision_tree.print_tree()

Root - Decision: Uniformity of Cell Size <= 3.5
  Uniformity of Cell Size <=  3.5 - Decision: Bare Nuclei <= 5.5
    Bare Nuclei <=  5.5 - Decision: Clump Thickness <= 9.0
      Clump Thickness <=  9.0 - Decision: Normal Nucleoli <= 3.5
        Normal Nucleoli <=  3.5 - Decision: Bare Nuclei <= 3.772327964860908
          Bare Nuclei <=  3.772327964860908 - Leaf, value: 2
          Bare Nuclei >  3.772327964860908 - Decision: Clump Thickness <= 5.5
            Clump Thickness <=  5.5 - Decision: Single Epithelial Cell Size <= 1.5
              Single Epithelial Cell Size <=  1.5 - Leaf, value: 4
              Single Epithelial Cell Size >  1.5 - Leaf, value: 2
            Clump Thickness >  5.5 - Leaf, value: 4
        Normal Nucleoli >  3.5 - Decision: Clump Thickness <= 4.5
          Clump Thickness <=  4.5 - Leaf, value: 2
          Clump Thickness >  4.5 - Leaf, value: 4
      Clump Thickness >  9.0 - Leaf, value: 4
    Bare Nuclei >  5.5 - Decision: Marginal Adhesion <= 4.5
      

In [28]:
# Print the pruned copy from the last fold, for comparison with the unpruned tree.
pruned_decision_tree.print_tree()

Root - Decision: Uniformity of Cell Size <= 3.5
  Uniformity of Cell Size <=  3.5 - Decision: Bare Nuclei <= 5.5
    Bare Nuclei <=  5.5 - Decision: Clump Thickness <= 9.0
      Clump Thickness <=  9.0 - Decision: Normal Nucleoli <= 3.5
        Normal Nucleoli <=  3.5 - Leaf, value: 2
        Normal Nucleoli >  3.5 - Decision: Clump Thickness <= 4.5
          Clump Thickness <=  4.5 - Leaf, value: 2
          Clump Thickness >  4.5 - Leaf, value: 4
      Clump Thickness >  9.0 - Leaf, value: 4
    Bare Nuclei >  5.5 - Leaf, value: 4
  Uniformity of Cell Size >  3.5 - Decision: Uniformity of Cell Size <= 4.5
    Uniformity of Cell Size <=  4.5 - Decision: Single Epithelial Cell Size <= 6.5
      Single Epithelial Cell Size <=  6.5 - Leaf, value: 4
      Single Epithelial Cell Size >  6.5 - Leaf, value: 2
    Uniformity of Cell Size >  4.5 - Leaf, value: 4


### Tree Traversal ###

In [29]:
import pandas as pd

# Take the row at position 5 of the last fold's test features as a Series, then
# turn it back into a one-row DataFrame. The round trip through a Series gives
# all columns one common dtype, which is why the displayed values are all floats.
test_instance = test_features.iloc[5].to_frame().T

In [30]:
# Display the single-row frame that is traced through the tree below.
test_instance

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
264,7.0,9.0,4.0,10.0,10.0,3.0,5.0,3.0,3.0


In [31]:
# Trace the prediction path for the single-row frame through the unpruned tree
# from the last fold. Per the recorded output, the call prints each decision and
# also returns a Series, which the notebook renders after the trace.
decision_tree.predict_verbose(test_instance)

Decision node at depth 0: Uniformity of Cell Size <= 3.5?
No, proceed to right child...
  Decision node at depth 1: Uniformity of Cell Size <= 4.5?
  No, proceed to right child...
    Decision node at depth 2: Bare Nuclei <= 8.5?
    Yes, proceed to left child...
      Decision node at depth 3: Clump Thickness <= 6.5?
      No, proceed to right child...
        Reached leaf node with prediction: 4




264    Decision node at depth 0: Uniformity of Cell S...
dtype: object