In [48]:
from src.data_preprocessor import DataProcessor
from data_configs.configs import *
from models.decision_tree import DecisionTree
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation

config = forest_fires_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)

### Data Preprocessing ###

In [49]:
raw_data = data_processor.load_data()

data_1 = data_processor.impute_missing_values(raw_data)


In [50]:
data_1

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


### Example Splitting Calculations ###

In [51]:
data_train, data_val = cross_validator.random_partition(data_1, random_state=42)

In [52]:
decision_tree = DecisionTree(config,data_1)
features = data_train.drop(columns=config['target_column'])
labels = data_train[config['target_column']]

best_mse, best_threshold = decision_tree.calculate_mse(data_1,labels,'FFMC')

print("Best MSE:",best_mse, " Best Threshold:", best_threshold)

Best MSE: 2052.7785175576546  Best Threshold: 94.69999999999999


In [53]:
decision_tree.select_feature_mse(features,labels)

('temp', 2024.2787823356155, 26.049999999999997)

### Cross Validation Example ###

In [55]:
from copy import deepcopy

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=1, random_state=42, stratify=False)):
    train_data = train_set.drop(columns=config['target_column'])
    train_target = train_set[config['target_column']]
    test_features = test_set.drop(columns=config['target_column'])
    test_true_vals = test_set[config['target_column']]

    # Decision Tree Model for classification
    decision_tree = DecisionTree(config, data_1)
    decision_tree.root = decision_tree.build_regression_tree(train_data, train_target)
    predictions = decision_tree.predict(test_features)

    score = Evaluation().zero_one_loss(test_true_vals, predictions)
    print("DT Score:", score)

    # Pruning the Decision Tree
    # Create a deep copy of the decision tree for pruning
    pruned_decision_tree = deepcopy(decision_tree)
    pruned_decision_tree.prune(pruned_decision_tree.root, data_val)  
    pruned_predictions = pruned_decision_tree.predict(test_features)

    pruned_score = Evaluation().zero_one_loss(test_true_vals, pruned_predictions)
    print("Pruned DT Score:", pruned_score)



DT Score: 0.9178743961352657
Pruned DT Score: 0.6714975845410628
DT Score: 0.9077669902912622
Pruned DT Score: 0.6844660194174758


In [56]:
test_set

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
173,4,4,sep,mon,90.9,126.5,686.5,7.0,17.7,39,2.2,0.0,3.07
272,2,5,aug,tue,92.1,152.6,658.2,14.3,20.2,47,4.0,0.0,3.09
182,5,4,feb,sun,86.8,15.6,48.3,3.9,12.4,53,2.2,0.0,6.38
393,2,4,mar,tue,93.4,15.0,25.6,11.4,15.2,19,7.6,0.0,0.00
468,6,5,mar,thu,91.3,20.6,43.5,8.5,13.3,27,3.6,0.0,6.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,6,3,nov,tue,79.5,3.0,106.7,1.1,11.8,31,4.5,0.0,0.00
20,6,4,sep,tue,91.0,129.5,692.6,7.0,18.3,40,2.7,0.0,0.00
270,2,2,aug,tue,92.1,152.6,658.2,14.3,21.8,56,3.1,0.0,0.52
435,2,5,jul,sat,90.8,84.7,376.6,5.6,23.8,51,1.8,0.0,0.00


In [57]:
train_set

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
329,4,3,sep,sat,92.2,102.3,751.5,8.4,23.5,27,4.0,0.0,3.33
497,3,4,aug,tue,96.1,181.1,671.2,14.3,32.3,27,2.2,0.0,14.68
268,3,4,aug,tue,92.1,152.6,658.2,14.3,21.0,32,3.1,0.0,0.00
352,7,4,sep,fri,92.1,99.0,745.3,9.6,20.6,43,3.6,0.0,2.03
117,3,4,mar,sat,91.7,35.8,80.8,7.8,15.2,27,4.9,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,4,4,mar,sat,91.7,35.8,80.8,7.8,17.0,27,4.9,0.0,28.66
121,3,4,aug,mon,91.5,145.4,608.2,10.7,17.1,43,5.4,0.0,0.00
188,6,4,mar,sat,90.8,41.9,89.4,7.9,13.3,42,0.9,0.0,7.40
71,4,5,sep,fri,94.3,85.1,692.3,15.9,17.7,37,3.6,0.0,0.00


In [58]:
decision_tree.print_tree()

Root - Decision: DMC <= 103.1
  DMC <=  103.1 - Decision: temp <= 4.85
    temp <=  4.85 - Leaf, value: 16.55
    temp >  4.85 - Decision: Y <= 4.5
      Y <=  4.5 - Decision: X <= 3.5
        X <=  3.5 - Decision: DC <= 747.9
          DC <=  747.9 - Decision: ISI <= 13.8
            ISI <=  13.8 - Decision: RH <= 40.5
              RH <=  40.5 - Decision: DMC <= 7.95
                DMC <=  7.95 - Leaf, value: 1.1
                DMC >  7.95 - Leaf, value: 0.04583333333333334
              RH >  40.5 - Decision: temp <= 11.95
                temp <=  11.95 - Leaf, value: 5.55
                temp >  11.95 - Decision: FFMC <= 89.85
                  FFMC <=  89.85 - Leaf, value: 0.0
                  FFMC >  89.85 - Decision: DC <= 732.75
                    DC <=  732.75 - Decision: DMC <= 92.30000000000001
                      DMC <=  92.30000000000001 - Leaf, value: 1.7799999999999998
                      DMC >  92.30000000000001 - Leaf, value: 3.5
                    DC >  732.7

In [59]:
pruned_decision_tree.print_tree()

Root - Decision: DMC <= 103.1
  DMC <=  103.1 - Decision: temp <= 4.85
    temp <=  4.85 - Leaf, value: 16.55
    temp >  4.85 - Decision: Y <= 4.5
      Y <=  4.5 - Decision: X <= 3.5
        X <=  3.5 - Leaf, value: 0.0
        X >  3.5 - Decision: DMC <= 49.3
          DMC <=  49.3 - Leaf, value: 0.0
          DMC >  49.3 - Decision: FFMC <= 84.6
            FFMC <=  84.6 - Leaf, value: 11.16
            FFMC >  84.6 - Decision: ISI <= 9.399999999999999
              ISI <=  9.399999999999999 - Decision: RH <= 31.5
                RH <=  31.5 - Leaf, value: 3.78
                RH >  31.5 - Leaf, value: 0.058571428571428566
              ISI >  9.399999999999999 - Decision: temp <= 19.950000000000003
                temp <=  19.950000000000003 - Leaf, value: 7.52
                temp >  19.950000000000003 - Decision: X <= 5.5
                  X <=  5.5 - Leaf, value: 0.79
                  X >  5.5 - Leaf, value: 1.8299999999999998
      Y >  4.5 - Decision: temp <= 5.15
        te

### Tree Traversal ###

In [63]:
import pandas as pd
test_instance = pd.DataFrame(test_features.iloc[5],index=None).T

In [64]:
test_instance

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain
356,4,4,sep,fri,92.1,99.0,745.3,9.6,20.8,35,4.9,0.0


In [65]:
decision_tree.predict_verbose(test_instance)

Decision node at depth 0: DMC <= 103.1?
Yes, proceed to left child...
  Decision node at depth 1: temp <= 4.85?
  No, proceed to right child...
    Decision node at depth 2: Y <= 4.5?
    Yes, proceed to left child...
      Decision node at depth 3: X <= 3.5?
      No, proceed to right child...
        Decision node at depth 4: DMC <= 49.3?
        No, proceed to right child...
          Decision node at depth 5: FFMC <= 84.6?
          No, proceed to right child...
            Decision node at depth 6: ISI <= 9.399999999999999?
            No, proceed to right child...
              Decision node at depth 7: temp <= 19.950000000000003?
              No, proceed to right child...
                Decision node at depth 8: X <= 5.5?
                Yes, proceed to left child...
                  Reached leaf node with prediction: 0.79




356    Decision node at depth 0: DMC <= 103.1?\nYes, ...
dtype: object