In [1]:
from src.data_preprocessor import DataProcessor
from data_configs.configs import *
from models.decision_tree import DecisionTree, DecisionTreeNode
from models.null_model import NullModelClassification, NullModelRegression
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
import numpy as np

# Dataset-specific setup: the helpers below are all built from the forest-fires config.
config = forest_fires_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
# This notebook scores the dataset with regression metrics (MSE/MAE) and its
# evaluation cell builds a NullModelRegression baseline, so the module-level
# default is the regression null model rather than the classification one.
null_model = NullModelRegression(config=config)

### Data Load and Preprocessing ###

In [2]:
# Load the dataset through the DataProcessor configured in the setup cell.
raw_data = data_processor.load_data()

# Run the processor's missing-value imputation step on the raw data; the
# imputation strategy lives in DataProcessor and is not visible in this notebook.
# data_1 is the frame every later cell works from.
data_1 = data_processor.impute_missing_values(raw_data)

In [3]:
# Display summary statistics (count, mean, std, quartiles, min/max) of the imputed data.
data_1.describe()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


In [4]:
# Partition the imputed data into two frames, with random_state fixed at 42.
# data_train is what the cross-validation loop below iterates over; data_val is
# passed to DecisionTree.prune as the pruning set.
# NOTE(review): the split proportions are decided inside random_partition — confirm there.
data_train, data_val = cross_validator.random_partition(data_1, random_state=42)

## Decision Tree Performance ##

In [5]:
# Repeated cross-validation on the training partition, comparing three models
# per fold: an unpruned regression tree, the same tree after pruning against
# data_val, and a null-model baseline. Per-fold scores are collected and then
# averaged per metric.


def _average_scores(scores_by_metric):
    """Average each metric's per-fold scores, ignoring folds where it is NaN.

    np.mean propagates NaN, so a single fold with an undefined score (e.g.
    Pearson correlation — presumably when a pruned tree predicts a constant;
    TODO confirm) would turn the whole average into NaN. Averaging only the
    defined folds keeps the summary informative.

    Args:
        scores_by_metric: dict mapping a metric name to a list of per-fold scores.

    Returns:
        dict mapping each metric name to the mean of its non-NaN fold scores,
        or NaN when no fold produced a usable value.
    """
    averaged = {}
    for metric, values in scores_by_metric.items():
        fold_values = np.asarray(values, dtype=float)
        usable = fold_values[~np.isnan(fold_values)]
        averaged[metric] = usable.mean() if usable.size else float('nan')
    return averaged


def _print_scores(title, averaged_scores):
    """Print a title line followed by one `metric: value` line per metric."""
    print(title)
    for metric, avg_score in averaged_scores.items():
        print(f"{metric}: {avg_score}")


# Per-fold score lists for the decision tree, the pruned decision tree, and the null model
dt_scores = {'mse': [], 'mae': [], 'r2': [], 'pearson_correlation': []}
pruned_dt_scores = {'mse': [], 'mae': [], 'r2': [], 'pearson_correlation': []}
null_model_scores = {'mse': [], 'mae': []}  # R2 and Pearson are not tracked for the null model

target_column = config['target_column']

# n_splits=2 with n_repeats=5 and a fixed random_state; stratification is disabled.
for train_set, test_set in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False):
    train_data = train_set.drop(columns=target_column)
    train_target = train_set[target_column]
    test_features = test_set.drop(columns=target_column)
    test_true_vals = test_set[target_column]

    # Decision Tree Model
    # NOTE(review): the tree is constructed with the full imputed dataset (data_1),
    # not only this fold's training rows — confirm DecisionTree uses it for
    # metadata only, otherwise test/validation rows leak into the model.
    decision_tree = DecisionTree(config, data_1)
    decision_tree.root = decision_tree.build_regression_tree(train_data, train_target)
    predictions = decision_tree.predict(test_features)

    # Calculate and store decision tree scores
    scores = Evaluation.calculate_regression_scores(test_true_vals, predictions)
    for key in dt_scores:
        dt_scores[key].append(scores[key])

    # Prune the fitted tree against the held-out validation partition. This is
    # called after the unpruned predictions were taken, so the unpruned scores
    # above are unaffected; predictions below come from the pruned tree.
    decision_tree.prune(decision_tree.root, data_val)
    pruned_predictions = decision_tree.predict(test_features)

    # Calculate and store pruned decision tree scores
    pruned_scores = Evaluation.calculate_regression_scores(test_true_vals, pruned_predictions)
    for key in pruned_dt_scores:
        pruned_dt_scores[key].append(pruned_scores[key])

    # Null Model
    # NOTE(review): the baseline prediction is derived from test_set itself. If
    # naive_regression summarises the target of the frame it is given, this uses
    # test labels to build the baseline — consider deriving it from train_set.
    # Left unchanged because the method's return shape is not visible here.
    null_model = NullModelRegression(config=config)
    null_model_prediction = null_model.naive_regression(test_set)

    # Calculate and store null model scores (excluding R2 and Pearson)
    null_model_mse = Evaluation.mean_squared_error(test_true_vals, null_model_prediction)
    null_model_mae = Evaluation.mean_absolute_error(test_true_vals, null_model_prediction)
    null_model_scores['mse'].append(null_model_mse)
    null_model_scores['mae'].append(null_model_mae)

# Calculate average scores for each model (NaN fold scores are skipped)
average_dt_scores = _average_scores(dt_scores)
average_pruned_dt_scores = _average_scores(pruned_dt_scores)
average_null_model_scores = _average_scores(null_model_scores)

# Print average scores
_print_scores("Average Decision Tree Scores:", average_dt_scores)
_print_scores("\nAverage Pruned Decision Tree Scores:", average_pruned_dt_scores)
_print_scores("\nAverage Null Model Scores:", average_null_model_scores)




Average Decision Tree Scores:
mse: 4954.857207522884
mae: 21.56212607488544
r2: -3.514351152482527
pearson_correlation: -0.010177619319059712

Average Pruned Decision Tree Scores:
mse: 3100.305593026007
mae: 14.337196396605403
r2: -1.6186864370596648
pearson_correlation: nan

Average Null Model Scores:
mse: 2071.486896000717
mae: 15.705907169755505
