In [1]:
from src.data_preprocessor import DataProcessor
from data_configs.configs import *
from models.decision_tree import DecisionTree, DecisionTreeNode
from models.null_model import NullModelClassification
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
import numpy as np

# Select the dataset configuration used throughout this notebook and build the
# helpers that are parameterised by it.
# NOTE(review): house_votes_84_config is not imported by name; presumably it is
# supplied by the wildcard import from data_configs.configs — confirm.
config = house_votes_84_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)

### Data Load and Preprocessing ###

In [2]:
# Load the dataset through the configured DataProcessor.
raw_data = data_processor.load_data()

# Result of the imputation step; this is the working dataset for the cells that follow.
data_1 = data_processor.impute_missing_values(raw_data)

In [3]:
# Display the imputed dataset.
# NOTE(review): the rendered output below still shows '?' entries after
# impute_missing_values — confirm whether they are intentionally kept
# (e.g. treated as their own category) or whether imputation is not applied.
data_1

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,republican,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y
431,democrat,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y
432,republican,n,?,n,y,y,y,n,n,n,n,y,y,y,y,n,y
433,republican,n,n,n,y,y,y,?,?,?,?,n,y,y,y,n,y


In [4]:
# Split the imputed data into two partitions with a fixed seed.
# Later cells fit and cross-validate on data_train, and use data_val both for
# pruning and for hold-out scoring.
# NOTE(review): the split proportions are decided inside random_partition and
# are not visible here.
data_train, data_val = cross_validator.random_partition(data_1, random_state=42)

In [5]:
# Separate the feature columns from the target column for both partitions.
train_data = data_train.drop(columns=config['target_column'])
train_target = data_train[config['target_column']]
test_features = data_val.drop(columns=config['target_column'])
test_true_vals = data_val[config['target_column']]

# Decision Tree Model for classification
# NOTE(review): the constructor receives the full dataset (data_1), which also
# contains the data_val rows — confirm it is only used for metadata and not for
# anything that could leak hold-out information into training.
decision_tree = DecisionTree(config, data_1)
# The value returned by build_classification_tree is stored as the tree's root.
decision_tree.root = decision_tree.build_classification_tree(train_data, train_target)

In [6]:
# Predict on the held-out partition and report the zero-one loss of the tree
# before any pruning has been applied.
predictions = decision_tree.predict(test_features)
Evaluation().zero_one_loss(test_true_vals,predictions)

0.04597701149425287

In [7]:
# Prune the tree against data_val, then re-score it on the hold-out features.
# NOTE(review): test_features / test_true_vals were derived from data_val as
# well, so the partition that guides pruning is also the one being scored —
# this loss is likely optimistic. A separate pruning split would avoid that.
# NOTE(review): prune's return value is not used, so it presumably modifies
# decision_tree in place; re-running the previous cell after this one would
# then score the pruned tree rather than the original.
decision_tree.prune(decision_tree.root, data_val)
pruned_predictions = decision_tree.predict(test_features)
Evaluation().zero_one_loss(test_true_vals,pruned_predictions)

## Decision Tree Performance ##

In [8]:
# Compare the decision tree, the pruned decision tree, and the null model over
# repeated cross-validation folds drawn from data_train, then print the mean of
# every tracked metric for each model.

# Metric keys read from the dict returned by Evaluation.calculate_classification_scores.
metric_names = ('zero_one_loss', 'f1_score', 'precision', 'recall')

# Per-fold score histories: one list per metric, one entry appended per fold.
dt_scores = {name: [] for name in metric_names}
pruned_dt_scores = {name: [] for name in metric_names}
null_model_scores = {name: [] for name in metric_names}


def record_fold_scores(history, true_vals, predicted):
    """Score one set of predictions and append each tracked metric to ``history``.

    Args:
        history: dict mapping metric name -> list of per-fold values; extended in place.
        true_vals: true target values for the fold.
        predicted: predictions to score against ``true_vals``.

    Returns:
        The score dict produced by Evaluation.calculate_classification_scores.
    """
    fold_scores = Evaluation.calculate_classification_scores(true_vals, predicted)
    for key in history:
        history[key].append(fold_scores[key])
    return fold_scores


# NOTE: this loop rebinds train_data, train_target, test_features,
# test_true_vals, decision_tree, predictions and pruned_predictions, which
# earlier cells also define; after this cell they hold the values of the last fold.
for train_set, test_set in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):
    train_data = train_set.drop(columns=config['target_column'])
    train_target = train_set[config['target_column']]
    test_features = test_set.drop(columns=config['target_column'])
    test_true_vals = test_set[config['target_column']]

    # Decision Tree Model for classification: fit on this fold's training part
    # and score it before pruning.
    decision_tree = DecisionTree(config, data_1)
    decision_tree.root = decision_tree.build_classification_tree(train_data, train_target)
    predictions = decision_tree.predict(test_features)
    scores = record_fold_scores(dt_scores, test_true_vals, predictions)

    # Prune against data_val (the partition not passed to cross_validation
    # above), then score the pruned tree on the same test fold. This must
    # happen after the unpruned predictions are taken, since the same tree
    # object is reused.
    decision_tree.prune(decision_tree.root, data_val)
    pruned_predictions = decision_tree.predict(test_features)
    pruned_scores = record_fold_scores(pruned_dt_scores, test_true_vals, pruned_predictions)

    # Null Model for classification.
    # NOTE(review): the null model is handed the test fold itself; if it derives
    # its prediction from the labels in that argument, the baseline is computed
    # from test labels — confirm whether train_set should be used instead.
    null_model = NullModelClassification(config=config)
    null_model_prediction = null_model.naive_classifier(test_set)
    null_scores = record_fold_scores(null_model_scores, test_true_vals, null_model_prediction)

# Calculate average scores for each model
average_dt_scores = {metric: np.mean(values) for metric, values in dt_scores.items()}
average_pruned_dt_scores = {metric: np.mean(values) for metric, values in pruned_dt_scores.items()}
average_null_model_scores = {metric: np.mean(values) for metric, values in null_model_scores.items()}

# Print average scores; the leading "\n" on later headers separates the sections.
score_reports = (
    ("Average Decision Tree Scores:", average_dt_scores),
    ("\nAverage Pruned Decision Tree Scores:", average_pruned_dt_scores),
    ("\nAverage Null Model Scores:", average_null_model_scores),
)
for header, averages in score_reports:
    print(header)
    for metric, avg_score in averages.items():
        print(f"{metric}: {avg_score}")


Average Decision Tree Scores:
zero_one_loss: 0.07011494252873564
f1_score: 0.9300049932225104
precision: 0.9308400762745004
recall: 0.9298850574712642

Average Pruned Decision Tree Scores:
zero_one_loss: 0.04597701149425287
f1_score: 0.9541837668142469
precision: 0.9551353837522493
recall: 0.9540229885057471

Average Null Model Scores:
zero_one_loss: 0.3936781609195402
f1_score: 0.45772816356798557
precision: 0.36763442991148104
recall: 0.6063218390804598
