In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification
from data_configs.configs import *
import statistics
import numpy as np

config = house_votes_84_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
knn_model = KNN(config)
null_model = NullModelClassification(config=config)

### Data Load and Preprocessing ###

In [2]:
raw_data = data_processor.load_data()

data_1 = data_processor.impute_missing_values(raw_data)

data_2 = data_processor.encode_nominal_features(data_1)

data_3 = data_processor.encode_ordinal_features(data_2)

In [3]:
data_3

Unnamed: 0,Class Name,handicapped-infants_?,handicapped-infants_n,handicapped-infants_y,water-project-cost-sharing_?,water-project-cost-sharing_n,water-project-cost-sharing_y,adoption-of-the-budget-resolution_?,adoption-of-the-budget-resolution_n,adoption-of-the-budget-resolution_y,...,superfund-right-to-sue_y,crime_?,crime_n,crime_y,duty-free-exports_?,duty-free-exports_n,duty-free-exports_y,export-administration-act-south-africa_?,export-administration-act-south-africa_n,export-administration-act-south-africa_y
0,republican,0,1,0,0,0,1,0,1,0,...,1,0,0,1,0,1,0,0,0,1
1,republican,0,1,0,0,0,1,0,1,0,...,1,0,0,1,0,1,0,1,0,0
2,democrat,1,0,0,0,0,1,0,0,1,...,1,0,0,1,0,1,0,0,1,0
3,democrat,0,1,0,0,0,1,0,0,1,...,1,0,1,0,0,1,0,0,0,1
4,democrat,0,0,1,0,0,1,0,0,1,...,1,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,republican,0,1,0,0,1,0,0,0,1,...,1,0,0,1,0,1,0,0,0,1
431,democrat,0,1,0,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,0,1
432,republican,0,1,0,1,0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
433,republican,0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1


## KNN Model ##

### Hyperparameter Tuning ###

In [4]:
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

#### Tuning k ####

In [5]:
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
        
        predictions_1 = knn_model.knn_classifier(data_val, train_set_1, k=k)['Predicted Class']
        score_1 = Evaluation().zero_one_loss(data_val[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.09310344827586207
Average 0-1 Loss score with k=2: 0.09080459770114943
Average 0-1 Loss score with k=3: 0.0954022988505747
Average 0-1 Loss score with k=4: 0.09770114942528738
Average 0-1 Loss score with k=5: 0.09885057471264366
Average 0-1 Loss score with k=6: 0.10459770114942528
Average 0-1 Loss score with k=7: 0.10229885057471262
Average 0-1 Loss score with k=8: 0.10919540229885057
Average 0-1 Loss score with k=9: 0.10919540229885057
Average 0-1 Loss score with k=10: 0.11609195402298851
Average 0-1 Loss score with k=11: 0.11609195402298851
Average 0-1 Loss score with k=12: 0.11839080459770115
Average 0-1 Loss score with k=13: 0.11609195402298851
Average 0-1 Loss score with k=14: 0.12068965517241378
Best k is 2 with the lowest average 0-1 loss score of 0.09080459770114943


### Model Performance ###

In [6]:
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
null_model_scores = []

# Additional lists for null model metrics
null_model_precision_scores = []
null_model_recall_scores = []
null_model_f1_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
    # Train and evaluate 
    predictions_1 = knn_model.knn_classifier(test_set, train_set, k=best_k)['Predicted Class']
    
    zero_one_loss_score = Evaluation.zero_one_loss(test_set[config['target_column']], predictions_1)
    precision_score = Evaluation.precision(test_set[config['target_column']], predictions_1)
    recall_score = Evaluation.recall(test_set[config['target_column']], predictions_1)
    f1_score = Evaluation.f1_score(test_set[config['target_column']], predictions_1)
    
    zero_one_loss_scores.append(zero_one_loss_score)
    precision_scores.append(precision_score)
    recall_scores.append(recall_score)
    f1_scores.append(f1_score)

    # Null model predictions and metrics calculation
    null_model_prediction = null_model.naive_classifier(test_set)
    null_model_zero_one_loss = Evaluation.zero_one_loss(test_set[config['target_column']], null_model_prediction)
    null_model_precision = Evaluation.precision(test_set[config['target_column']], null_model_prediction)
    null_model_recall = Evaluation.recall(test_set[config['target_column']], null_model_prediction)
    null_model_f1 = Evaluation.f1_score(test_set[config['target_column']], null_model_prediction)
    
    null_model_scores.append(null_model_zero_one_loss)
    null_model_precision_scores.append(null_model_precision)
    null_model_recall_scores.append(null_model_recall)
    null_model_f1_scores.append(null_model_f1)

# Calculate averages for all metrics
average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)
average_null_model_score = sum(null_model_scores) / len(null_model_scores)

# Additional averages for null model metrics
average_null_model_precision = sum(null_model_precision_scores) / len(null_model_precision_scores)
average_null_model_recall = sum(null_model_recall_scores) / len(null_model_recall_scores)
average_null_model_f1 = sum(null_model_f1_scores) / len(null_model_f1_scores)

# Print out all average scores
print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.3936781609195403
Average null model Precision score: 0.36763442991148104
Average null model Recall score: 0.6063218390804598
Average null model F1 score: 0.4577281635679856
Average KNN 0-1 score for k=2: 0.08563218390804597
Average Precision score for k=2: 0.9250278367475395
Average Recall score for k=2: 0.914367816091954
Average F1 score for k=2: 0.9152225678491709


## Edited KNN ##

### Hyperparameter Tuning ###

#### Tuning k ####

In [7]:
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
        
        edited_train_set = knn_model.edited_knn_classificaton(train_set_1)

        predictions_1 = knn_model.knn_classifier(data_val, edited_train_set, k=k)['Predicted Class']
        score_1 = Evaluation().zero_one_loss(data_val[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 loss score with k=1: 0.09770114942528735
Average 0-1 loss score with k=2: 0.1057471264367816
Average 0-1 loss score with k=3: 0.11034482758620687
Average 0-1 loss score with k=4: 0.11264367816091954
Average 0-1 loss score with k=5: 0.11379310344827584
Average 0-1 loss score with k=6: 0.11724137931034487
Average 0-1 loss score with k=7: 0.11609195402298851
Average 0-1 loss score with k=8: 0.12413793103448274
Average 0-1 loss score with k=9: 0.12298850574712643
Average 0-1 loss score with k=10: 0.1298850574712644
Average 0-1 loss score with k=11: 0.13218390804597702
Average 0-1 loss score with k=12: 0.13218390804597702
Average 0-1 loss score with k=13: 0.13103448275862067
Average 0-1 loss score with k=14: 0.13448275862068965
Best k is 1 with the lowest average 0-1 loss score of 0.09770114942528735


In [8]:
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
null_model_scores = []

# Additional lists for null model metrics
null_model_precision_scores = []
null_model_recall_scores = []
null_model_f1_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
    edited_train_set = knn_model.edited_knn_classificaton(train_set)

    # Train and evaluate 
    predictions_1 = knn_model.knn_classifier(test_set, edited_train_set, k=best_k)['Predicted Class']
    
    zero_one_loss_score = Evaluation.zero_one_loss(test_set[config['target_column']], predictions_1)
    precision_score = Evaluation.precision(test_set[config['target_column']], predictions_1)
    recall_score = Evaluation.recall(test_set[config['target_column']], predictions_1)
    f1_score = Evaluation.f1_score(test_set[config['target_column']], predictions_1)
    
    zero_one_loss_scores.append(zero_one_loss_score)
    precision_scores.append(precision_score)
    recall_scores.append(recall_score)
    f1_scores.append(f1_score)

    # Null model predictions and metrics calculation
    null_model_prediction = null_model.naive_classifier(test_set)
    null_model_zero_one_loss = Evaluation.zero_one_loss(test_set[config['target_column']], null_model_prediction)
    null_model_precision = Evaluation.precision(test_set[config['target_column']], null_model_prediction)
    null_model_recall = Evaluation.recall(test_set[config['target_column']], null_model_prediction)
    null_model_f1 = Evaluation.f1_score(test_set[config['target_column']], null_model_prediction)
    
    null_model_scores.append(null_model_zero_one_loss)
    null_model_precision_scores.append(null_model_precision)
    null_model_recall_scores.append(null_model_recall)
    null_model_f1_scores.append(null_model_f1)

# Calculate averages for all metrics
average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)
average_null_model_score = sum(null_model_scores) / len(null_model_scores)

# Additional averages for null model metrics
average_null_model_precision = sum(null_model_precision_scores) / len(null_model_precision_scores)
average_null_model_recall = sum(null_model_recall_scores) / len(null_model_recall_scores)
average_null_model_f1 = sum(null_model_f1_scores) / len(null_model_f1_scores)

# Print out all average scores
print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.3936781609195403
Average null model Precision score: 0.36763442991148104
Average null model Recall score: 0.6063218390804598
Average null model F1 score: 0.4577281635679856
Average KNN 0-1 score for k=1: 0.08505747126436781
Average Precision score for k=1: 0.9243754819665047
Average Recall score for k=1: 0.9149425287356323
Average F1 score for k=1: 0.9157574909387396


## Condensed Knn ##

#### Tuning k ####

In [9]:
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
        
        condesed_train_set = knn_model.condensed_knn_classification(train_set_1)
        predictions_1 = knn_model.knn_classifier(data_val, condesed_train_set, k=k)['Predicted Class']
        score_1 = Evaluation().zero_one_loss(data_val[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.11724137931034484
Average 0-1 Loss score with k=2: 0.13793103448275862
Average 0-1 Loss score with k=3: 0.11494252873563218
Average 0-1 Loss score with k=4: 0.10919540229885054
Average 0-1 Loss score with k=5: 0.0781609195402299
Average 0-1 Loss score with k=6: 0.09770114942528736
Average 0-1 Loss score with k=7: 0.08390804597701149
Average 0-1 Loss score with k=8: 0.11379310344827587
Average 0-1 Loss score with k=9: 0.08275862068965517
Average 0-1 Loss score with k=10: 0.08735632183908046
Average 0-1 Loss score with k=11: 0.09310344827586207
Average 0-1 Loss score with k=12: 0.07931034482758621
Average 0-1 Loss score with k=13: 0.09310344827586207
Average 0-1 Loss score with k=14: 0.08735632183908046
Best k is 5 with the lowest average 0-1 loss score of 0.0781609195402299


### Model Performance ###

In [10]:
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
null_model_scores = []

# Additional lists for null model metrics
null_model_precision_scores = []
null_model_recall_scores = []
null_model_f1_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
    condesed_train_set = knn_model.edited_knn_classificaton(train_set)

    # Train and evaluate 
    predictions_1 = knn_model.knn_classifier(test_set, condesed_train_set, k=best_k)['Predicted Class']
    
    zero_one_loss_score = Evaluation.zero_one_loss(test_set[config['target_column']], predictions_1)
    precision_score = Evaluation.precision(test_set[config['target_column']], predictions_1)
    recall_score = Evaluation.recall(test_set[config['target_column']], predictions_1)
    f1_score = Evaluation.f1_score(test_set[config['target_column']], predictions_1)
    
    zero_one_loss_scores.append(zero_one_loss_score)
    precision_scores.append(precision_score)
    recall_scores.append(recall_score)
    f1_scores.append(f1_score)

    # Null model predictions and metrics calculation
    null_model_prediction = null_model.naive_classifier(test_set)
    null_model_zero_one_loss = Evaluation.zero_one_loss(test_set[config['target_column']], null_model_prediction)
    null_model_precision = Evaluation.precision(test_set[config['target_column']], null_model_prediction)
    null_model_recall = Evaluation.recall(test_set[config['target_column']], null_model_prediction)
    null_model_f1 = Evaluation.f1_score(test_set[config['target_column']], null_model_prediction)
    
    null_model_scores.append(null_model_zero_one_loss)
    null_model_precision_scores.append(null_model_precision)
    null_model_recall_scores.append(null_model_recall)
    null_model_f1_scores.append(null_model_f1)

# Calculate averages for all metrics
average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)
average_null_model_score = sum(null_model_scores) / len(null_model_scores)

# Additional averages for null model metrics
average_null_model_precision = sum(null_model_precision_scores) / len(null_model_precision_scores)
average_null_model_recall = sum(null_model_recall_scores) / len(null_model_recall_scores)
average_null_model_f1 = sum(null_model_f1_scores) / len(null_model_f1_scores)

# Print out all average scores
print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.3936781609195403
Average null model Precision score: 0.36763442991148104
Average null model Recall score: 0.6063218390804598
Average null model F1 score: 0.4577281635679856
Average KNN 0-1 score for k=5: 0.09022988505747127
Average Precision score for k=5: 0.9196116731878424
Average Recall score for k=5: 0.9097701149425287
Average F1 score for k=5: 0.9106417427261793
