In [2]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification
from data_configs.configs import *
import statistics
import numpy as np

# Notebook-wide setup: pick the dataset configuration and build the helper
# objects that every later cell uses.
# `car_config` is not defined in this notebook; presumably it is supplied by
# the wildcard import from data_configs.configs — confirm.
config = car_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
# Note: KNN is handed the config positionally, unlike the other helpers.
knn_model = KNN(config)
null_model = NullModelClassification(config=config)

### Data Load and Preprocessing ###

In [3]:
# Preprocessing pipeline: each stage takes the previous stage's output and
# binds the result to a new name, so re-running this cell starts again from
# the freshly loaded data. Only `data_3` is consumed by the partition cell.
raw_data = data_processor.load_data()

# Stage 1: missing-value handling (behaviour defined in DataProcessor).
data_1 = data_processor.impute_missing_values(raw_data)

# Stage 2: nominal-feature encoding applied to the stage-1 output.
data_2 = data_processor.encode_nominal_features(data_1)

# Stage 3: ordinal-feature encoding applied to the stage-2 output.
data_3 = data_processor.encode_ordinal_features(data_2)

## KNN Model ##

### Hyperparameter Tuning ###

In [4]:
# One seeded split of the preprocessed data: `data_train` feeds every
# cross-validation loop below, while `data_val` is the set the k-tuning cells
# score against (and the edited-KNN evaluation cell passes to the editing step).
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

In [5]:
# Column names handed to data_processor.standardize_data as `features=` in
# every tuning and evaluation cell.
features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']

#### Tuning k ####

In [6]:
# Grid search over the number of neighbours for plain KNN.
# Candidate values: k = 1 .. 14.
hyperparameters = np.arange(1, 15, 1)
scores_dict = {}

for k in hyperparameters:
    scores = []
    # Only the first element of each split is used here (as the reference set);
    # scoring is done against the separate validation partition `data_val`.
    for train_set_1, _ in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

        # train_set_1 is the reference frame in both standardization calls.
        data_val_stand = data_processor.standardize_data(train_set_1, data_val, features=features)
        data_train_stand = data_processor.standardize_data(train_set_1, train_set_1, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        # Called on the class, matching the evaluation cells, instead of
        # building a throwaway Evaluation() instance for every fold.
        score_1 = Evaluation.zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Lowest average loss wins; on a tie the smallest k (first inserted) is kept.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.11040462427745665
Average 0-1 Loss score with k=2: 0.1378612716763006
Average 0-1 Loss score with k=3: 0.09971098265895954
Average 0-1 Loss score with k=4: 0.11156069364161851
Average 0-1 Loss score with k=5: 0.08786127167630058
Average 0-1 Loss score with k=6: 0.10144508670520232
Average 0-1 Loss score with k=7: 0.08728323699421967
Average 0-1 Loss score with k=8: 0.09826589595375722
Average 0-1 Loss score with k=9: 0.09277456647398845
Average 0-1 Loss score with k=10: 0.10028901734104045
Average 0-1 Loss score with k=11: 0.09682080924855492
Average 0-1 Loss score with k=12: 0.10520231213872833
Average 0-1 Loss score with k=13: 0.10057803468208093
Average 0-1 Loss score with k=14: 0.10606936416184971
Best k is 7 with the lowest average 0-1 loss score of 0.08728323699421967


### Model Performance ###

In [7]:
# Per-fold scores for the KNN classifier.
zero_one_loss_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# Per-fold scores for the null-model baseline.
null_model_scores = []
null_model_precision_scores, null_model_recall_scores, null_model_f1_scores = [], [], []

# Seeded, stratified repeated splits of data_train (n_splits=2, n_repeats=5).
for train_set, test_set in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

    # train_set is the reference frame in both standardization calls.
    data_test_stand = data_processor.standardize_data(train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(train_set, train_set, features=features)

    # Classify the held-out fold with the k selected by the tuning cell.
    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']

    actual_classes = data_test_stand[config['target_column']]
    zero_one_loss_scores.append(Evaluation.zero_one_loss(actual_classes, predictions_1))
    precision_scores.append(Evaluation.precision(actual_classes, predictions_1))
    recall_scores.append(Evaluation.recall(actual_classes, predictions_1))
    f1_scores.append(Evaluation.f1_score(actual_classes, predictions_1))

    # Baseline on the same fold.
    # NOTE(review): the null model is handed test_set itself; if
    # naive_classifier derives its prediction from the labels of its argument,
    # the baseline is computed from test labels — confirm against the model.
    null_model_prediction = null_model.naive_classifier(test_set)
    baseline_actual = test_set[config['target_column']]
    null_model_scores.append(Evaluation.zero_one_loss(baseline_actual, null_model_prediction))
    null_model_precision_scores.append(Evaluation.precision(baseline_actual, null_model_prediction))
    null_model_recall_scores.append(Evaluation.recall(baseline_actual, null_model_prediction))
    null_model_f1_scores.append(Evaluation.f1_score(baseline_actual, null_model_prediction))

# Arithmetic mean of each per-fold score list.
average_01_score, average_precision_score, average_recall_score, average_f1_score = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (zero_one_loss_scores, precision_scores, recall_scores, f1_scores)
)
average_null_model_score, average_null_model_precision, average_null_model_recall, average_null_model_f1 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (null_model_scores, null_model_precision_scores, null_model_recall_scores, null_model_f1_scores)
)

# Report the baseline first, then the classifier.
for summary_line in (
    f"Average null model 0-1 loss score: {average_null_model_score}",
    f"Average null model Precision score: {average_null_model_precision}",
    f"Average null model Recall score: {average_null_model_recall}",
    f"Average null model F1 score: {average_null_model_f1}",
    f"Average KNN 0-1 score for k={best_k}: {average_01_score}",
    f"Average Precision score for k={best_k}: {average_precision_score}",
    f"Average Recall score for k={best_k}: {average_recall_score}",
    f"Average F1 score for k={best_k}: {average_f1_score}",
):
    print(summary_line)


Average null model 0-1 loss score: 0.2945007235890015
Average null model Precision score: 0.4977297525974856
Average null model Recall score: 0.7054992764109985
Average null model F1 score: 0.5836758958553583
Average KNN 0-1 score for k=7: 0.08147612156295223
Average Precision score for k=7: 0.9186614107699201
Average Recall score for k=7: 0.9185238784370477
Average F1 score for k=7: 0.9150292590723765


## Edited KNN ##

### Hyperparameter Tuning ###

#### Tuning k ####

In [8]:
# Grid search over the number of neighbours for edited KNN.
# Candidate values: k = 1 .. 14.
hyperparameters = np.arange(1, 15, 1)
scores_dict = {}

# The editing step is never given k, so its output is the same for every
# candidate. Build the edited training set of each fold once, up front, rather
# than re-running the edit inside the k loop. The splitter is seeded
# (random_state=42), so these are the folds the per-k loop would otherwise
# regenerate on every pass.
# NOTE(review): assumes edited_knn_classification does not depend on state
# left behind by earlier knn_classifier calls — confirm against the model.
edited_train_sets = [
    knn_model.edited_knn_classification(train_set_1, train_set_2)
    for train_set_1, train_set_2 in cross_validator.cross_validation(
        data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True
    )
]

for k in hyperparameters:
    scores = []
    for edited_train_set in edited_train_sets:

        # Standardization stays inside the loop so each classifier call gets
        # freshly built frames; the edited set is the reference in both calls.
        data_val_stand = data_processor.standardize_data(edited_train_set, data_val, features=features)
        data_train_stand = data_processor.standardize_data(edited_train_set, edited_train_set, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        # Called on the class, matching the evaluation cells, instead of
        # building a throwaway Evaluation() instance for every fold.
        score_1 = Evaluation.zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Lowest average loss wins; on a tie the smallest k (first inserted) is kept.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 loss score with k=1: 0.1508670520231214
Average 0-1 loss score with k=2: 0.18583815028901735
Average 0-1 loss score with k=3: 0.1468208092485549
Average 0-1 loss score with k=4: 0.15895953757225434
Average 0-1 loss score with k=5: 0.1468208092485549
Average 0-1 loss score with k=6: 0.15809248554913297
Average 0-1 loss score with k=7: 0.15115606936416187
Average 0-1 loss score with k=8: 0.1609826589595376
Average 0-1 loss score with k=9: 0.1598265895953757
Average 0-1 loss score with k=10: 0.16705202312138728
Average 0-1 loss score with k=11: 0.1624277456647399
Average 0-1 loss score with k=12: 0.16878612716763006
Average 0-1 loss score with k=13: 0.16734104046242776
Average 0-1 loss score with k=14: 0.17890173410404625
Best k is 3 with the lowest average 0-1 loss score of 0.1468208092485549


In [9]:
# Per-fold scores for the edited-KNN classifier.
zero_one_loss_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# Per-fold scores for the null-model baseline.
null_model_scores = []
null_model_precision_scores, null_model_recall_scores, null_model_f1_scores = [], [], []

# Seeded, stratified repeated splits of data_train (n_splits=2, n_repeats=5).
for train_set, test_set in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

    # Edit the training fold; data_val (not the test fold) is the second argument.
    edited_train_set = knn_model.edited_knn_classification(train_set, data_val)

    # The edited set is the reference frame in both standardization calls.
    data_test_stand = data_processor.standardize_data(edited_train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(edited_train_set, edited_train_set, features=features)

    # Classify the held-out fold with the k selected by the tuning cell.
    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']

    actual_classes = data_test_stand[config['target_column']]
    zero_one_loss_scores.append(Evaluation.zero_one_loss(actual_classes, predictions_1))
    precision_scores.append(Evaluation.precision(actual_classes, predictions_1))
    recall_scores.append(Evaluation.recall(actual_classes, predictions_1))
    f1_scores.append(Evaluation.f1_score(actual_classes, predictions_1))

    # Baseline on the same fold.
    # NOTE(review): the null model is handed test_set itself; if
    # naive_classifier derives its prediction from the labels of its argument,
    # the baseline is computed from test labels — confirm against the model.
    null_model_prediction = null_model.naive_classifier(test_set)
    baseline_actual = test_set[config['target_column']]
    null_model_scores.append(Evaluation.zero_one_loss(baseline_actual, null_model_prediction))
    null_model_precision_scores.append(Evaluation.precision(baseline_actual, null_model_prediction))
    null_model_recall_scores.append(Evaluation.recall(baseline_actual, null_model_prediction))
    null_model_f1_scores.append(Evaluation.f1_score(baseline_actual, null_model_prediction))

# Arithmetic mean of each per-fold score list.
average_01_score, average_precision_score, average_recall_score, average_f1_score = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (zero_one_loss_scores, precision_scores, recall_scores, f1_scores)
)
average_null_model_score, average_null_model_precision, average_null_model_recall, average_null_model_f1 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (null_model_scores, null_model_precision_scores, null_model_recall_scores, null_model_f1_scores)
)

# Report the baseline first, then the classifier.
for summary_line in (
    f"Average null model 0-1 loss score: {average_null_model_score}",
    f"Average null model Precision score: {average_null_model_precision}",
    f"Average null model Recall score: {average_null_model_recall}",
    f"Average null model F1 score: {average_null_model_f1}",
    f"Average KNN 0-1 score for k={best_k}: {average_01_score}",
    f"Average Precision score for k={best_k}: {average_precision_score}",
    f"Average Recall score for k={best_k}: {average_recall_score}",
    f"Average F1 score for k={best_k}: {average_f1_score}",
):
    print(summary_line)


Average null model 0-1 loss score: 0.2945007235890015
Average null model Precision score: 0.4977297525974856
Average null model Recall score: 0.7054992764109985
Average null model F1 score: 0.5836758958553583
Average KNN 0-1 score for k=3: 0.13270622286541242
Average Precision score for k=3: 0.8594663859173541
Average Recall score for k=3: 0.8672937771345876
Average F1 score for k=3: 0.8559306983157198


## Condensed KNN ##

#### Tuning k ####

In [10]:
# Grid search over the number of neighbours for condensed KNN.
# Candidate values: k = 1 .. 14.
hyperparameters = np.arange(1, 15, 1)
scores_dict = {}

# The condensing step is never given k, so there is no reason to repeat it for
# every candidate. Build the condensed training set of each fold once, up
# front; every k is then scored against the same condensed sets. The splitter
# is seeded (random_state=42), so these are the folds the per-k loop would
# otherwise regenerate on every pass.
# NOTE(review): if condensed_knn_classification is randomized internally, the
# original re-drew a different condensed set per k; sharing one set per fold
# makes the comparison across k like-for-like — confirm this is intended.
condensed_train_sets = [
    knn_model.condensed_knn_classification(train_set_1)
    for train_set_1, _ in cross_validator.cross_validation(
        data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True
    )
]

for k in hyperparameters:
    scores = []
    for condensed_train_set in condensed_train_sets:

        # Standardization stays inside the loop so each classifier call gets
        # freshly built frames; the condensed set is the reference in both calls.
        data_val_stand = data_processor.standardize_data(condensed_train_set, data_val, features=features)
        data_train_stand = data_processor.standardize_data(condensed_train_set, condensed_train_set, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        # Called on the class, matching the evaluation cells, instead of
        # building a throwaway Evaluation() instance for every fold.
        score_1 = Evaluation.zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Lowest average loss wins; on a tie the smallest k (first inserted) is kept.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.11358381502890173
Average 0-1 Loss score with k=2: 0.13121387283236993
Average 0-1 Loss score with k=3: 0.1023121387283237
Average 0-1 Loss score with k=4: 0.10173410404624277
Average 0-1 Loss score with k=5: 0.0869942196531792
Average 0-1 Loss score with k=6: 0.08930635838150287
Average 0-1 Loss score with k=7: 0.0838150289017341
Average 0-1 Loss score with k=8: 0.08583815028901734
Average 0-1 Loss score with k=9: 0.08583815028901735
Average 0-1 Loss score with k=10: 0.08930635838150289
Average 0-1 Loss score with k=11: 0.0959537572254335
Average 0-1 Loss score with k=12: 0.09624277456647398
Average 0-1 Loss score with k=13: 0.10346820809248555
Average 0-1 Loss score with k=14: 0.09104046242774567
Best k is 7 with the lowest average 0-1 loss score of 0.0838150289017341


### Model Performance ###

In [11]:
# Per-fold scores for the condensed-KNN classifier.
zero_one_loss_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# Per-fold scores for the null-model baseline.
null_model_scores = []
null_model_precision_scores, null_model_recall_scores, null_model_f1_scores = [], [], []

# Seeded, stratified repeated splits of data_train (n_splits=2, n_repeats=5).
for train_set, test_set in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

    # Condense the training fold before it is used as the neighbour pool.
    condensed_train_set = knn_model.condensed_knn_classification(train_set)

    # The condensed set is the reference frame in both standardization calls.
    data_test_stand = data_processor.standardize_data(condensed_train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(condensed_train_set, condensed_train_set, features=features)

    # Classify the held-out fold with the k selected by the tuning cell.
    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']

    actual_classes = data_test_stand[config['target_column']]
    zero_one_loss_scores.append(Evaluation.zero_one_loss(actual_classes, predictions_1))
    precision_scores.append(Evaluation.precision(actual_classes, predictions_1))
    recall_scores.append(Evaluation.recall(actual_classes, predictions_1))
    f1_scores.append(Evaluation.f1_score(actual_classes, predictions_1))

    # Baseline on the same fold.
    # NOTE(review): the null model is handed test_set itself; if
    # naive_classifier derives its prediction from the labels of its argument,
    # the baseline is computed from test labels — confirm against the model.
    null_model_prediction = null_model.naive_classifier(test_set)
    baseline_actual = test_set[config['target_column']]
    null_model_scores.append(Evaluation.zero_one_loss(baseline_actual, null_model_prediction))
    null_model_precision_scores.append(Evaluation.precision(baseline_actual, null_model_prediction))
    null_model_recall_scores.append(Evaluation.recall(baseline_actual, null_model_prediction))
    null_model_f1_scores.append(Evaluation.f1_score(baseline_actual, null_model_prediction))

# Arithmetic mean of each per-fold score list.
average_01_score, average_precision_score, average_recall_score, average_f1_score = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (zero_one_loss_scores, precision_scores, recall_scores, f1_scores)
)
average_null_model_score, average_null_model_precision, average_null_model_recall, average_null_model_f1 = (
    sum(fold_scores) / len(fold_scores)
    for fold_scores in (null_model_scores, null_model_precision_scores, null_model_recall_scores, null_model_f1_scores)
)

# Report the baseline first, then the classifier.
for summary_line in (
    f"Average null model 0-1 loss score: {average_null_model_score}",
    f"Average null model Precision score: {average_null_model_precision}",
    f"Average null model Recall score: {average_null_model_recall}",
    f"Average null model F1 score: {average_null_model_f1}",
    f"Average KNN 0-1 score for k={best_k}: {average_01_score}",
    f"Average Precision score for k={best_k}: {average_precision_score}",
    f"Average Recall score for k={best_k}: {average_recall_score}",
    f"Average F1 score for k={best_k}: {average_f1_score}",
):
    print(summary_line)


Average null model 0-1 loss score: 0.2945007235890015
Average null model Precision score: 0.4977297525974856
Average null model Recall score: 0.7054992764109985
Average null model F1 score: 0.5836758958553583
Average KNN 0-1 score for k=7: 0.08234442836468886
Average Precision score for k=7: 0.9211448749155886
Average Recall score for k=7: 0.9176555716353111
Average F1 score for k=7: 0.9167392510425749
