In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification
from data_configs.configs import *
import statistics
import numpy as np

# Dataset configuration used by every component below; car_config comes from
# the star import of data_configs.configs.
config = car_config
# Helpers for preprocessing and for partitioning / cross-validation.
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
# Models under comparison: KNN (also used for the edited / condensed variants)
# and the null model that serves as the baseline.
knn_model = KNN(config)
null_model = NullModelClassification(config=config)

### Data Load and Preprocessing ###

In [2]:
# Preprocessing pipeline: every step consumes the previous step's result, and
# each intermediate frame keeps its own name so a cell can be re-run safely.
raw_data = data_processor.load_data()

# Step 1: missing-value imputation.
data_1 = data_processor.impute_missing_values(raw_data)

# Step 2: nominal feature encoding.
data_2 = data_processor.encode_nominal_features(data_1)

# Step 3: ordinal feature encoding; data_3 is the frame used for modelling.
data_3 = data_processor.encode_ordinal_features(data_2)

In [3]:
# Show the fully preprocessed frame (features are integer coded, Class keeps its labels).
data_3

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
0,0,0,0,0,0,0,unacc
1,0,0,0,0,0,1,unacc
2,0,0,0,0,0,2,unacc
3,0,0,0,0,1,0,unacc
4,0,0,0,0,1,1,unacc
...,...,...,...,...,...,...,...
1723,3,3,3,2,1,1,good
1724,3,3,3,2,1,2,vgood
1725,3,3,3,2,2,0,unacc
1726,3,3,3,2,2,1,good


## KNN Model ##

### Hyperparameter Tuning ###

In [4]:
# Split once with a fixed seed: data_val is the frame every tuning cell scores on,
# data_train is the frame every cross-validation loop below draws its folds from.
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

In [5]:
# Columns handed to standardize_data: every column of data_3 except Class.
features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']

#### Tuning k ####

In [6]:
# Tune k for the plain KNN classifier.
# For each candidate k, a model is fit on the first fold of every split yielded by
# the repeated, stratified cross-validation of data_train and its 0-1 loss is
# measured on data_val; the k with the lowest mean loss is kept as best_k.
# NOTE(review): the second fold of each split is never used in this cell - confirm
# that scoring on data_val only is the intended tuning protocol.
# NOTE(review): the standardized frames do not depend on k; if cross_validation is
# deterministic for a fixed random_state they could be built once outside the k loop.
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters:
    scores = []
    for train_set_1, _unused_fold in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

        # train_set_1 is the reference frame in both calls (presumably the source
        # of the scaling statistics - TODO confirm against DataProcessor).
        data_val_stand = data_processor.standardize_data(train_set_1, data_val, features=features)
        data_train_stand = data_processor.standardize_data(train_set_1,train_set_1, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        # Called on the class, like the model performance cells; no instance is needed.
        score_1 = Evaluation.zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# min() returns the first minimal key, so ties resolve to the smallest k.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.11040462427745665
Average 0-1 Loss score with k=2: 0.12283236994219653
Average 0-1 Loss score with k=3: 0.08554913294797688
Average 0-1 Loss score with k=4: 0.09190751445086706
Average 0-1 Loss score with k=5: 0.07658959537572255
Average 0-1 Loss score with k=6: 0.0861271676300578
Average 0-1 Loss score with k=7: 0.07976878612716762
Average 0-1 Loss score with k=8: 0.08439306358381501
Average 0-1 Loss score with k=9: 0.08699421965317918
Average 0-1 Loss score with k=10: 0.08439306358381501
Average 0-1 Loss score with k=11: 0.09132947976878612
Average 0-1 Loss score with k=12: 0.09421965317919076
Average 0-1 Loss score with k=13: 0.09479768786127168
Average 0-1 Loss score with k=14: 0.09190751445086705
Best k is 5 with the lowest average 0-1 loss score of 0.07658959537572255


### Model Performance ###

In [7]:
# Compare the tuned KNN (k=best_k) with the null model over the repeated,
# stratified cross-validation folds of data_train, then report the fold averages.
target_column = config['target_column']

# Per-fold KNN metrics.
zero_one_loss_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# Per-fold null model metrics.
null_model_scores = []
null_model_precision_scores, null_model_recall_scores, null_model_f1_scores = [], [], []

# Metric functions and both groups of result lists share one ordering
# (0-1 loss, precision, recall, F1) so they can be walked together with zip().
metric_functions = (Evaluation.zero_one_loss, Evaluation.precision, Evaluation.recall, Evaluation.f1_score)
knn_score_lists = (zero_one_loss_scores, precision_scores, recall_scores, f1_scores)
null_score_lists = (null_model_scores, null_model_precision_scores, null_model_recall_scores, null_model_f1_scores)

cv_folds = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)
for train_set, test_set in cv_folds:

    # train_set is the reference frame for both standardization calls.
    data_test_stand = data_processor.standardize_data(train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(train_set,train_set, features=features)

    # KNN: predict the test fold from the training fold and score the predictions.
    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']
    knn_truth = data_test_stand[target_column]
    for metric, fold_scores in zip(metric_functions, knn_score_lists):
        fold_scores.append(metric(knn_truth, predictions_1))

    # Null model baseline, scored against the unstandardized test fold labels.
    # NOTE(review): naive_classifier only receives test_set - confirm the baseline
    # is not meant to be derived from train_set.
    null_model_prediction = null_model.naive_classifier(test_set)
    null_truth = test_set[target_column]
    for metric, fold_scores in zip(metric_functions, null_score_lists):
        fold_scores.append(metric(null_truth, null_model_prediction))

# Plain arithmetic mean of every per-fold list.
average_01_score, average_precision_score, average_recall_score, average_f1_score = (
    sum(fold_scores) / len(fold_scores) for fold_scores in knn_score_lists
)
average_null_model_score, average_null_model_precision, average_null_model_recall, average_null_model_f1 = (
    sum(fold_scores) / len(fold_scores) for fold_scores in null_score_lists
)

print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.2945007235890015
Average null model Precision score: 0.4977297525974856
Average null model Recall score: 0.7054992764109985
Average null model F1 score: 0.5836758958553583
Average KNN 0-1 score for k=5: 0.06816208393632418
Average Precision score for k=5: 0.9325392039311449
Average Recall score for k=5: 0.9318379160636759
Average F1 score for k=5: 0.9296356367983849


## Edited KNN ##

### Hyperparameter Tuning ###

#### Tuning k ####

In [8]:
# Tune k for edited KNN.
# For each candidate k, the first fold of every split yielded by the repeated,
# stratified cross-validation of data_train is reduced with the edited-KNN
# procedure, a model is fit on the reduced set and its 0-1 loss is measured on
# data_val; the k with the lowest mean loss is kept as best_k.
# NOTE(review): the second fold of each split is never used in this cell - confirm
# that scoring on data_val only is the intended tuning protocol.
# NOTE(review): the edited set and the standardized frames do not depend on k; if
# cross_validation and edited_knn_classificaton are deterministic they could be
# built once outside the k loop instead of once per candidate k.
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters:
    scores = []
    for train_set_1, _unused_fold in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

        # Method name is spelled as exposed by the KNN class.
        edited_train_set = knn_model.edited_knn_classificaton(train_set_1)

        # The edited set is the reference frame in both standardization calls.
        data_val_stand = data_processor.standardize_data(edited_train_set, data_val, features=features)
        data_train_stand = data_processor.standardize_data(edited_train_set,edited_train_set, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        # Called on the class, like the model performance cells; no instance is needed.
        score_1 = Evaluation.zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# min() returns the first minimal key, so ties resolve to the smallest k.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 loss score with k=1: 0.16647398843930636
Average 0-1 loss score with k=2: 0.18150289017341042
Average 0-1 loss score with k=3: 0.1598265895953757
Average 0-1 loss score with k=4: 0.16271676300578033
Average 0-1 loss score with k=5: 0.1676300578034682
Average 0-1 loss score with k=6: 0.17283236994219653
Average 0-1 loss score with k=7: 0.17254335260115608
Average 0-1 loss score with k=8: 0.1713872832369942
Average 0-1 loss score with k=9: 0.17803468208092485
Average 0-1 loss score with k=10: 0.17947976878612718
Average 0-1 loss score with k=11: 0.18612716763005782
Average 0-1 loss score with k=12: 0.18670520231213875
Average 0-1 loss score with k=13: 0.1921965317919075
Average 0-1 loss score with k=14: 0.1945086705202312
Best k is 3 with the lowest average 0-1 loss score of 0.1598265895953757


In [9]:
# Compare edited KNN (k=best_k) with the null model over the repeated,
# stratified cross-validation folds of data_train, then report the fold averages.
target_column = config['target_column']

# Per-fold KNN metrics.
zero_one_loss_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# Per-fold null model metrics.
null_model_scores = []
null_model_precision_scores, null_model_recall_scores, null_model_f1_scores = [], [], []

# Metric functions and both groups of result lists share one ordering
# (0-1 loss, precision, recall, F1) so they can be walked together with zip().
metric_functions = (Evaluation.zero_one_loss, Evaluation.precision, Evaluation.recall, Evaluation.f1_score)
knn_score_lists = (zero_one_loss_scores, precision_scores, recall_scores, f1_scores)
null_score_lists = (null_model_scores, null_model_precision_scores, null_model_recall_scores, null_model_f1_scores)

cv_folds = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)
for train_set, test_set in cv_folds:

    # Reduce the training fold first (method name is spelled as exposed by KNN).
    edited_train_set = knn_model.edited_knn_classificaton(train_set)

    # The edited set is the reference frame for both standardization calls.
    data_test_stand = data_processor.standardize_data(edited_train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(edited_train_set,edited_train_set, features=features)

    # KNN: predict the test fold from the edited training set and score it.
    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']
    knn_truth = data_test_stand[target_column]
    for metric, fold_scores in zip(metric_functions, knn_score_lists):
        fold_scores.append(metric(knn_truth, predictions_1))

    # Null model baseline, scored against the unstandardized test fold labels.
    # NOTE(review): naive_classifier only receives test_set - confirm the baseline
    # is not meant to be derived from train_set.
    null_model_prediction = null_model.naive_classifier(test_set)
    null_truth = test_set[target_column]
    for metric, fold_scores in zip(metric_functions, null_score_lists):
        fold_scores.append(metric(null_truth, null_model_prediction))

# Plain arithmetic mean of every per-fold list.
average_01_score, average_precision_score, average_recall_score, average_f1_score = (
    sum(fold_scores) / len(fold_scores) for fold_scores in knn_score_lists
)
average_null_model_score, average_null_model_precision, average_null_model_recall, average_null_model_f1 = (
    sum(fold_scores) / len(fold_scores) for fold_scores in null_score_lists
)

print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.2945007235890015
Average null model Precision score: 0.4977297525974856
Average null model Recall score: 0.7054992764109985
Average null model F1 score: 0.5836758958553583
Average KNN 0-1 score for k=3: 0.14037626628075256
Average Precision score for k=3: 0.8467414337010151
Average Recall score for k=3: 0.8596237337192475
Average F1 score for k=3: 0.8435838341675639


## Condensed KNN ##

#### Tuning k ####

In [10]:
# Tune k for condensed KNN.
# For each candidate k, the first fold of every split yielded by the repeated,
# stratified cross-validation of data_train is reduced with the condensed-KNN
# procedure, a model is fit on the reduced set and its 0-1 loss is measured on
# data_val; the k with the lowest mean loss is kept as best_k.
# NOTE(review): the second fold of each split is never used in this cell - confirm
# that scoring on data_val only is the intended tuning protocol.
# NOTE(review): the condensed set and the standardized frames do not depend on k;
# if cross_validation and condensed_knn_classification are deterministic they
# could be built once outside the k loop instead of once per candidate k.
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters:
    scores = []
    for train_set_1, _unused_fold in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

        condensed_train_set = knn_model.condensed_knn_classification(train_set_1)

        # The condensed set is the reference frame in both standardization calls.
        data_val_stand = data_processor.standardize_data(condensed_train_set, data_val, features=features)
        data_train_stand = data_processor.standardize_data(condensed_train_set,condensed_train_set, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        # Called on the class, like the model performance cells; no instance is needed.
        score_1 = Evaluation.zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# min() returns the first minimal key, so ties resolve to the smallest k.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.1121387283236994
Average 0-1 Loss score with k=2: 0.1283236994219653
Average 0-1 Loss score with k=3: 0.09277456647398843
Average 0-1 Loss score with k=4: 0.10867052023121386
Average 0-1 Loss score with k=5: 0.08092485549132947
Average 0-1 Loss score with k=6: 0.09132947976878612
Average 0-1 Loss score with k=7: 0.08092485549132947
Average 0-1 Loss score with k=8: 0.08930635838150289
Average 0-1 Loss score with k=9: 0.08121387283236994
Average 0-1 Loss score with k=10: 0.09364161849710984
Average 0-1 Loss score with k=11: 0.08901734104046244
Average 0-1 Loss score with k=12: 0.10057803468208093
Average 0-1 Loss score with k=13: 0.09421965317919076
Average 0-1 Loss score with k=14: 0.10173410404624277
Best k is 5 with the lowest average 0-1 loss score of 0.08092485549132947


### Model Performance ###

In [11]:
# Compare condensed KNN (k=best_k) with the null model over the repeated,
# stratified cross-validation folds of data_train, then report the fold averages.
target_column = config['target_column']

# Per-fold KNN metrics.
zero_one_loss_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# Per-fold null model metrics.
null_model_scores = []
null_model_precision_scores, null_model_recall_scores, null_model_f1_scores = [], [], []

# Metric functions and both groups of result lists share one ordering
# (0-1 loss, precision, recall, F1) so they can be walked together with zip().
metric_functions = (Evaluation.zero_one_loss, Evaluation.precision, Evaluation.recall, Evaluation.f1_score)
knn_score_lists = (zero_one_loss_scores, precision_scores, recall_scores, f1_scores)
null_score_lists = (null_model_scores, null_model_precision_scores, null_model_recall_scores, null_model_f1_scores)

cv_folds = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)
for train_set, test_set in cv_folds:

    # Reduce the training fold first.
    condensed_train_set = knn_model.condensed_knn_classification(train_set)

    # The condensed set is the reference frame for both standardization calls.
    data_test_stand = data_processor.standardize_data(condensed_train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(condensed_train_set,condensed_train_set, features=features)

    # KNN: predict the test fold from the condensed training set and score it.
    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']
    knn_truth = data_test_stand[target_column]
    for metric, fold_scores in zip(metric_functions, knn_score_lists):
        fold_scores.append(metric(knn_truth, predictions_1))

    # Null model baseline, scored against the unstandardized test fold labels.
    # NOTE(review): naive_classifier only receives test_set - confirm the baseline
    # is not meant to be derived from train_set.
    null_model_prediction = null_model.naive_classifier(test_set)
    null_truth = test_set[target_column]
    for metric, fold_scores in zip(metric_functions, null_score_lists):
        fold_scores.append(metric(null_truth, null_model_prediction))

# Plain arithmetic mean of every per-fold list.
average_01_score, average_precision_score, average_recall_score, average_f1_score = (
    sum(fold_scores) / len(fold_scores) for fold_scores in knn_score_lists
)
average_null_model_score, average_null_model_precision, average_null_model_recall, average_null_model_f1 = (
    sum(fold_scores) / len(fold_scores) for fold_scores in null_score_lists
)

print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.2945007235890015
Average null model Precision score: 0.4977297525974856
Average null model Recall score: 0.7054992764109985
Average null model F1 score: 0.5836758958553583
Average KNN 0-1 score for k=5: 0.07337192474674384
Average Precision score for k=5: 0.9315580120665174
Average Recall score for k=5: 0.9266280752532563
Average F1 score for k=5: 0.9265942947019177
