In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification
from data_configs.configs import *
import statistics
import numpy as np

config = car_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
knn_model = KNN(config)
null_model = NullModelClassification(config=config)

### Data Load and Preprocessing ###

In [2]:
raw_data = data_processor.load_data()

data_1 = data_processor.impute_missing_values(raw_data)

data_2 = data_processor.encode_nominal_features(data_1)

data_3 = data_processor.encode_ordinal_features(data_2)

In [3]:
data_3

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
0,0,0,0,0,0,0,unacc
1,0,0,0,0,0,1,unacc
2,0,0,0,0,0,2,unacc
3,0,0,0,0,1,0,unacc
4,0,0,0,0,1,1,unacc
...,...,...,...,...,...,...,...
1723,3,3,3,2,1,1,good
1724,3,3,3,2,1,2,vgood
1725,3,3,3,2,2,0,unacc
1726,3,3,3,2,2,1,good


## KNN Model ##

### Hyperparameter Tuning ###

In [4]:
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

In [8]:
features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']

#### Tuning k ####

In [9]:
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
        
        data_val_stand = data_processor.standardize_data(train_set_1, data_val, features=features)
        data_train_stand = data_processor.standardize_data(train_set_1,train_set_1, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        score_1 = Evaluation().zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.11040462427745665
Average 0-1 Loss score with k=2: 0.12919075144508668
Average 0-1 Loss score with k=3: 0.09479768786127167
Average 0-1 Loss score with k=4: 0.09277456647398843
Average 0-1 Loss score with k=5: 0.07919075144508671
Average 0-1 Loss score with k=6: 0.0907514450867052
Average 0-1 Loss score with k=7: 0.08005780346820808
Average 0-1 Loss score with k=8: 0.08583815028901734
Average 0-1 Loss score with k=9: 0.09017341040462427
Average 0-1 Loss score with k=10: 0.08815028901734104
Average 0-1 Loss score with k=11: 0.09219653179190752
Average 0-1 Loss score with k=12: 0.09797687861271676
Average 0-1 Loss score with k=13: 0.09710982658959537
Average 0-1 Loss score with k=14: 0.09421965317919075
Best k is 5 with the lowest average 0-1 loss score of 0.07919075144508671


### Model Performance ###

In [12]:
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
null_model_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
    data_test_stand = data_processor.standardize_data(train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(train_set,train_set, features=features)

    # Train and evaluate 
    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']
    
    zero_one_loss_score = Evaluation.zero_one_loss(data_test_stand[config['target_column']], predictions_1)
    precision_score = Evaluation.precision(data_test_stand[config['target_column']], predictions_1)
    recall_score = Evaluation.recall(data_test_stand[config['target_column']], predictions_1)
    f1_score = Evaluation.f1_score(data_test_stand[config['target_column']], predictions_1)
    
    zero_one_loss_scores.append(zero_one_loss_score)
    precision_scores.append(precision_score)
    recall_scores.append(recall_score)
    f1_scores.append(f1_score)

    null_model_prediction = null_model.naive_classifier(test_set)
    null_model_score = Evaluation.zero_one_loss(test_set[config['target_column']],null_model_prediction)
    null_model_scores.append(null_model_score) 


average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)
average_null_model_score = sum(null_model_scores) / len(null_model_scores)

print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.2945007235890015
Average KNN 0-1 score for k=5: 0.06960926193921851
Average Precision score for k=5: 0.9306892571715286
Average Recall score for k=5: 0.9303907380607817
Average F1 score for k=5: 0.9282884220566763


## Edited KNN ##

### Hyperparameter Tuning ###

#### Tuning k ####

In [16]:
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
        
        edited_train_set = knn_model.edited_knn_classificaton(train_set_1)

        data_val_stand = data_processor.standardize_data(edited_train_set, data_val, features=features)
        data_train_stand = data_processor.standardize_data(edited_train_set,edited_train_set, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        score_1 = Evaluation().zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 loss score with k=1: 0.16647398843930636
Average 0-1 loss score with k=2: 0.18352601156069365
Average 0-1 loss score with k=3: 0.1644508670520231
Average 0-1 loss score with k=4: 0.16271676300578036
Average 0-1 loss score with k=5: 0.16965317919075146
Average 0-1 loss score with k=6: 0.17630057803468208
Average 0-1 loss score with k=7: 0.17427745664739885
Average 0-1 loss score with k=8: 0.17254335260115605
Average 0-1 loss score with k=9: 0.17890173410404622


In [None]:
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
null_model_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
    edited_train_set = knn_model.edited_knn_classificaton(train_set)

    data_test_stand = data_processor.standardize_data(edited_train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(edited_train_set,edited_train_set, features=features)

    predictions_1 = knn_model.knn_classifier(test_set, data_train_stand, k=best_k)['Predicted Class']

    zero_one_loss_score = Evaluation.zero_one_loss(data_test_stand[config['target_column']], predictions_1)
    precision_score = Evaluation.precision(data_test_stand[config['target_column']], predictions_1)
    recall_score = Evaluation.recall(data_test_stand[config['target_column']], predictions_1)
    f1_score = Evaluation.f1_score(data_test_stand[config['target_column']], predictions_1)
    
    zero_one_loss_scores.append(zero_one_loss_score)
    precision_scores.append(precision_score)
    recall_scores.append(recall_score)
    f1_scores.append(f1_score)

    null_model_prediction = null_model.naive_classifier(test_set)
    null_model_score = Evaluation.zero_one_loss(test_set[config['target_column']],null_model_prediction)
    null_model_scores.append(null_model_score) 

average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)
average_null_model_score = sum(null_model_scores) / len(null_model_scores)

print(f"Average null model 0-1 score: {average_null_model_score}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")

Average null model 0-1 score: 0.35062724014336916
Average KNN 0-1 score for k=9: 0.040788530465949824
Average Precision score for k=9: 0.9594643546065822
Average Recall score for k=9: 0.9592114695340502
Average F1 score for k=9: 0.9590275641565421


## Condensed Knn ##

#### Tuning k ####

In [None]:
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
        
        condesed_train_set = knn_model.condensed_knn_classification(train_set_1)

        data_val_stand = data_processor.standardize_data(condesed_train_set, data_val, features=features)
        data_train_stand = data_processor.standardize_data(condesed_train_set,condesed_train_set, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        score_1 = Evaluation().zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.07714285714285714
Average 0-1 Loss score with k=2: 0.155
Average 0-1 Loss score with k=3: 0.05714285714285714
Average 0-1 Loss score with k=4: 0.08714285714285713
Average 0-1 Loss score with k=5: 0.05071428571428571
Average 0-1 Loss score with k=6: 0.05142857142857142
Average 0-1 Loss score with k=7: 0.042142857142857135
Average 0-1 Loss score with k=8: 0.05
Average 0-1 Loss score with k=9: 0.07928571428571428
Average 0-1 Loss score with k=10: 0.04357142857142857
Average 0-1 Loss score with k=11: 0.14785714285714285
Average 0-1 Loss score with k=12: 0.08357142857142856
Average 0-1 Loss score with k=13: 0.22142857142857145
Average 0-1 Loss score with k=14: 0.22214285714285714
Best k is 7 with the lowest average 0-1 loss score of 0.042142857142857135


### Model Performance ###

In [None]:
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
null_model_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
    condesed_train_set = knn_model.condensed_knn_classification(train_set)

    data_test_stand = data_processor.standardize_data(condesed_train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(condesed_train_set,condesed_train_set, features=features)

    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']

    zero_one_loss_score = Evaluation.zero_one_loss(data_test_stand[config['target_column']], predictions_1)
    precision_score = Evaluation.precision(data_test_stand[config['target_column']], predictions_1)
    recall_score = Evaluation.recall(data_test_stand[config['target_column']], predictions_1)
    f1_score = Evaluation.f1_score(data_test_stand[config['target_column']], predictions_1)
    
    zero_one_loss_scores.append(zero_one_loss_score)
    precision_scores.append(precision_score)
    recall_scores.append(recall_score)
    f1_scores.append(f1_score)

    null_model_prediction = null_model.naive_classifier(test_set)
    null_model_score = Evaluation.zero_one_loss(test_set[config['target_column']],null_model_prediction)
    null_model_scores.append(null_model_score) 

average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)
average_null_model_score = sum(null_model_scores) / len(null_model_scores)

print(f"Average null model 0-1 score: {average_null_model_score}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")

Average null model 0-1 score: 0.35062724014336916
Average KNN 0-1 score for k=7: 0.07698412698412699
Average Precision score for k=7: 0.9405529232164694
Average Recall score for k=7: 0.9230158730158731
Average F1 score for k=7: 0.9221589014291244
