In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification
from data_configs.configs import *
import statistics
import numpy as np

# Dataset configuration shared by every helper below.
# `breast_cancer_config` is not defined in this notebook; presumably it is
# supplied by the star import from data_configs.configs — confirm.
config = breast_cancer_config
# Project helpers, all constructed from the same config object.
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
knn_model = KNN(config)
null_model = NullModelClassification(config=config)

### Data Load and Preprocessing ###

In [2]:
# Load the dataset through the project's DataProcessor.
raw_data = data_processor.load_data()

# Remove the 'Sample code number' column before modelling
# (presumably a record identifier rather than a feature — confirm).
raw_data_2 = raw_data.drop(columns=['Sample code number'])

# Fill in missing values; the imputation strategy lives inside DataProcessor.
data_1 = data_processor.impute_missing_values(raw_data_2)

# Encode nominal features first ...
data_2 = data_processor.encode_nominal_features(data_1)

# ... then ordinal features. `data_3` is the frame the rest of the notebook uses.
data_3 = data_processor.encode_ordinal_features(data_2)

In [3]:
# Display the preprocessed frame as a quick sanity check of the pipeline output.
data_3

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1,2
695,2,1,1,1,2,1.0,1,1,1,2
696,5,10,10,3,7,3.0,8,10,2,4
697,4,8,6,4,3,4.0,10,6,1,4


## KNN Model ##

### Hyperparameter Tuning ###

In [4]:
# Split the preprocessed data into two partitions with a fixed seed (random_state=42).
# Later cells cross-validate on `data_train` and score candidate k values on `data_val`.
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

#### Tuning k ####

In [5]:
# Grid-search k for plain KNN.
# For each candidate k, every cross-validation pair drawn from data_train
# contributes one 0-1 loss measured on the held-out data_val; the k with the
# lowest mean loss is kept in `best_k` for the Model Performance cell.
hyperparameters = np.arange(1, 15, 1)  # candidate k values 1..14
scores_dict = {}

for k in hyperparameters:
    scores = []
    # Only the first fold of each pair serves as the neighbour pool here; the
    # second fold is deliberately ignored. It is bound to a descriptive name
    # rather than `_` so IPython's last-output variable is not shadowed.
    for train_set_1, _unused_fold in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):
        predictions_1 = knn_model.knn_classifier(data_val, train_set_1, k=k)['Predicted Class']
        # Call the metric on the class, as the Model Performance cells do,
        # instead of building a throwaway Evaluation instance per fold.
        score_1 = Evaluation.zero_one_loss(data_val[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Lowest mean 0-1 loss wins; on ties `min` returns the first (smallest) k.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.05428571428571429
Average 0-1 Loss score with k=2: 0.08
Average 0-1 Loss score with k=3: 0.03357142857142857
Average 0-1 Loss score with k=4: 0.045714285714285714
Average 0-1 Loss score with k=5: 0.03142857142857143
Average 0-1 Loss score with k=6: 0.03571428571428571
Average 0-1 Loss score with k=7: 0.02785714285714285
Average 0-1 Loss score with k=8: 0.03
Average 0-1 Loss score with k=9: 0.027142857142857142
Average 0-1 Loss score with k=10: 0.03
Average 0-1 Loss score with k=11: 0.02857142857142857
Average 0-1 Loss score with k=12: 0.03142857142857143
Average 0-1 Loss score with k=13: 0.029285714285714283
Average 0-1 Loss score with k=14: 0.03357142857142857
Best k is 9 with the lowest average 0-1 loss score of 0.027142857142857142


### Model Performance ###

In [6]:
# Evaluate plain KNN at the tuned k against the null-model baseline.

# Per-fold metrics for KNN
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Per-fold metrics for the null model
null_model_scores = []
null_model_precision_scores = []
null_model_recall_scores = []
null_model_f1_scores = []

cv_pairs = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)
for train_set, test_set in cv_pairs:

    # KNN: classify the test fold using the train fold as the neighbour pool
    predictions_1 = knn_model.knn_classifier(test_set, train_set, k=best_k)['Predicted Class']
    y_true = test_set[config['target_column']]

    zero_one_loss_scores.append(Evaluation.zero_one_loss(y_true, predictions_1))
    precision_scores.append(Evaluation.precision(y_true, predictions_1))
    recall_scores.append(Evaluation.recall(y_true, predictions_1))
    f1_scores.append(Evaluation.f1_score(y_true, predictions_1))

    # Null model: same metrics on the same fold, used as the baseline.
    # NOTE(review): naive_classifier receives the test fold itself — confirm
    # the baseline is not being fitted on the labels it is scored against.
    null_model_prediction = null_model.naive_classifier(test_set)

    null_model_scores.append(Evaluation.zero_one_loss(y_true, null_model_prediction))
    null_model_precision_scores.append(Evaluation.precision(y_true, null_model_prediction))
    null_model_recall_scores.append(Evaluation.recall(y_true, null_model_prediction))
    null_model_f1_scores.append(Evaluation.f1_score(y_true, null_model_prediction))

# Mean of each KNN metric across folds
average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)

# Mean of each null-model metric across folds
average_null_model_score = sum(null_model_scores) / len(null_model_scores)
average_null_model_precision = sum(null_model_precision_scores) / len(null_model_precision_scores)
average_null_model_recall = sum(null_model_recall_scores) / len(null_model_recall_scores)
average_null_model_f1 = sum(null_model_f1_scores) / len(null_model_f1_scores)

# Report: baseline first, then KNN
print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.35062724014336916
Average null model Precision score: 0.421685374674015
Average null model Recall score: 0.6493727598566308
Average null model F1 score: 0.5113278649043025
Average KNN 0-1 score for k=9: 0.03577956989247312
Average Precision score for k=9: 0.9645357411379856
Average Recall score for k=9: 0.9642204301075268
Average F1 score for k=9: 0.9641369555377652


## Edited KNN ##

### Hyperparameter Tuning ###

#### Tuning k ####

In [7]:
# Grid-search k for edited KNN.
# For each candidate k, every cross-validation pair drawn from data_train is
# used to build an edited training set (first fold edited against the second),
# which is then scored by 0-1 loss on the held-out data_val. The k with the
# lowest mean loss is kept in `best_k` for the evaluation cell that follows.
hyperparameters = np.arange(1, 15, 1)  # candidate k values 1..14
scores_dict = {}

for k in hyperparameters:
    scores = []
    for train_set_1, train_set_2 in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

        # The second fold is passed as the second argument to the editing step.
        # NOTE(review): the evaluation cell edits against data_val instead —
        # confirm the difference is intended.
        edited_train_set = knn_model.edited_knn_classification(train_set_1, train_set_2)

        predictions_1 = knn_model.knn_classifier(data_val, edited_train_set, k=k)['Predicted Class']
        # Call the metric on the class, as the evaluation cells do, instead of
        # building a throwaway Evaluation instance per fold.
        score_1 = Evaluation.zero_one_loss(data_val[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Lowest mean 0-1 loss wins; on ties `min` returns the first (smallest) k.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 loss score with k=1: 0.034999999999999996
Average 0-1 loss score with k=2: 0.04928571428571428
Average 0-1 loss score with k=3: 0.029285714285714283
Average 0-1 loss score with k=4: 0.03357142857142857
Average 0-1 loss score with k=5: 0.029285714285714286
Average 0-1 loss score with k=6: 0.02857142857142857
Average 0-1 loss score with k=7: 0.027857142857142858
Average 0-1 loss score with k=8: 0.02857142857142857
Average 0-1 loss score with k=9: 0.02785714285714285
Average 0-1 loss score with k=10: 0.03214285714285715
Average 0-1 loss score with k=11: 0.03214285714285715
Average 0-1 loss score with k=12: 0.03642857142857143
Average 0-1 loss score with k=13: 0.03428571428571429
Average 0-1 loss score with k=14: 0.037142857142857144
Best k is 9 with the lowest average 0-1 loss score of 0.02785714285714285


In [8]:
# Evaluate edited KNN at the tuned k against the null-model baseline.

# Per-fold metrics for edited KNN
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Per-fold metrics for the null model
null_model_scores = []
null_model_precision_scores = []
null_model_recall_scores = []
null_model_f1_scores = []

cv_pairs = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)
for train_set, test_set in cv_pairs:

    # Edit the train fold, passing data_val as the second argument so the
    # test fold is not involved in the editing step.
    edited_train_set = knn_model.edited_knn_classification(train_set, data_val)

    # Edited KNN: classify the test fold using the edited train fold
    predictions_1 = knn_model.knn_classifier(test_set, edited_train_set, k=best_k)['Predicted Class']
    y_true = test_set[config['target_column']]

    zero_one_loss_scores.append(Evaluation.zero_one_loss(y_true, predictions_1))
    precision_scores.append(Evaluation.precision(y_true, predictions_1))
    recall_scores.append(Evaluation.recall(y_true, predictions_1))
    f1_scores.append(Evaluation.f1_score(y_true, predictions_1))

    # Null model: same metrics on the same fold, used as the baseline.
    # NOTE(review): naive_classifier receives the test fold itself — confirm
    # the baseline is not being fitted on the labels it is scored against.
    null_model_prediction = null_model.naive_classifier(test_set)

    null_model_scores.append(Evaluation.zero_one_loss(y_true, null_model_prediction))
    null_model_precision_scores.append(Evaluation.precision(y_true, null_model_prediction))
    null_model_recall_scores.append(Evaluation.recall(y_true, null_model_prediction))
    null_model_f1_scores.append(Evaluation.f1_score(y_true, null_model_prediction))

# Mean of each edited-KNN metric across folds
average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)

# Mean of each null-model metric across folds
average_null_model_score = sum(null_model_scores) / len(null_model_scores)
average_null_model_precision = sum(null_model_precision_scores) / len(null_model_precision_scores)
average_null_model_recall = sum(null_model_recall_scores) / len(null_model_recall_scores)
average_null_model_f1 = sum(null_model_f1_scores) / len(null_model_f1_scores)

# Report: baseline first, then edited KNN
print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.35062724014336916
Average null model Precision score: 0.421685374674015
Average null model Recall score: 0.6493727598566308
Average null model F1 score: 0.5113278649043025
Average KNN 0-1 score for k=9: 0.04078981054787506
Average Precision score for k=9: 0.959379852134522
Average Recall score for k=9: 0.959210189452125
Average F1 score for k=9: 0.9590563638775824


## Condensed KNN ##

#### Tuning k ####

In [9]:
# Grid-search k for condensed KNN.
# For each candidate k, the first fold of every cross-validation pair drawn
# from data_train is condensed and then scored by 0-1 loss on the held-out
# data_val. The k with the lowest mean loss is kept in `best_k` for the
# Model Performance cell.
hyperparameters = np.arange(1, 15, 1)  # candidate k values 1..14
scores_dict = {}

for k in hyperparameters:
    scores = []
    # The second fold of each pair is deliberately ignored here. It is bound to
    # a descriptive name rather than `_` so IPython's last-output variable is
    # not shadowed.
    for train_set_1, _unused_fold in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

        condensed_train_set = knn_model.condensed_knn_classification(train_set_1)
        predictions_1 = knn_model.knn_classifier(data_val, condensed_train_set, k=k)['Predicted Class']
        # Call the metric on the class, as the Model Performance cells do,
        # instead of building a throwaway Evaluation instance per fold.
        score_1 = Evaluation.zero_one_loss(data_val[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Lowest mean 0-1 loss wins; on ties `min` returns the first (smallest) k.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.07642857142857142
Average 0-1 Loss score with k=2: 0.15285714285714286
Average 0-1 Loss score with k=3: 0.05285714285714286
Average 0-1 Loss score with k=4: 0.08785714285714284
Average 0-1 Loss score with k=5: 0.049999999999999996
Average 0-1 Loss score with k=6: 0.05928571428571429
Average 0-1 Loss score with k=7: 0.04285714285714286
Average 0-1 Loss score with k=8: 0.045
Average 0-1 Loss score with k=9: 0.07999999999999999
Average 0-1 Loss score with k=10: 0.05
Average 0-1 Loss score with k=11: 0.085
Average 0-1 Loss score with k=12: 0.10642857142857143
Average 0-1 Loss score with k=13: 0.21500000000000002
Average 0-1 Loss score with k=14: 0.09714285714285714
Best k is 7 with the lowest average 0-1 loss score of 0.04285714285714286


### Model Performance ###

In [11]:
# Evaluate condensed KNN at the tuned k against the null-model baseline.

# Per-fold metrics for condensed KNN
zero_one_loss_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Per-fold metrics for the null model
null_model_scores = []
null_model_precision_scores = []
null_model_recall_scores = []
null_model_f1_scores = []

cv_pairs = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)
for train_set, test_set in cv_pairs:

    # Condense the train fold before using it as the neighbour pool
    condensed_train_set = knn_model.condensed_knn_classification(train_set)

    # Condensed KNN: classify the test fold using the condensed train fold
    predictions_1 = knn_model.knn_classifier(test_set, condensed_train_set, k=best_k)['Predicted Class']
    y_true = test_set[config['target_column']]

    zero_one_loss_scores.append(Evaluation.zero_one_loss(y_true, predictions_1))
    precision_scores.append(Evaluation.precision(y_true, predictions_1))
    recall_scores.append(Evaluation.recall(y_true, predictions_1))
    f1_scores.append(Evaluation.f1_score(y_true, predictions_1))

    # Null model: same metrics on the same fold, used as the baseline.
    # NOTE(review): naive_classifier receives the test fold itself — confirm
    # the baseline is not being fitted on the labels it is scored against.
    null_model_prediction = null_model.naive_classifier(test_set)

    null_model_scores.append(Evaluation.zero_one_loss(y_true, null_model_prediction))
    null_model_precision_scores.append(Evaluation.precision(y_true, null_model_prediction))
    null_model_recall_scores.append(Evaluation.recall(y_true, null_model_prediction))
    null_model_f1_scores.append(Evaluation.f1_score(y_true, null_model_prediction))

# Mean of each condensed-KNN metric across folds
average_01_score = sum(zero_one_loss_scores) / len(zero_one_loss_scores)
average_precision_score = sum(precision_scores) / len(precision_scores)
average_recall_score = sum(recall_scores) / len(recall_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)

# Mean of each null-model metric across folds
average_null_model_score = sum(null_model_scores) / len(null_model_scores)
average_null_model_precision = sum(null_model_precision_scores) / len(null_model_precision_scores)
average_null_model_recall = sum(null_model_recall_scores) / len(null_model_recall_scores)
average_null_model_f1 = sum(null_model_f1_scores) / len(null_model_f1_scores)

# Report: baseline first, then condensed KNN
print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.35062724014336916
Average null model Precision score: 0.421685374674015
Average null model Recall score: 0.6493727598566308
Average null model F1 score: 0.5113278649043025
Average KNN 0-1 score for k=7: 0.03899385560675883
Average Precision score for k=7: 0.9615396077888876
Average Recall score for k=7: 0.9610061443932413
Average F1 score for k=7: 0.960945312796736
