In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification, NullModelRegression
from data_configs.configs import *
import statistics

# Dataset configuration shared by every helper object built below.
# (`breast_cancer_config` is presumably supplied by the star import from data_configs.configs — confirm.)
config = breast_cancer_config
# Project helpers, all constructed from the same config.
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
classification_nullmodel = NullModelClassification(config=config)
# NOTE(review): the regression null model is not referenced by any cell visible here — confirm it is still needed.
regression_nullmodel = NullModelRegression(config=config)
knn_model = KNN(config)

In [3]:
# Data Processing
# Each step produces a new variable from the previous one; `data_3` is the
# frame handed to the train/validation partition.

# Load the raw dataset through the project's DataProcessor.
raw_data = data_processor.load_data()

# Drop the 'Sample code number' column before any further processing.
raw_data_2 = raw_data.drop(columns=['Sample code number'])

# Fill in missing values (strategy is defined inside DataProcessor).
data_1 = data_processor.impute_missing_values(raw_data_2)

# Encode nominal features (which columns count as nominal presumably comes from config — confirm).
data_2 = data_processor.encode_nominal_features(data_1)

# Encode ordinal features (likewise presumably driven by config — confirm).
data_3 = data_processor.encode_ordinal_features(data_2)

In [4]:
# Split the processed data into a training part and a held-out validation part.
# random_state=42 is passed so the partition is repeatable (assuming random_partition honours it).
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

In [5]:
# Size of the training partition.
len(data_train)

559

In [6]:
# Build the edited training set from the full training partition and display it.
# NOTE(review): the method name is spelled `edited_knn_classificaton` (missing "i") in the KNN API as called here.
edited_train = knn_model.edited_knn_classificaton(data_train)
edited_train

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
82,5,2,1,1,2,1.0,3,1,1,2
51,5,3,3,4,2,4.0,3,4,1,4
220,1,1,1,2,2,1.0,3,1,1,2
559,5,1,1,1,2,1.0,2,1,1,2
544,2,1,3,2,2,1.0,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...
71,6,10,2,8,10,2.0,7,8,10,4
106,10,10,10,8,2,10.0,4,1,1,4
270,8,4,7,1,3,10.0,3,9,2,4
435,10,8,10,1,3,10.0,5,1,1,4


In [7]:
# Classify the validation set with k=3 against the edited training set,
# then report the zero-one loss of those predictions.
validation_output = knn_model.knn_classifier(data_val, edited_train, 3)
pred_class = validation_output['Predicted Class']

validation_truth = data_val[config['target_column']]
Evaluation().zero_one_loss(validation_truth, pred_class)

0.02142857142857143

In [8]:
# Precision of the k=3 predictions (`pred_class`) against the validation labels.
Evaluation().precision(data_val[config['target_column']],pred_class)

0.9785579004329005

In [9]:
# Recall of the k=3 predictions (`pred_class`) against the validation labels.
Evaluation().recall(data_val[config['target_column']],pred_class)

0.9785714285714285

In [10]:
# Sweep k for plain k-NN. For each k, fit on the first half of every fold
# produced by cross_validation (n_splits=2, n_repeats=5, stratified) and
# score the predictions on the held-out validation set.
hyperparameters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# The validation labels depend on neither k nor the fold, so look them up once.
val_labels = data_val[config['target_column']]

for k in hyperparameters:
    scores = []
    # Only the first half of each pair is used for fitting; the second half is
    # not needed here, and no fold index is required.
    for train_set_1, _train_set_2 in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=True):
        predictions_1 = knn_model.knn_classifier(data_val, train_set_1, k=k)['Predicted Class']
        score = Evaluation().zero_one_loss(val_labels, predictions_1)
        scores.append(score)
    # The reported "score" is the mean of the zero_one_loss values over the folds.
    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")

Average score for k=1: 0.05214285714285714
Average score for k=2: 0.02071428571428572
Average score for k=3: 0.030714285714285715
Average score for k=4: 0.027857142857142858
Average score for k=5: 0.02857142857142857
Average score for k=6: 0.02642857142857143
Average score for k=7: 0.02642857142857143
Average score for k=8: 0.027142857142857146
Average score for k=9: 0.030714285714285715
Average score for k=10: 0.027857142857142858


In [11]:
# Sweep k for condensed k-NN. The condensed training set is derived from the
# first half of every fold produced by cross_validation (n_splits=2,
# n_repeats=5, stratified); predictions are scored on the validation set.
hyperparameters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# The validation labels depend on neither k nor the fold, so look them up once.
val_labels = data_val[config['target_column']]

# condensed_knn_classification is called with the training half only (no k),
# so build each condensed set once and reuse it for every k instead of
# rebuilding it ten times. This also means every k is scored on the same folds.
# NOTE(review): assumes knn_classifier does not modify the training frame it is given — confirm.
condensed_folds = [
    knn_model.condensed_knn_classification(train_set_1)
    for train_set_1, _train_set_2 in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=True)
]

for k in hyperparameters:
    scores = []
    for condensed_train in condensed_folds:
        predictions_1 = knn_model.knn_classifier(data_val, condensed_train, k=k)['Predicted Class']
        score = Evaluation().zero_one_loss(val_labels, predictions_1)
        scores.append(score)
    # The reported "score" is the mean of the zero_one_loss values over the folds.
    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")

Average score for k=1: 0.08428571428571427


In [9]:
# Sweep k for edited k-NN. The edited training set is derived from the first
# half of every fold produced by cross_validation (n_splits=2, n_repeats=5,
# stratified); predictions are scored on the validation set.
hyperparameters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# The validation labels depend on neither k nor the fold, so look them up once.
val_labels = data_val[config['target_column']]

# edited_knn_classificaton is called with the training half only (no k), so
# build each edited set once and reuse it for every k instead of rebuilding it
# ten times. This also means every k is scored on the same folds.
# The per-fold sets live in their own list so the notebook-level `edited_train`
# (built from the full training partition) is not overwritten by this sweep.
# NOTE(review): assumes knn_classifier does not modify the training frame it is given — confirm.
edited_folds = [
    knn_model.edited_knn_classificaton(train_set_1)
    for train_set_1, _train_set_2 in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=True)
]

for k in hyperparameters:
    scores = []
    for fold_edited_train in edited_folds:
        predictions_1 = knn_model.knn_classifier(data_val, fold_edited_train, k=k)['Predicted Class']
        score = Evaluation().zero_one_loss(val_labels, predictions_1)
        scores.append(score)
    # The reported "score" is the mean of the zero_one_loss values over the folds.
    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")

Average score for k=1: 0.03571428571428571
Average score for k=2: 0.04357142857142858
Average score for k=3: 0.027857142857142858
Average score for k=4: 0.034999999999999996
Average score for k=5: 0.027857142857142858
Average score for k=6: 0.03142857142857143
Average score for k=7: 0.030000000000000006
Average score for k=8: 0.032857142857142856
Average score for k=9: 0.035
Average score for k=10: 0.03571428571428571


In [10]:
# Baseline: null-model predictions for the validation set, scored with the
# same zero-one loss used for the k-NN runs.
null_model = NullModelClassification(config)
null_model_results = null_model.naive_classifier(data_val)

baseline_truth = data_val[config["target_column"]]
Evaluation().zero_one_loss(baseline_truth, null_model_results)

0.32142857142857145