In [137]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification
from data_configs.configs import *
import numpy as np
import statistics


# Select the dataset configuration (`car_config` arrives via the wildcard import
# from data_configs.configs) and build every helper from that single config so
# preprocessing, splitting and the models all agree on the same dataset settings.
config = car_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
knn_model = KNN(config)
null_model = NullModelClassification(config=config)

## Data Preprocessing ##

In [138]:
# Load the raw dataset through the config-driven DataProcessor.
raw_data = data_processor.load_data()

In [139]:
# Display the loaded frame for a quick visual check of columns and values.
raw_data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


#### Impute Missing Values ####

In [140]:
# Impute missing values; the result gets its own name so the next two cells can
# compare null counts before (raw_data) and after (data_1).
data_1 = data_processor.impute_missing_values(raw_data)

In [141]:
# Per-column count of missing entries in the raw data (before imputation).
null_values = raw_data.isna().sum()
print(null_values)

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
Class       0
dtype: int64


In [142]:
# Per-column count of missing entries after imputation (expected to be all zero).
null_values = data_1.isna().sum()
print(null_values)

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
Class       0
dtype: int64


#### Encode Nominal and Ordinal Features ####

In [143]:
# Encode nominal features, then display the result.
# NOTE(review): the displayed frame is identical to the input, presumably because
# this config declares no nominal columns — confirm against the config.
data_2 = data_processor.encode_nominal_features(data_1)
data_2

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [144]:
# Encode ordinal features as integer codes, then display the result. In the
# output below every feature column becomes an integer (e.g. buying: vhigh -> 0,
# low -> 3) while the Class column keeps its string labels.
data_3 = data_processor.encode_ordinal_features(data_2)
data_3

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
0,0,0,0,0,0,0,unacc
1,0,0,0,0,0,1,unacc
2,0,0,0,0,0,2,unacc
3,0,0,0,0,1,0,unacc
4,0,0,0,0,1,1,unacc
...,...,...,...,...,...,...,...
1723,3,3,3,2,1,1,good
1724,3,3,3,2,1,2,vgood
1725,3,3,3,2,2,0,unacc
1726,3,3,3,2,2,1,good


In [152]:
# Feature columns to standardize for the car dataset. `features` is reused by
# the hyperparameter-tuning and final-evaluation cells further down.
features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
# Alternative feature list, matching the columns of the regression dataset shown
# in the KNN Regression section; swap it in when running with that dataset:
# features=['Length', 'Diameter', 'Height', 'Whole weight','Shucked weight', 'Viscera weight', 'Shell weight']
# Demo only: standardize the full encoded frame against itself and use
# describe() to check that each feature ends up with mean ~0 and std 1.
data_standardized = data_processor.standardize_data(data_3,data_3, features=features)
data_standardized.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
count,1728.0,1728.0,1728.0,1728.0,1728.0,1728.0
mean,0.0,0.0,2.4671620000000003e-17,0.0,-1.027984e-17,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.341253,-1.341253,-1.341253,-1.22439,-1.22439,-1.22439
25%,-0.670626,-0.670626,-0.6706263,-1.22439,-1.22439,-1.22439
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.670626,0.670626,0.6706263,1.22439,1.22439,1.22439
max,1.341253,1.341253,1.341253,1.22439,1.22439,1.22439


## Cross Validation ##

In [146]:
# Hold out 20% of the encoded (not yet standardized) data as a validation set;
# the remaining rows form the pool used for cross-validation.
# NOTE(review): no seed is passed here, so the split may differ between runs —
# confirm whether random_partition seeds itself internally.
data_train, data_val = cross_validator.random_partition(data_3,val_size=0.2)

In [147]:
# Partition sizes: the training pool is printed, the validation size is shown
# as the cell's result value.
print(len(data_train))
len(data_val)

1382


346

In [148]:
# Walk through every repeated, stratified 2-fold split of the training pool and
# print both halves. After the loop, `train_set` and `test_set` stay bound to
# the LAST fold; the model-demo cells below rely on that leftover state.
demo_folds = cross_validator.cross_validation(
    data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True
)
for i, (train_set, test_set) in enumerate(demo_folds):
    print(train_set)
    print(test_set)

      buying  maint  doors  persons  lug_boot  safety  Class
1474       3      1      2        1         2       1    acc
842        1      3      3        0         1       2  unacc
1481       3      1      2        2         1       2  vgood
393        0      3      2        1         2       0  unacc
615        1      1      2        2         1       0  unacc
...      ...    ...    ...      ...       ...     ...    ...
381        0      3      2        0         1       0  unacc
406        0      3      3        0         0       1  unacc
604        1      1      2        1         0       1  unacc
437        1      0      0        0         1       2  unacc
1032       2      1      2        0         2       0  unacc

[691 rows x 7 columns]
      buying  maint  doors  persons  lug_boot  safety  Class
104        0      0      3        2         1       2  unacc
182        0      1      2        2         0       2  unacc
1086       2      2      0        0         2       0  unacc


## Model Algorithm ##

#### Distance Calculation ####

In [125]:
# Use the second row of the validation set, as a plain value array, as the
# query point for the distance demo.
# NOTE(review): the saved output below holds 11 numeric values, unlike the
# 7-column car frame, so it appears to come from a run on a different dataset —
# re-run to refresh.
data_val_point = data_val.iloc[1].values
data_val_point

array([ 0.525 ,  0.4   ,  0.14  ,  0.7325,  0.334 ,  0.1575,  0.17  ,
       11.    ,  0.    ,  0.    ,  1.    ])

In [126]:
# All rows of the training pool as one value array: the candidate neighbours
# for the distance demo below.
data_train_points = data_train.values
data_train_points

array([[0.52 , 0.415, 0.145, ..., 1.   , 0.   , 0.   ],
       [0.585, 0.465, 0.165, ..., 1.   , 0.   , 0.   ],
       [0.3  , 0.22 , 0.065, ..., 0.   , 1.   , 0.   ],
       ...,
       [0.53 , 0.415, 0.11 , ..., 0.   , 1.   , 0.   ],
       [0.46 , 0.335, 0.11 , ..., 0.   , 1.   , 0.   ],
       [0.54 , 0.415, 0.13 , ..., 0.   , 0.   , 1.   ]])

In [127]:
# Distance from the query point to the training rows; the output is a 1-D array
# of distances.
knn_model.calc_distance(data_train_points, data_val_point)

array([1.73750144, 2.46319107, 6.21060629, ..., 2.45647578, 3.33390945,
       2.00560471])

#### Nearest Neighbors ####

In [128]:
# The k=5 nearest training rows for the query point.
# NOTE(review): the output looks like (distance, target value) pairs — confirm
# against KNN.k_nearest_neighbors.
knn_model.k_nearest_neighbors(data_val_point,data_train,k=5)

[(10.000037237430668, 8),
 (10.000101511984766, 10),
 (10.000102474474948, 9),
 (10.000110674387559, 9),
 (10.000122374251227, 10)]

#### KNN Classification ####

In [149]:
# Run the k=5 classifier with test_set as the query rows and train_set as the
# reference rows; the result carries an added 'Predicted Class' column.
# train_set/test_set are the last fold left over from the cross-validation loop.
knn_model.knn_classifier(test_set,train_set,k=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class,Predicted Class
1474,3,1,2,1,2,1,acc,acc
842,1,3,3,0,1,2,unacc,unacc
1481,3,1,2,2,1,2,vgood,vgood
615,1,1,2,2,1,0,unacc,unacc
1708,3,3,3,0,2,1,unacc,unacc
...,...,...,...,...,...,...,...,...
406,0,3,3,0,0,1,unacc,unacc
91,0,0,3,1,0,1,unacc,unacc
604,1,1,2,1,0,1,unacc,unacc
437,1,0,0,0,1,2,unacc,unacc


In [150]:
# Score the k=5 classifier on the leftover fold with the 0-1 loss. The true
# labels are read through config['target_column'], as the tuning and evaluation
# cells do, so the cell keeps working when the config changes.
predictions = knn_model.knn_classifier(test_set,train_set,k=5)['Predicted Class']
zero_one_loss = Evaluation.zero_one_loss(test_set[config['target_column']], predictions)
print(zero_one_loss)

0.0824891461649783


## KNN Regression ##

In [129]:
# gamma for knn_regression: the inverse of the sample standard deviation of the
# target column over the training pool. `gamma` is reused by later regression cells.
# NOTE(review): the saved output shows a numeric 'Rings' target, so this cell was
# last run under a regression config rather than the car_config set at the top.
target_spread = statistics.stdev(data_train[config['target_column']])
gamma = 1/target_spread
knn_model.knn_regression(test_set,train_set,k=5,gamma=gamma)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M,Predicted Value
2295,0.520,0.415,0.145,0.8045,0.3325,0.1725,0.2850,10,1,0,0,13.799850
3106,0.300,0.220,0.065,0.1235,0.0590,0.0260,0.0315,5,0,1,0,6.200000
1351,0.595,0.465,0.155,1.0260,0.4645,0.1120,0.3050,12,1,0,0,9.799910
3163,0.550,0.440,0.125,0.7650,0.3300,0.2125,0.2450,9,1,0,0,12.400560
3803,0.320,0.230,0.060,0.1290,0.0615,0.0275,0.0355,7,0,1,0,6.399982
...,...,...,...,...,...,...,...,...,...,...,...,...
2769,0.565,0.430,0.150,0.8215,0.3320,0.1685,0.2900,11,0,1,0,9.800156
2513,0.465,0.350,0.135,0.6265,0.2590,0.1445,0.1750,8,1,0,0,12.599989
1299,0.530,0.415,0.110,0.5745,0.2525,0.1235,0.1890,9,0,1,0,9.000000
2238,0.460,0.335,0.110,0.4440,0.2250,0.0745,0.1100,8,0,1,0,7.800050


In [133]:
# Mean squared error of the k=5 regressor on the leftover fold. The true values
# are read through config['target_column'] (the same column used to derive
# `gamma`) rather than a hard-coded column name, so the cell follows the
# active config. `gamma` comes from the previous regression cell.
predictions = knn_model.knn_regression(test_set,train_set,k=5,gamma=gamma)['Predicted Value']
mse = Evaluation.mean_squared_error(test_set[config['target_column']],predictions)
print(mse)

5.057641323216245


## Edited KNN ##

In [99]:
# Edited KNN for classification, called with train_set and data_val. Per the
# printed log, editing repeats while the 0-1 loss improves and stops once it
# degrades; the displayed result is a frame of training rows.
knn_model.edited_knn_classification(train_set,data_val)

Editing training set...
Zero one loss improved from 1 to 0.21098265895953758. Conitnuing...
Editing training set...
Zero one loss improved from 0.21098265895953758 to 0.20520231213872833. Conitnuing...
Editing training set...
Zero one loss improved from 0.20520231213872833 to 0.19653179190751446. Conitnuing...
Editing training set...
Zero-one loss degraded from 0.19653179190751446 to 0.1994219653179191. Stopping editing. 


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
129,0,1,0,2,1,0,unacc
899,2,0,1,0,2,2,unacc
89,0,0,3,0,2,2,unacc
1371,3,0,2,2,1,0,unacc
1227,2,3,1,1,1,0,unacc
...,...,...,...,...,...,...,...
762,1,3,0,0,2,0,unacc
1286,2,3,3,1,2,2,vgood
526,1,0,3,1,1,1,unacc
321,0,2,3,2,2,0,unacc


## Edited KNN Regression ##

In [136]:
# epsilon for edited KNN regression: the sample standard deviation of the target
# column within the current training fold. `gamma` is reused from the KNN
# Regression section above.
fold_targets = train_set[config['target_column']]
epsilon = statistics.stdev(fold_targets)
knn_model.edited_knn_regression(train_set, test_set, epsilon=epsilon,gamma=gamma)

Editing training set...
MSE improved from inf to 7.036526946107784. Conitnuing...
Editing training set...
MSE improved from 7.036526946107784 to 7.002994011976048. Conitnuing...
Editing training set...
MSE improved from 7.002994011976048 to 6.930538922155689. Conitnuing...


Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_F,Sex_I,Sex_M
3571,0.585,0.465,0.165,0.9355,0.4035,0.2275,0.259,9,1,0,0
1316,0.555,0.440,0.140,0.8705,0.4070,0.1560,0.255,9,0,0,1
1526,0.705,0.560,0.165,1.6750,0.7970,0.4095,0.388,10,0,0,1
776,0.505,0.385,0.145,0.6775,0.2360,0.1790,0.200,15,0,0,1
3094,0.525,0.400,0.130,0.6220,0.2655,0.1470,0.184,9,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
4120,0.350,0.265,0.090,0.1970,0.0730,0.0365,0.077,7,0,0,1
2699,0.670,0.500,0.200,1.2690,0.5760,0.2985,0.351,11,0,0,1
1934,0.620,0.470,0.150,1.3090,0.5870,0.4405,0.325,9,0,0,1
654,0.370,0.280,0.110,0.2305,0.0945,0.0465,0.075,10,1,0,0


## Hyperparameter Tuning ##

In [156]:
# Tune k: for each candidate, fit on the training half of every cross-validation
# fold, score on the held-out validation set (data_val), and average the 0-1
# loss over the folds. The k with the lowest average loss becomes `best_k`.
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters:
    scores = []
    # Only the training half of each fold is used here; the fold's test half is
    # reserved for the final evaluation cell.
    for train_set, _ in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True):

        # Both frames are standardized with train_set passed as the first argument.
        # NOTE(review): presumably the first argument supplies the mean/std — confirm.
        data_val_stand = data_processor.standardize_data(train_set, data_val, features=features)
        data_train_stand = data_processor.standardize_data(train_set,train_set, features=features)

        predictions_1 = knn_model.knn_classifier(data_val_stand, data_train_stand, k=k)['Predicted Class']
        # Called on the class, like every other Evaluation call in this notebook.
        score_1 = Evaluation.zero_one_loss(data_val_stand[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average 0-1 Loss score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Lowest average loss wins; on ties min() keeps the first (smallest) k.
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average 0-1 loss score of {scores_dict[best_k]}")


Average 0-1 Loss score with k=1: 0.09595375722543353
Average 0-1 Loss score with k=2: 0.10433526011560694
Average 0-1 Loss score with k=3: 0.06213872832369942
Average 0-1 Loss score with k=4: 0.054046242774566475
Average 0-1 Loss score with k=5: 0.054335260115606944
Average 0-1 Loss score with k=6: 0.05346820809248556
Average 0-1 Loss score with k=7: 0.054046242774566475
Average 0-1 Loss score with k=8: 0.05433526011560693
Average 0-1 Loss score with k=9: 0.054913294797687875
Average 0-1 Loss score with k=10: 0.0569364161849711
Average 0-1 Loss score with k=11: 0.06040462427745665
Average 0-1 Loss score with k=12: 0.0592485549132948
Average 0-1 Loss score with k=13: 0.0615606936416185
Average 0-1 Loss score with k=14: 0.0676300578034682
Best k is 6 with the lowest average 0-1 loss score of 0.05346820809248556


In [157]:
# Final comparison: KNN with the tuned k versus the null-model baseline, scored
# on the test half of every repeated cross-validation fold and then averaged.

def _fold_average(values):
    """Return the arithmetic mean of the per-fold scores in `values`."""
    return sum(values) / len(values)


# Per-fold KNN metrics.
zero_one_loss_scores, precision_scores, recall_scores, f1_scores = [], [], [], []
# Per-fold null-model metrics (null_model_scores holds the 0-1 loss).
null_model_scores = []
null_model_precision_scores, null_model_recall_scores, null_model_f1_scores = [], [], []

evaluation_folds = cross_validator.cross_validation(
    data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True
)
for i, (train_set, test_set) in enumerate(evaluation_folds):

    # Both halves are standardized with train_set passed as the first argument.
    # NOTE(review): presumably the first argument supplies the mean/std — confirm.
    data_test_stand = data_processor.standardize_data(train_set, test_set, features=features)
    data_train_stand = data_processor.standardize_data(train_set, train_set, features=features)

    # KNN with the tuned k, scored against the fold's true labels.
    predictions_1 = knn_model.knn_classifier(data_test_stand, data_train_stand, k=best_k)['Predicted Class']
    knn_truth = data_test_stand[config['target_column']]
    zero_one_loss_scores.append(Evaluation.zero_one_loss(knn_truth, predictions_1))
    precision_scores.append(Evaluation.precision(knn_truth, predictions_1))
    recall_scores.append(Evaluation.recall(knn_truth, predictions_1))
    f1_scores.append(Evaluation.f1_score(knn_truth, predictions_1))

    # Null-model baseline on the same (unstandardized) test fold.
    # NOTE(review): the baseline is produced from test_set itself — confirm that
    # naive_classifier is meant to see the evaluation fold.
    null_model_prediction = null_model.naive_classifier(test_set)
    null_truth = test_set[config['target_column']]
    null_model_scores.append(Evaluation.zero_one_loss(null_truth, null_model_prediction))
    null_model_precision_scores.append(Evaluation.precision(null_truth, null_model_prediction))
    null_model_recall_scores.append(Evaluation.recall(null_truth, null_model_prediction))
    null_model_f1_scores.append(Evaluation.f1_score(null_truth, null_model_prediction))

# Averages over all folds.
average_01_score = _fold_average(zero_one_loss_scores)
average_precision_score = _fold_average(precision_scores)
average_recall_score = _fold_average(recall_scores)
average_f1_score = _fold_average(f1_scores)
average_null_model_score = _fold_average(null_model_scores)
average_null_model_precision = _fold_average(null_model_precision_scores)
average_null_model_recall = _fold_average(null_model_recall_scores)
average_null_model_f1 = _fold_average(null_model_f1_scores)

# Report the baseline first, then the tuned KNN.
print(f"Average null model 0-1 loss score: {average_null_model_score}")
print(f"Average null model Precision score: {average_null_model_precision}")
print(f"Average null model Recall score: {average_null_model_recall}")
print(f"Average null model F1 score: {average_null_model_f1}")
print(f"Average KNN 0-1 score for k={best_k}: {average_01_score}")
print(f"Average Precision score for k={best_k}: {average_precision_score}")
print(f"Average Recall score for k={best_k}: {average_recall_score}")
print(f"Average F1 score for k={best_k}: {average_f1_score}")


Average null model 0-1 loss score: 0.3111432706222866
Average null model Precision score: 0.47452359360896035
Average null model Recall score: 0.6888567293777134
Average null model F1 score: 0.5619465350193514
Average KNN 0-1 score for k=6: 0.07554269175108538
Average Precision score for k=6: 0.9237701010622941
Average Recall score for k=6: 0.9244573082489147
Average F1 score for k=6: 0.9231272786104986
