In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification, NullModelRegression
from data_configs.configs import *
import statistics

config = machine_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
classification_nullmodel = NullModelClassification(config=config)
regression_nullmodel = NullModelRegression(config=config)
knn_model = KNN(config)

In [2]:
# Data Processing

raw_data = data_processor.load_data()

raw_data_2 = raw_data.drop(columns=['ERP'])

data_1 = data_processor.impute_missing_values(raw_data_2)

data_2 = data_processor.encode_nominal_features(data_1)

data_3 = data_processor.encode_ordinal_features(data_2)

In [3]:
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

In [8]:
len(data_train)

167

In [4]:
gamma = 1/(statistics.stdev(data_train[config['target_column']]))

In [9]:
condensed_data = knn_model.condensed_knn_regression(data_train, 20)
condensed_data

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,vendor_name_adviser,vendor_name_amdahl,vendor_name_apollo,...,model_name_v8635,model_name_v8650,model_name_v8655,model_name_v8665,model_name_v8670,model_name_vax:11/730,model_name_vax:11/750,model_name_vax:11/780,model_name_vs-100,model_name_vs-90
23,110,3100,6200,0,6,64,76,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150,40,8000,16000,32,8,16,214,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,26,16000,32000,64,8,24,465,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199,30,8000,64000,128,12,176,1150,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68,105,1000,4000,0,3,24,32,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,48,4000,24000,32,8,24,214,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,200,1000,8000,0,1,2,36,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49,200,512,8000,8,1,8,62,0,0,0,...,0,0,0,0,0,0,0,1,0,0
89,140,2000,8000,32,1,54,66,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
edited_data = knn_model.edited_knn_regression(data_train,10)
edited_data

Unnamed: 0,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,vendor_name_adviser,vendor_name_amdahl,vendor_name_apollo,...,model_name_v8635,model_name_v8650,model_name_v8655,model_name_v8665,model_name_v8670,model_name_vax:11/730,model_name_vax:11/750,model_name_vax:11/780,model_name_vs-100,model_name_vs-90
68,105,1000,4000,0,3,24,32,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,203,1000,2000,0,1,5,24,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,320,128,6000,0,1,12,23,0,0,0,...,0,0,0,0,0,0,0,0,0,0
128,50,2000,16000,24,1,6,70,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42,50,2000,16000,8,3,6,52,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,143,1500,6300,0,5,32,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,75,3000,8000,8,3,48,64,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,350,64,64,0,1,4,10,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92,140,2000,4000,8,1,20,22,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
hyperparameters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False)):
        # Train and evaluate using train_set_1
        predictions_1 = knn_model.knn_regression(data_val, train_set_1, k=k, gamma=gamma)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val[config['target_column']], predictions_1)
        scores.append(score_1)
        
        # Train and evaluate using train_set_2
        predictions_2 = knn_model.knn_regression(data_val, train_set_2, k=k, gamma=gamma)['Predicted Value']
        score_2 = Evaluation().mean_squared_error(data_val[config['target_column']], predictions_2)
        scores.append(score_2)

    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")


Average score for k=1: 11528.142857142859
Average score for k=2: 12922.108144783126
Average score for k=3: 16424.46366408856
Average score for k=4: 17371.365163741175
Average score for k=5: 16990.236004259794
Average score for k=6: 19494.956533707184
Average score for k=7: 20492.108384587475
Average score for k=8: 20528.6678803064
Average score for k=9: 21224.937575889086
Average score for k=10: 22397.190423988628
Average score for k=11: 22943.849721396426
Average score for k=12: 23400.802497292483
Average score for k=13: 23953.22994740678
Average score for k=14: 25116.058407827375
Average score for k=15: 25472.614082708467


In [23]:
scores = []
for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False)):
    
    # Train and evaluate 
    predictions_1 = knn_model.knn_regression(test_set, train_set, k=3, gamma=gamma)['Predicted Value']
    score = Evaluation().mean_squared_error(test_set[config['target_column']], predictions_1)
    scores.append(score)

average_score = sum(scores) / len(scores)
print(f"Average score for k=3: {average_score}")

Average score for k=3: 6760.419239163724


In [14]:
hyperparameters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False)):
        # Train and evaluate using train_set_1
        condensed_train_set = knn_model.condensed_knn_regression(train_set_1, 10, k=1)
        predictions_1 = knn_model.knn_regression(data_val, condensed_train_set, k=k, gamma=gamma)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val[config['target_column']], predictions_1)
        scores.append(score_1)
    
    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")


Average score for k=1: 11723.173809523809


In [None]:
hyperparameters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False)):
        # Train and evaluate using train_set_1
        edited_train_set = knn_model.edited_knn_regression(train_set_1, 10, k=1)
        predictions_1 = knn_model.knn_regression(data_val, edited_train_set, k=k, gamma=gamma)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val[config['target_column']], predictions_1)
        scores.append(score_1)
    
    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")
