In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification, NullModelRegression
from data_configs.configs import *
import statistics
import numpy as np

config = machine_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
classification_nullmodel = NullModelClassification(config=config)
regression_nullmodel = NullModelRegression(config=config)
knn_model = KNN(config)

### Data Load and Preprocessing ###

In [2]:
raw_data = data_processor.load_data()

raw_data_2 = raw_data.drop(columns=['vendor_name', 'model_name', 'ERP'])

data_1 = data_processor.impute_missing_values(raw_data_2)

data_2 = data_processor.encode_nominal_features(data_1)

data_3 = data_processor.encode_ordinal_features(data_2)

## KNN Model ##

### Hyperparameter Tuning ###

In [3]:
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

In [4]:
features=['MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX']

In [5]:
gamma = 1/(statistics.stdev(data_train[config['target_column']]))

#### Tuning k ####

In [6]:
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=k, gamma=gamma)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val_standardized[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Find the k with the lowest average score
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average score of {scores_dict[best_k]}")


NameError: name 'data_train_standardized' is not defined

#### Tuning Gamma ####

In [None]:
hyperparameters = np.arange(0.1,2,0.2)
scores_dict = {}

for g in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=best_k, gamma=gamma*g)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val_standardized[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average score with g={round(g,2)}: {average_score}")
    scores_dict[g] = average_score

# Find the k with the lowest average score
best_g = min(scores_dict, key=scores_dict.get)
print(f"Best g is {round(best_g,2)} with the lowest average score of {scores_dict[best_g]}")


Average score with g=0.1: 17798.197619047616
Average score with g=0.3: 17798.19761904762
Average score with g=0.5: 17798.197619047616
Average score with g=0.7: 17798.19761904762
Average score with g=0.9: 17798.197619047616
Average score with g=1.1: 17798.197619047616
Average score with g=1.3: 17798.197619047616
Average score with g=1.5: 17798.197619047616
Average score with g=1.7: 17798.197619047616
Average score with g=1.9: 17798.197619047616
Best g is 0.1 with the lowest average score of 17798.197619047616


### Model Performance ###

In [None]:
scores = []
for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
    
    data_train_standardized = data_processor.standardize_data(train_set, train_set, features=features)
    data_test_standardized = data_processor.standardize_data(train_set,data_val,features=features)  

    # Train and evaluate 
    predictions_1 = knn_model.knn_regression(data_test_standardized, data_train_standardized, k=best_k, gamma=best_g*gamma)['Predicted Value']
    score = Evaluation().mean_squared_error(data_test_standardized[config['target_column']], predictions_1)
    scores.append(score)  

average_score = sum(scores) / len(scores)
print(f"Average score for k={best_k}, g={round(best_g,2)}: {average_score}")

Average score for k=1, g=0.1: 9120.061431440046


## Edited KNN ##

### Hyperparameter Tuning ###

In [None]:
epsilon = statistics.stdev(data_3[config['target_column']])
epsilon

160.83073308779512

#### Tuning Epsilon ####

In [None]:
hyperparameters = np.arange(0.1,2,0.2)
scores_dict = {}

for e in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        edited_train_set = knn_model.edited_knn_regression(data_train_standardized,epsilon=epsilon*e)

        predictions_1 = knn_model.knn_regression(data_val_standardized, edited_train_set, k=best_k, gamma=best_g)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val_standardized[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average score with e={round(e,2)}: {average_score}")
    scores_dict[e] = average_score

# Find the k with the lowest average score
best_e = min(scores_dict, key=scores_dict.get)
print(f"Best e is {round(best_e,2)} with the lowest average score of {scores_dict[best_e]}")


Average score with e=0.1: 39347.9119047619
Average score with e=0.3: 34349.5880952381
Average score with e=0.5: 32581.45952380952
Average score with e=0.7: 29542.10714285714
Average score with e=0.9: 27883.99047619048
Average score with e=1.1: 27196.84761904762
Average score with e=1.3: 24044.104761904764
Average score with e=1.5: 23662.730952380953
Average score with e=1.7: 20709.992857142854
Average score with e=1.9: 19479.547619047615
Best e is 1.9 with the lowest average score of 19479.547619047615


#### Tuning k ####

In [None]:
hyperparameters = np.arange(1,10,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        edited_train_set = knn_model.edited_knn_regression(data_train_standardized,epsilon=epsilon*e)

        predictions_1 = knn_model.knn_regression(data_val_standardized, edited_train_set, k=k, gamma=best_g)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val_standardized[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Find the k with the lowest average score
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average score of {scores_dict[best_k]}")


Average score with k=1: 19479.547619047615
Average score with k=2: 21297.64757883795
Average score with k=3: 21758.046519656644
Average score with k=4: 21484.66049463321
Average score with k=5: 21516.505136336687
Average score with k=6: 21429.592751070762
Average score with k=7: 21373.617419826307
Average score with k=8: 21517.056962455048
Average score with k=9: 21658.09816548178
Best k is 1 with the lowest average score of 19479.547619047615


#### Tuning gamma ####

In [None]:
hyperparameters = np.arange(0.1,2,0.2)
scores_dict = {}

for g in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        edited_train_set = knn_model.edited_knn_regression(data_train_standardized,epsilon=epsilon*e)

        predictions_1 = knn_model.knn_regression(data_val_standardized, edited_train_set, k=best_k, gamma=g*gamma)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val_standardized[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average score with g={round(g,2)}: {average_score}")
    scores_dict[g] = average_score

# Find the k with the lowest average score
best_g = min(scores_dict, key=scores_dict.get)
print(f"Best g is {round(best_g,2)} with the lowest average score of {scores_dict[best_g]}")


Average score with g=0.1: 19479.547619047615
Average score with g=0.3: 19479.54761904762
Average score with g=0.5: 19479.547619047615
Average score with g=0.7: 19479.54761904762
Average score with g=0.9: 19479.547619047615
Average score with g=1.1: 19479.547619047615
Average score with g=1.3: 19479.547619047615
Average score with g=1.5: 19479.547619047615
Average score with g=1.7: 19479.547619047615
Average score with g=1.9: 19479.547619047615
Best g is 0.1 with the lowest average score of 19479.547619047615


In [None]:
scores = []
for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
    
    data_train_standardized = data_processor.standardize_data(train_set, train_set, features=features)
    data_test_standardized = data_processor.standardize_data(train_set,test_set,features=features)  

    edited_train_set = knn_model.edited_knn_regression(data_train_standardized,epsilon=epsilon*e)
   
    predictions_1 = knn_model.knn_regression(data_test_standardized, edited_train_set, k=best_k, gamma=best_g*gamma)['Predicted Value']
    score = Evaluation().mean_squared_error(data_test_standardized[config['target_column']], predictions_1)
    scores.append(score)  

average_score = sum(scores) / len(scores)
print(f"Average score for k={best_k}, g={round(best_g,2)}, e={round(best_e,2)}: {average_score}")

Average score for k=1, g=0.1, e=1.9: 9119.874856569133


## Condensed Knn ##

In [None]:
hyperparameters = np.arange(0.1,2,0.2)
scores_dict = {}

for e in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        edited_train_set = knn_model.condensed_knn_regression(data_train_standardized,epsilon=epsilon*e)

        predictions_1 = knn_model.knn_regression(data_val_standardized, edited_train_set, k=best_k, gamma=best_g)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val_standardized[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average score with e={round(e,2)}: {average_score}")
    scores_dict[e] = average_score

# Find the k with the lowest average score
best_e = min(scores_dict, key=scores_dict.get)
print(f"Best e is {round(best_e,2)} with the lowest average score of {scores_dict[best_e]}")


Average score with e=0.1: 18080.01428571429
Average score with e=0.3: 18596.211904761905
Average score with e=0.5: 17734.683333333334
Average score with e=0.7: 16327.942857142854
Average score with e=0.9: 17448.911904761906
Average score with e=1.1: 17414.016666666666
Average score with e=1.3: 22832.476190476194
Average score with e=1.5: 23639.245238095238
Average score with e=1.7: 25818.533333333333
Average score with e=1.9: 27266.85
Best e is 0.7 with the lowest average score of 16327.942857142854


In [None]:
hyperparameters = np.arange(1,10,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        edited_train_set = knn_model.condensed_knn_regression(data_train_standardized,epsilon=epsilon*e)

        predictions_1 = knn_model.knn_regression(data_val_standardized, edited_train_set, k=k, gamma=best_g)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val_standardized[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average score with k={k}: {average_score}")
    scores_dict[k] = average_score

# Find the k with the lowest average score
best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average score of {scores_dict[best_k]}")


Average score with k=1: 17625.688095238096
Average score with k=2: 16232.760536683923
Average score with k=3: 17012.50211262436
Average score with k=4: 17695.03130123066
Average score with k=5: 21568.079251297175
Average score with k=6: 21111.64424965665
Average score with k=7: 22603.02531618776
Average score with k=8: 22462.649146159052
Average score with k=9: 22575.533020410636
Best k is 2 with the lowest average score of 16232.760536683923


In [None]:
hyperparameters = np.arange(0.1,2,0.2)
scores_dict = {}

for g in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        edited_train_set = knn_model.condensed_knn_regression(data_train_standardized,epsilon=epsilon*e)

        predictions_1 = knn_model.knn_regression(data_val_standardized, edited_train_set, k=best_k, gamma=g*gamma)['Predicted Value']
        score_1 = Evaluation().mean_squared_error(data_val_standardized[config['target_column']], predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average score with g={round(g,2)}: {average_score}")
    scores_dict[g] = average_score

# Find the k with the lowest average score
best_g = min(scores_dict, key=scores_dict.get)
print(f"Best g is {round(best_g,2)} with the lowest average score of {scores_dict[best_g]}")


Average score with g=0.1: 29641.18627155772
Average score with g=0.3: 29640.094904720092
Average score with g=0.5: 29639.03640589374
Average score with g=0.7: 29638.01147848658
Average score with g=0.9: 29637.020777284495
Average score with g=1.1: 29636.06490761118
Average score with g=1.3: 29635.14442467476
Average score with g=1.5: 29634.259833102755
Average score with g=1.7: 29633.411586665472
Average score with g=1.9: 29632.60008818691
Best g is 1.9 with the lowest average score of 29632.60008818691


In [None]:
scores = []
for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
    
    data_train_standardized = data_processor.standardize_data(train_set, train_set, features=features)
    data_test_standardized = data_processor.standardize_data(train_set_1,test_set,features=features)  

    edited_train_set = knn_model.edited_knn_regression(data_train_standardized,epsilon=epsilon*best_e)
   
    predictions_1 = knn_model.knn_regression(data_test_standardized, edited_train_set, k=best_k, gamma=best_g*gamma)['Predicted Value']
    score = Evaluation().mean_squared_error(data_test_standardized[config['target_column']], predictions_1)
    scores.append(score)  

average_score = sum(scores) / len(scores)
print(f"Average score for k={best_k}, g={round(best_g,2)}, e={round(best_e,2)}: {average_score}")

Average score for k=2, g=1.9, e=0.7: 8737.066086407704
