In [1]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification, NullModelRegression
from data_configs.configs import *
import statistics

# Experiment setup: every helper below is built from the same dataset
# configuration (`machine_config`, provided by the `data_configs.configs`
# star import).
config = machine_config

# Data-handling helpers, constructed with the configuration as a keyword.
data_processor, cross_validator = (
    helper_cls(config=config) for helper_cls in (DataProcessor, CrossValidation)
)

# Baseline ("null") models for classification and regression.
classification_nullmodel, regression_nullmodel = (
    baseline_cls(config=config)
    for baseline_cls in (NullModelClassification, NullModelRegression)
)

# KNN receives the configuration positionally, as in the original call.
knn_model = KNN(config)

In [3]:
# Data processing pipeline: load -> drop columns -> impute -> encode.
# Each stage writes to a new name so the cell can be re-run safely.

# Columns removed before imputation and encoding.
excluded_columns = ['vendor_name', 'model_name', 'ERP']

raw_data = data_processor.load_data()
raw_data_2 = raw_data.drop(columns=excluded_columns)

# Fill in missing values, then encode nominal and ordinal features in turn.
data_1 = data_processor.impute_missing_values(raw_data_2)
data_2 = data_processor.encode_nominal_features(data_1)
data_3 = data_processor.encode_ordinal_features(data_2)

In [4]:
# Split the processed data into two partitions; the fixed random_state keeps
# the split reproducible across kernel restarts.
partitions = cross_validator.random_partition(data_3, random_state=42)
data_train, data_val = partitions

In [5]:
# Size of the training partition, displayed as the cell output.
len(data_train)

167

In [6]:
# gamma is the reciprocal of the sample standard deviation of the training
# target; it is passed to knn_regression as `gamma=` in later cells.
# NOTE(review): a constant target column would make the spread 0 and this
# division fail — confirm that cannot happen for the configured dataset.
target_spread = statistics.stdev(data_train[config['target_column']])
gamma = 1 / target_spread

In [8]:
# Single KNN regression run (k=3): predict for data_val using data_train,
# then keep the 'Predicted Value' entry of the result next to the true targets.
knn_output = knn_model.knn_regression(data_val, data_train, k=3, gamma=gamma)
y_pred = knn_output['Predicted Value']
y_true = data_val[config['target_column']]

In [11]:
# Score the k=3 predictions with the four metrics exposed by Evaluation.
mean_squared_error = Evaluation.mean_squared_error(y_true, y_pred)
mean_absolute_error = Evaluation.mean_absolute_error(y_true, y_pred)
pearsons = Evaluation.pearsons_correlation(y_true, y_pred)
r2 = Evaluation.r2_coefficient(y_true, y_pred)

# Print one metric per line, in the order: MSE, MAE, Pearson, R^2.
for metric_value in (mean_squared_error, mean_absolute_error, pearsons, r2):
    print(metric_value)


7180.144110940322
45.97683371699105
0.9269983376413438
0.8589552552715553


In [19]:
# Sweep k for plain KNN regression. For every pair produced by the
# cross-validator, each half is used in turn as the reference set and the
# predictions for data_val are scored with MSE; the scores are averaged per k.
hyperparameters = list(range(1, 11))
for k in hyperparameters:
    scores = []
    folds = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False)
    for train_set_1, train_set_2 in folds:
        # Same order as before: first half, then second half.
        for fold_half in (train_set_1, train_set_2):
            predictions = knn_model.knn_regression(data_val, fold_half, k=k, gamma=gamma)['Predicted Value']
            scores.append(
                Evaluation().mean_squared_error(data_val[config['target_column']], predictions)
            )

    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")


Average score for k=1: 14747.816666666662
Average score for k=2: 12057.709573541897
Average score for k=3: 15740.173597644369
Average score for k=4: 16065.49321887955
Average score for k=5: 17650.53665647917
Average score for k=6: 18655.912174566423
Average score for k=7: 19798.93073418003
Average score for k=8: 21147.523273674735
Average score for k=9: 21260.478827043356
Average score for k=10: 22652.243578825844


In [20]:
# Cross-validated MSE for plain KNN regression at a single k.
# The neighbour count and the printed label now come from the same variable;
# previously the model ran with k=2 while the message was hard-coded as "k=3".
# k=2 had the lowest average score in the plain-KNN sweep earlier in the notebook.
best_k = 2

scores = []
for train_set, test_set in cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False):
    # Predict the second element of each pair from the first and score it.
    predictions_1 = knn_model.knn_regression(test_set, train_set, k=best_k, gamma=gamma)['Predicted Value']
    score = Evaluation().mean_squared_error(test_set[config['target_column']], predictions_1)
    scores.append(score)

average_score = sum(scores) / len(scores)
print(f"Average score for k={best_k}: {average_score}")

Average score for k=3: 6594.961867407115


In [21]:
# Sweep k for KNN regression on a condensed reference set. Only the first
# element of each cross-validation pair is condensed and used; predictions for
# data_val are scored with MSE and averaged per k.
# NOTE(review): the condensation call does not depend on the swept k, yet it
# is recomputed for every k — confirm whether it could be computed once.
# NOTE(review): meaning of the positional argument 10 is not visible here — confirm.
hyperparameters = list(range(1, 11))
for k in hyperparameters:
    scores = []
    folds = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False)
    for train_set_1, _other_half in folds:
        condensed_train_set = knn_model.condensed_knn_regression(train_set_1, 10, k=1)
        condensed_output = knn_model.knn_regression(data_val, condensed_train_set, k=k, gamma=gamma)
        predictions_1 = condensed_output['Predicted Value']
        scores.append(
            Evaluation().mean_squared_error(data_val[config['target_column']], predictions_1)
        )

    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")


Average score for k=1: 9356.478571428572
Average score for k=2: 13282.71818106153
Average score for k=3: 14902.262662967642
Average score for k=4: 16895.08135359349
Average score for k=5: 18402.23146516731
Average score for k=6: 19202.779387705454
Average score for k=7: 19645.42328272045
Average score for k=8: 20230.15793869439
Average score for k=9: 21369.272678890313
Average score for k=10: 22153.610026442842


In [22]:
# Sweep k for KNN regression on an edited reference set. Only the first
# element of each cross-validation pair is edited and used; predictions for
# data_val are scored with MSE and averaged per k.
# NOTE(review): the editing call does not depend on the swept k, yet it is
# recomputed for every k — confirm whether it could be computed once.
# NOTE(review): meaning of the positional argument 10 is not visible here — confirm.
hyperparameters = list(range(1, 11))
for k in hyperparameters:
    scores = []
    folds = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False)
    for train_set_1, _other_half in folds:
        edited_train_set = knn_model.edited_knn_regression(train_set_1, 10, k=1)
        edited_output = knn_model.knn_regression(data_val, edited_train_set, k=k, gamma=gamma)
        predictions_1 = edited_output['Predicted Value']
        scores.append(
            Evaluation().mean_squared_error(data_val[config['target_column']], predictions_1)
        )

    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")


Average score for k=1: 41032.94523809524
Average score for k=2: 36709.586386263494
Average score for k=3: 42864.94885268181
Average score for k=4: 41257.95830890771
Average score for k=5: 42727.95063529443
Average score for k=6: 46681.28897307767
Average score for k=7: 46989.845029083546
Average score for k=8: 44899.23974069616
Average score for k=9: 48892.41752829828
Average score for k=10: 45828.07034246535


In [23]:
# Baseline: MSE of the null regression model on data_val, for comparison with
# the KNN scores above.
# NOTE(review): naive_regression is given data_val itself — confirm the
# baseline is not meant to be derived from the training partition.
null_model = NullModelRegression(config)
null_model_results = null_model.naive_regression(data_val)

y_val = data_val[config['target_column']]
# The bare expression on the last line is shown as the cell output.
Evaluation().mean_squared_error(y_val, null_model_results)

50906.85317460319