In [2]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification, NullModelRegression
from data_configs.configs import *
import statistics

# Select the dataset configuration shared by every helper in this notebook.
# NOTE(review): `machine_config` is not imported by name; presumably it is
# provided by the star import of data_configs.configs — confirm.
config = machine_config
# Build the helper objects once; each one is handed the same config.
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
# NOTE(review): neither null model is referenced by the cells visible in this
# part of the notebook — confirm they are still needed.
classification_nullmodel = NullModelClassification(config=config)
regression_nullmodel = NullModelRegression(config=config)
# KNN receives the config positionally, unlike the keyword form used above.
knn_model = KNN(config)

In [16]:
# Data processing: load the raw table, drop three columns, then impute and
# encode. The final result (data_3) is what the partitioning cell consumes.
# Every step binds its result to a new name instead of overwriting its input.

raw_data = data_processor.load_data()

# NOTE(review): presumably vendor_name/model_name are identifiers and ERP is a
# precomputed estimate of the target (a leakage risk) — confirm against the
# dataset documentation.
raw_data_2 = raw_data.drop(columns=['vendor_name', 'model_name', 'ERP'])

# Fill in missing values before any encoding is applied.
data_1 = data_processor.impute_missing_values(raw_data_2)

# Encode nominal features first, then ordinal ones.
# NOTE(review): which columns each encoder touches is presumably driven by
# `config` — verify in DataProcessor.
data_2 = data_processor.encode_nominal_features(data_1)

data_3 = data_processor.encode_ordinal_features(data_2)

In [18]:
# Split the processed data into a training portion and a validation portion.
# random_state is pinned to 42, presumably so the split is repeatable.
# NOTE(review): no val_size is passed, so the split ratio is whatever
# random_partition defaults to — confirm.
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

In [19]:
# gamma is the reciprocal of the sample standard deviation of the training
# targets; it is later passed to knn_model.knn_regression.
# NOTE(review): presumably used there as a kernel bandwidth — confirm in KNN.
target_spread = statistics.stdev(data_train[config['target_column']])
gamma = 1 / target_spread

In [20]:
# Candidate neighbour counts k = 1..10.
# NOTE: the tuning cell that follows reassigns this list before iterating.
hyperparameters = list(range(1, 11))

In [49]:
# Tune k: for each candidate, take every pair of subsets produced by the
# repeated split of data_train, run KNN regression with each subset as the
# training data, and score the predictions on data_val with MSE.
hyperparameters = list(range(2, 11))
for k in hyperparameters:
    scores = []
    splits = cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, stratify=False)
    for train_set_1, train_set_2 in splits:
        # Both subsets serve as training data in turn; data_val is always the
        # set being predicted and scored.
        for train_half in (train_set_1, train_set_2):
            predicted = knn_model.knn_regression(data_val, train_half, k=k, gamma=gamma)['Predicted Value']
            scores.append(
                Evaluation().mean_squared_error(data_val[config['target_column']], predicted)
            )

    # Mean MSE over all subsets and repeats for this k.
    average_score = sum(scores) / len(scores)
    print(f"Average score for k={k}: {average_score}")


Average score for k=2: 12689.154258658229
Average score for k=3: 14631.805360321412
Average score for k=4: 17011.99869660073
Average score for k=5: 17825.926828384305
Average score for k=6: 19346.679955776763
Average score for k=7: 19680.299944110804
Average score for k=8: 21123.295874463674
Average score for k=9: 21449.41307225766
Average score for k=10: 22267.19813488562


In [16]:
# Grid search: pick the hyperparameter set with the lowest mean MSE on data_val.
#
# The loop reads each candidate as a dict with 'k' and 'gamma' keys, but the
# earlier cells leave `hyperparameters` as a plain list of k values, which makes
# this cell fail on a fresh kernel. Normalise the candidates here: a bare k is
# paired with the `gamma` computed earlier; an entry that is already a dict is
# used unchanged.
param_grid = [
    candidate if isinstance(candidate, dict) else {'k': candidate, 'gamma': gamma}
    for candidate in hyperparameters
]

best_score = float('inf')
best_params = None

for params in param_grid:
    temp_scores = []  # MSE of every split for the current candidate

    # Only the first element of each yielded pair is used for training; the
    # second is discarded and data_val is the scoring set throughout.
    # NOTE(review): the earlier tuning cell uses n_splits=2, n_repeats=5, whereas
    # the values are swapped here — confirm which is intended.
    for train_set, _ in cross_validator.cross_validation(data_train, n_splits=5, n_repeats=2, stratify=False):
        predictions = knn_model.knn_regression(data_val, train_set, params['k'], params['gamma'])['Predicted Value']

        # NOTE(review): called on the class here but on an instance in the
        # earlier tuning cell — confirm mean_squared_error supports both.
        score = Evaluation.mean_squared_error(data_val[config['target_column']].values, predictions)
        temp_scores.append(score)

    # Mean MSE across all splits for this candidate.
    avg_score = sum(temp_scores) / len(temp_scores)

    # Keep the candidate with the lowest mean MSE seen so far.
    if avg_score < best_score:
        best_score = avg_score
        best_params = params

print(f"Best Hyperparameters: {best_params}, with Average MSE: {best_score}")

# Final check of the selected hyperparameters: five repeats of a 50/50 split of
# data_train; each half is used as the training set in turn and scored on
# data_val, giving ten MSE values that are averaged at the end.
if best_params is None:
    # Fail with a clear message instead of an opaque "'NoneType' object is not
    # subscriptable" when the search above selected nothing.
    raise ValueError("best_params is None: run the hyperparameter search first")

final_scores = []

for repeat in range(5):
    # Seed each repeat deterministically (42, 43, ...) so the reported average
    # is reproducible on Restart & Run All, while every repeat still gets a
    # different split. A non-zero base avoids any falsy-seed handling.
    data_half_1, data_half_2 = cross_validator.random_partition(data_train, val_size=0.5, random_state=42 + repeat)

    for train_subset in (data_half_1, data_half_2):
        model_predictions = knn_model.knn_regression(data_val, train_subset, best_params['k'], best_params['gamma'])
        score = Evaluation.mean_squared_error(data_val[config['target_column']].values, model_predictions['Predicted Value'].values)
        final_scores.append(score)

final_average_score = sum(final_scores) / len(final_scores)
print(f"Final Validation Average MSE: {final_average_score}")



Best Hyperparameters: {'k': 2, 'gamma': 0.002157198402353915}, with Average MSE: 7819.872907441032
Final Validation Average MSE: 11765.719776719368
