In [32]:
from src.data_preprocessor import DataProcessor
from src.cross_validation import CrossValidation
from src.evaluation import Evaluation
from models.knn import KNN
from models.null_model import NullModelClassification, NullModelRegression
from data_configs.configs import *
import statistics
import numpy as np

config = forest_fires_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)
classification_nullmodel = NullModelClassification(config=config)
regression_nullmodel = NullModelRegression(config=config)
knn_model = KNN(config)
null_model = NullModelRegression(config=config)

### Data Load and Preprocessing ###

In [33]:
raw_data = data_processor.load_data()

data_1 = data_processor.impute_missing_values(raw_data)

data_2 = data_processor.encode_nominal_features(data_1)

data_3 = data_processor.encode_ordinal_features(data_2)

In [34]:
data_4 = data_processor.log_transform(data_3)

## KNN Model ##

### Hyperparameter Tuning ###

In [35]:
data_train, data_val = cross_validator.random_partition(data_4, random_state=42)

In [36]:
features=['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI','temp', 'RH', 'wind', 'rain', 'month','day']

In [37]:
gamma = 1/(statistics.stdev(data_train[config['target_column']]))
gamma

0.7265859330636893

In [38]:
statistics.stdev(data_3[config['target_column']])

63.65581846794089

#### Tuning k ####

In [39]:
hyperparameters = np.arange(1,15,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=k, gamma=gamma)['Predicted Value']
        predictions_1 = data_processor.inverse_log_transform(predictions_1)
        y_true = data_processor.inverse_log_transform(data_val_standardized[config['target_column']])
        score_1 = Evaluation().mean_squared_error(y_true, predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average MSE score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average MSE score of {scores_dict[best_k]}")


Average MSE score with k=1: 12827.605644615393
Average MSE score with k=2: 12286.82580512004
Average MSE score with k=3: 12199.416791307332
Average MSE score with k=4: 12164.780424775794
Average MSE score with k=5: 12142.075856759457
Average MSE score with k=6: 12129.468874487815
Average MSE score with k=7: 12124.364856806615
Average MSE score with k=8: 12119.406165249244
Average MSE score with k=9: 12116.522890864095
Average MSE score with k=10: 12112.839825728597
Average MSE score with k=11: 12111.576822898172
Average MSE score with k=12: 12109.65228327208
Average MSE score with k=13: 12108.095866533891
Average MSE score with k=14: 12108.522424369727
Best k is 13 with the lowest average MSE score of 12108.095866533891


#### Tuning Gamma ####

In [40]:
hyperparameters = np.arange(0.4,1.6,0.2)
scores_dict = {}

for g in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        data_train_standardized = data_processor.standardize_data(train_set_1, train_set_1, features=features)
        data_val_standardized = data_processor.standardize_data(train_set_1,data_val,features=features)  

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=best_k, gamma=gamma*g)['Predicted Value']
        predictions_1 = data_processor.inverse_log_transform(predictions_1)
        y_true = data_processor.inverse_log_transform(data_val_standardized[config['target_column']])

        score_1 = Evaluation().mean_squared_error(y_true, predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average MSE score with g={round(g,2)}: {average_score}")
    scores_dict[g] = average_score

best_g = min(scores_dict, key=scores_dict.get)
print(f"Best g is {round(best_g,2)} with the lowest average MSE score of {scores_dict[best_g]}")


Average MSE score with g=0.4: 12091.422644001159
Average MSE score with g=0.6: 12094.075479423424
Average MSE score with g=0.8: 12098.645216298932
Average MSE score with g=1.0: 12108.095866533891
Average MSE score with g=1.2: 12126.838974780072
Average MSE score with g=1.4: 12157.67592372792
Average MSE score with g=1.6: 12198.037354553335
Best g is 0.4 with the lowest average MSE score of 12091.422644001159


### Model Performance ###

In [41]:
mse_scores = []
mae_scores = []
r2_scores = []
pearson_scores = []
null_model_scores = []

# Lists for storing null model metrics
null_model_mae_scores = []
null_model_r2_scores = []
null_model_pearson_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):

    data_train_standardized = data_processor.standardize_data(train_set, train_set, features=features)
    data_test_standardized = data_processor.standardize_data(train_set,test_set,features=features)  

    # Train and evaluate 
    predictions_1 = knn_model.knn_regression(data_test_standardized, data_train_standardized, k=best_k, gamma=best_g*gamma)['Predicted Value']
    predictions_1 = data_processor.inverse_log_transform(predictions_1)
    y_true = data_processor.inverse_log_transform(data_test_standardized[config['target_column']])
    
    mse_score = Evaluation.mean_squared_error(y_true, predictions_1)
    mae_score = Evaluation.mean_absolute_error(y_true, predictions_1)
    r2_score = Evaluation.r2_coefficient(y_true, predictions_1)
    pearson_score = Evaluation.pearsons_correlation(y_true, predictions_1)
    
    mse_scores.append(mse_score)
    mae_scores.append(mae_score)
    r2_scores.append(r2_score)
    pearson_scores.append(pearson_score)

    # Evaluate null model
    y_true = data_processor.inverse_log_transform(test_set[config['target_column']])
    null_model_prediction = null_model.naive_regression(test_set)
    null_model_prediction = data_processor.inverse_log_transform(null_model_prediction)
    null_model_mse = Evaluation.mean_squared_error(y_true, null_model_prediction)
    null_model_mae = Evaluation.mean_absolute_error(y_true, null_model_prediction)
    null_model_r2 = Evaluation.r2_coefficient(y_true, null_model_prediction)
    null_model_pearson = Evaluation.pearsons_correlation(y_true, null_model_prediction)
    
    null_model_scores.append(null_model_mse)
    null_model_mae_scores.append(null_model_mae)
    null_model_r2_scores.append(null_model_r2)
    null_model_pearson_scores.append(null_model_pearson)


average_mse_score = sum(mse_scores) / len(mse_scores)
average_mae_score = sum(mae_scores) / len(mae_scores)
average_r2_score = sum(r2_scores) / len(r2_scores)
average_pearson_score = sum(pearson_scores) / len(pearson_scores)
average_null_model_mse = sum(null_model_scores) / len(null_model_scores)
average_null_model_mae = sum(null_model_mae_scores) / len(null_model_mae_scores)
average_null_model_r2 = sum(null_model_r2_scores) / len(null_model_r2_scores)
average_null_model_pearson = sum(null_model_pearson_scores) / len(null_model_pearson_scores)

print(f"Average null model MSE score: {average_null_model_mse}")
print(f"Average null model MAE score: {average_null_model_mae}")
print(f"Average null model r2 score: {average_null_model_r2}")
print(f"Average null model pearson score: {average_null_model_pearson}")
print(f"Average MSE score for k={best_k}, g={round(best_g,2)}: {average_mse_score}")
print(f"Average MAE score for k={best_k}, g={round(best_g,2)}: {average_mae_score}")
print(f"Average r2 score for k={best_k}, g={round(best_g,2)}: {average_r2_score}")
print(f"Average pearson score for k={best_k}, g={round(best_g,2)}: {average_pearson_score}")




Average null model MSE score: 2161.2278374710386
Average null model MAE score: 11.249154171959376
Average null model r2 score: -0.05206367958446219
Average null model pearson score: nan
Average MSE score for k=13, g=0.4: 2158.384242204224
Average MAE score for k=13, g=0.4: 11.375752493133465
Average r2 score for k=13, g=0.4: -0.04617814691499349
Average pearson score for k=13, g=0.4: 0.016974665861499382




## Edited KNN ##

### Hyperparameter Tuning ###

In [42]:
epsilon = statistics.stdev(data_4[config['target_column']])
epsilon

1.398435955883445

#### Tuning Epsilon ####

In [43]:
hyperparameters = np.arange(0.1,2,0.2)
scores_dict = {}

for e in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        edited_train_set = knn_model.edited_knn_regression(train_set_1, train_set_2, epsilon=epsilon*e, gamma=gamma)

        data_train_standardized = data_processor.standardize_data(edited_train_set, edited_train_set, features=features)
        data_val_standardized = data_processor.standardize_data(edited_train_set,data_val,features=features)  

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=best_k, gamma=gamma*best_g)['Predicted Value']
        predictions_1 = data_processor.inverse_log_transform(predictions_1)
        y_true = data_processor.inverse_log_transform(data_val_standardized[config['target_column']])
        score_1 = Evaluation().mean_squared_error(y_true, predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average MSE score with e={round(e,2)}: {average_score}")
    scores_dict[e] = average_score

best_e = min(scores_dict, key=scores_dict.get)
print(f"Best e is {round(best_e,2)} with the lowest average MSE score of {scores_dict[best_e]}")


Average MSE score with e=0.1: 12165.630446997091
Average MSE score with e=0.3: 12156.47544593398
Average MSE score with e=0.5: 12146.075361328856
Average MSE score with e=0.7: 12142.275800530082
Average MSE score with e=0.9: 12135.826145261988
Average MSE score with e=1.1: 12129.34348530095
Average MSE score with e=1.3: 12129.747411500051
Average MSE score with e=1.5: 12122.098280237715
Average MSE score with e=1.7: 12119.584280449677
Average MSE score with e=1.9: 12119.414022279254
Best e is 1.9 with the lowest average MSE score of 12119.414022279254


#### Tuning k ####

In [44]:
hyperparameters = np.arange(1,10,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        edited_train_set = knn_model.edited_knn_regression(train_set_1, train_set_2, epsilon=epsilon*best_e, gamma=gamma)

        data_train_standardized = data_processor.standardize_data(edited_train_set, edited_train_set, features=features)
        data_val_standardized = data_processor.standardize_data(edited_train_set,data_val,features=features)  

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=k, gamma=gamma*best_g)['Predicted Value']
        predictions_1 = data_processor.inverse_log_transform(predictions_1)
        y_true = data_processor.inverse_log_transform(data_val_standardized[config['target_column']])
        score_1 = Evaluation().mean_squared_error(y_true, predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average MSE score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average MSE score of {scores_dict[best_k]}")


Average MSE score with k=1: 12187.642614038468
Average MSE score with k=2: 12145.619952350293
Average MSE score with k=3: 12138.298076657185
Average MSE score with k=4: 12129.879791996118
Average MSE score with k=5: 12128.801611850988
Average MSE score with k=6: 12128.430449609394
Average MSE score with k=7: 12128.329095250663
Average MSE score with k=8: 12126.525195625658
Average MSE score with k=9: 12123.032612328298
Best k is 9 with the lowest average MSE score of 12123.032612328298


#### Tuning gamma ####

In [45]:
hyperparameters = np.arange(0.4,1.6,0.2)
scores_dict = {}

for g in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        edited_train_set = knn_model.edited_knn_regression(train_set_1, train_set_2, epsilon=epsilon*best_e, gamma = gamma)

        data_train_standardized = data_processor.standardize_data(edited_train_set, edited_train_set, features=features)
        data_val_standardized = data_processor.standardize_data(edited_train_set,data_val,features=features)  

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=best_k, gamma=g*gamma)['Predicted Value']
        predictions_1 = data_processor.inverse_log_transform(predictions_1)
        y_true = data_processor.inverse_log_transform(data_val_standardized[config['target_column']])
        score_1 = Evaluation().mean_squared_error(y_true, predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average MSE score with g={round(g,2)}: {average_score}")
    scores_dict[g] = average_score

best_g = min(scores_dict, key=scores_dict.get)
print(f"Best g is {round(best_g,2)} with the lowest average MSE score of {scores_dict[best_g]}")


Average MSE score with g=0.4: 12123.032612328298
Average MSE score with g=0.6: 12123.631866484307
Average MSE score with g=0.8: 12124.758718155012
Average MSE score with g=1.0: 12126.566000473704
Average MSE score with g=1.2: 12128.894204600942
Average MSE score with g=1.4: 12131.444313844677
Average MSE score with g=1.6: 12134.007425000449
Best g is 0.4 with the lowest average MSE score of 12123.032612328298


In [46]:
mse_scores = []
mae_scores = []
r2_scores = []
pearson_scores = []
null_model_scores = []

# Lists for storing null model metrics
null_model_mae_scores = []
null_model_r2_scores = []
null_model_pearson_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):

    edited_train_set = knn_model.edited_knn_regression(train_set, data_val, epsilon=epsilon*best_e, gamma=gamma)

    data_train_standardized = data_processor.standardize_data(edited_train_set, edited_train_set, features=features)
    data_test_standardized = data_processor.standardize_data(edited_train_set,test_set,features=features)  

    # Train and evaluate 
    predictions_1 = knn_model.knn_regression(data_test_standardized, data_train_standardized, k=best_k, gamma=best_g*gamma)['Predicted Value']
    predictions_1 = data_processor.inverse_log_transform(predictions_1)
    y_true = data_processor.inverse_log_transform(data_test_standardized[config['target_column']])
    
    mse_score = Evaluation.mean_squared_error(y_true, predictions_1)
    mae_score = Evaluation.mean_absolute_error(y_true, predictions_1)
    r2_score = Evaluation.r2_coefficient(y_true, predictions_1)
    pearson_score = Evaluation.pearsons_correlation(y_true, predictions_1)
    
    mse_scores.append(mse_score)
    mae_scores.append(mae_score)
    r2_scores.append(r2_score)
    pearson_scores.append(pearson_score)

    # Evaluate null model
    y_true = data_processor.inverse_log_transform(test_set[config['target_column']])
    null_model_prediction = null_model.naive_regression(test_set)
    null_model_prediction = data_processor.inverse_log_transform(null_model_prediction)
    null_model_mse = Evaluation.mean_squared_error(y_true, null_model_prediction)
    null_model_mae = Evaluation.mean_absolute_error(y_true, null_model_prediction)
    null_model_r2 = Evaluation.r2_coefficient(y_true, null_model_prediction)
    null_model_pearson = Evaluation.pearsons_correlation(y_true, null_model_prediction)
    
    null_model_scores.append(null_model_mse)
    null_model_mae_scores.append(null_model_mae)
    null_model_r2_scores.append(null_model_r2)
    null_model_pearson_scores.append(null_model_pearson)


average_mse_score = sum(mse_scores) / len(mse_scores)
average_mae_score = sum(mae_scores) / len(mae_scores)
average_r2_score = sum(r2_scores) / len(r2_scores)
average_pearson_score = sum(pearson_scores) / len(pearson_scores)
average_null_model_mse = sum(null_model_scores) / len(null_model_scores)
average_null_model_mae = sum(null_model_mae_scores) / len(null_model_mae_scores)
average_null_model_r2 = sum(null_model_r2_scores) / len(null_model_r2_scores)
average_null_model_pearson = sum(null_model_pearson_scores) / len(null_model_pearson_scores)

print(f"Average null model MSE score: {average_null_model_mse}")
print(f"Average null model MAE score: {average_null_model_mae}")
print(f"Average null model r2 score: {average_null_model_r2}")
print(f"Average null model pearson score: {average_null_model_pearson}")
print(f"Average MSE score for k={best_k}, g={round(best_g,2)}: {average_mse_score}")
print(f"Average MAE score for k={best_k}, g={round(best_g,2)}: {average_mae_score}")
print(f"Average r2 score for k={best_k}, g={round(best_g,2)}: {average_r2_score}")
print(f"Average pearson score for k={best_k}, g={round(best_g,2)}: {average_pearson_score}")




Average null model MSE score: 2161.2278374710386
Average null model MAE score: 11.249154171959376
Average null model r2 score: -0.05206367958446219
Average null model pearson score: nan
Average MSE score for k=9, g=0.4: 2171.8326540217918
Average MAE score for k=9, g=0.4: 11.231100579099355
Average r2 score for k=9, g=0.4: -0.05869600556767714
Average pearson score for k=9, g=0.4: -0.0207621164695765




## Condensed Knn ##

#### Tuning Epsilon ####

In [47]:
hyperparameters = np.arange(0.1,1.5,0.1)
scores_dict = {}

for e in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        condesed_train_set = knn_model.condensed_knn_regression(train_set_1,epsilon=epsilon*e)
        data_train_standardized = data_processor.standardize_data(condesed_train_set, condesed_train_set, features=features)
        data_val_standardized = data_processor.standardize_data(condesed_train_set,data_val,features=features)

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=best_k, gamma=gamma*best_g)['Predicted Value']
        predictions_1 = data_processor.inverse_log_transform(predictions_1)
        y_true = data_processor.inverse_log_transform(data_val_standardized[config['target_column']])
        score_1 = Evaluation().mean_squared_error(y_true, predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average MSE score with e={round(e,2)}: {average_score}")
    scores_dict[e] = average_score

best_e = min(scores_dict, key=scores_dict.get)
print(f"Best e is {round(best_e,2)} with the lowest average MSE score of {scores_dict[best_e]}")


Average MSE score with e=0.1: 12075.689711225627
Average MSE score with e=0.2: 12074.963502830307
Average MSE score with e=0.3: 12079.29479223938
Average MSE score with e=0.4: 12077.445894727274
Average MSE score with e=0.5: 12074.408221748346
Average MSE score with e=0.6: 12075.076508715918
Average MSE score with e=0.7: 12073.7317943993
Average MSE score with e=0.8: 12063.978418292872
Average MSE score with e=0.9: 12075.779713581109
Average MSE score with e=1.0: 12070.136343029193
Average MSE score with e=1.1: 12074.306525232587
Average MSE score with e=1.2: 12068.542900837343
Average MSE score with e=1.3: 12070.801100620294
Average MSE score with e=1.4: 12051.795920579094
Best e is 1.4 with the lowest average MSE score of 12051.795920579094


#### Tuning k ####

In [48]:
hyperparameters = np.arange(1,10,1)
scores_dict = {}

for k in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        condesed_train_set = knn_model.condensed_knn_regression(train_set_1,epsilon=epsilon*best_e)

        data_train_standardized = data_processor.standardize_data(condesed_train_set, condesed_train_set, features=features)
        data_val_standardized = data_processor.standardize_data(condesed_train_set,data_val,features=features)

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=k, gamma=gamma*best_g)['Predicted Value']
        predictions_1 = data_processor.inverse_log_transform(predictions_1)
        y_true = data_processor.inverse_log_transform(data_val_standardized[config['target_column']])
        score_1 = Evaluation().mean_squared_error(y_true, predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average MSE score with k={k}: {average_score}")
    scores_dict[k] = average_score

best_k = min(scores_dict, key=scores_dict.get)
print(f"Best k is {best_k} with the lowest average MSE score of {scores_dict[best_k]}")


Average MSE score with k=1: 13747.827820480778
Average MSE score with k=2: 12425.284692834397
Average MSE score with k=3: 12122.537762722983
Average MSE score with k=4: 12087.74280228351
Average MSE score with k=5: 12063.606276632947
Average MSE score with k=6: 12057.830278564841
Average MSE score with k=7: 12035.235991569492
Average MSE score with k=8: 12064.738918689945
Average MSE score with k=9: 12055.058427898042
Best k is 7 with the lowest average MSE score of 12035.235991569492


#### Tuning Gamma ####

In [49]:
hyperparameters = np.arange(0.4,1.6,0.2)
scores_dict = {}

for g in hyperparameters: 
    scores = []
    for i, (train_set_1, train_set_2) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
        
        condesed_train_set = knn_model.condensed_knn_regression(train_set_1,epsilon=epsilon*best_e)

        data_train_standardized = data_processor.standardize_data(condesed_train_set, condesed_train_set, features=features)
        data_val_standardized = data_processor.standardize_data(condesed_train_set,data_val,features=features)  

        predictions_1 = knn_model.knn_regression(data_val_standardized, data_train_standardized, k=best_k, gamma=g*gamma)['Predicted Value']
        predictions_1 = data_processor.inverse_log_transform(predictions_1)
        
        y_true = data_processor.inverse_log_transform(data_val_standardized[config['target_column']])
        score_1 = Evaluation().mean_squared_error(y_true, predictions_1)
        scores.append(score_1)

    average_score = sum(scores) / len(scores)
    print(f"Average MSE score with g={round(g,2)}: {average_score}")
    scores_dict[g] = average_score

best_g = min(scores_dict, key=scores_dict.get)
print(f"Best g is {round(best_g,2)} with the lowest average MSE score of {scores_dict[best_g]}")


Average MSE score with g=0.4: 12050.585635563268
Average MSE score with g=0.6: 12069.773317030347
Average MSE score with g=0.8: 12114.494640808118
Average MSE score with g=1.0: 12136.120702091524
Average MSE score with g=1.2: 12171.25497061458
Average MSE score with g=1.4: 12326.774812711315
Average MSE score with g=1.6: 12739.333595220924
Best g is 0.4 with the lowest average MSE score of 12050.585635563268


### Model Performance ###

In [50]:
mse_scores = []
mae_scores = []
r2_scores = []
pearson_scores = []
null_model_scores = []

# Lists for storing null model metrics
null_model_mae_scores = []
null_model_r2_scores = []
null_model_pearson_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):

    condesed_train_set = knn_model.condensed_knn_classification(train_set)

    data_train_standardized = data_processor.standardize_data(condesed_train_set, condesed_train_set, features=features)
    data_test_standardized = data_processor.standardize_data(condesed_train_set,test_set,features=features)  

    # Train and evaluate 
    predictions_1 = knn_model.knn_regression(data_test_standardized, data_train_standardized, k=best_k, gamma=best_g*gamma)['Predicted Value']
    predictions_1 = data_processor.inverse_log_transform(predictions_1)
    y_true = data_processor.inverse_log_transform(data_test_standardized[config['target_column']])
    
    mse_score = Evaluation.mean_squared_error(y_true, predictions_1)
    mae_score = Evaluation.mean_absolute_error(y_true, predictions_1)
    r2_score = Evaluation.r2_coefficient(y_true, predictions_1)
    pearson_score = Evaluation.pearsons_correlation(y_true, predictions_1)
    
    mse_scores.append(mse_score)
    mae_scores.append(mae_score)
    r2_scores.append(r2_score)
    pearson_scores.append(pearson_score)

    # Evaluate null model
    y_true = data_processor.inverse_log_transform(test_set[config['target_column']])
    null_model_prediction = null_model.naive_regression(test_set)
    null_model_prediction = data_processor.inverse_log_transform(null_model_prediction)
    null_model_mse = Evaluation.mean_squared_error(y_true, null_model_prediction)
    null_model_mae = Evaluation.mean_absolute_error(y_true, null_model_prediction)
    null_model_r2 = Evaluation.r2_coefficient(y_true, null_model_prediction)
    null_model_pearson = Evaluation.pearsons_correlation(y_true, null_model_prediction)
    
    null_model_scores.append(null_model_mse)
    null_model_mae_scores.append(null_model_mae)
    null_model_r2_scores.append(null_model_r2)
    null_model_pearson_scores.append(null_model_pearson)


average_mse_score = sum(mse_scores) / len(mse_scores)
average_mae_score = sum(mae_scores) / len(mae_scores)
average_r2_score = sum(r2_scores) / len(r2_scores)
average_pearson_score = sum(pearson_scores) / len(pearson_scores)
average_null_model_mse = sum(null_model_scores) / len(null_model_scores)
average_null_model_mae = sum(null_model_mae_scores) / len(null_model_mae_scores)
average_null_model_r2 = sum(null_model_r2_scores) / len(null_model_r2_scores)
average_null_model_pearson = sum(null_model_pearson_scores) / len(null_model_pearson_scores)

print(f"Average null model MSE score: {average_null_model_mse}")
print(f"Average null model MAE score: {average_null_model_mae}")
print(f"Average null model r2 score: {average_null_model_r2}")
print(f"Average null model pearson score: {average_null_model_pearson}")
print(f"Average MSE score for k={best_k}, g={round(best_g,2)}: {average_mse_score}")
print(f"Average MAE score for k={best_k}, g={round(best_g,2)}: {average_mae_score}")
print(f"Average r2 score for k={best_k}, g={round(best_g,2)}: {average_r2_score}")
print(f"Average pearson score for k={best_k}, g={round(best_g,2)}: {average_pearson_score}")




Average null model MSE score: 2161.2278374710386
Average null model MAE score: 11.249154171959376
Average null model r2 score: -0.05206367958446219
Average null model pearson score: nan
Average MSE score for k=7, g=0.4: 2149.80484710132
Average MAE score for k=7, g=0.4: 11.743525742016795
Average r2 score for k=7, g=0.4: -0.04035822620084404
Average pearson score for k=7, g=0.4: 0.01500515864695674


