In [2]:
# Import necessary modules and functions
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from src.knn_regression import parameter_tuning_knn, generate_neighbors
from src.random_forest import parameter_tuning_forest
from src.neural_nets import create_neural_1, create_neural_3, create_neural_6, parameter_tuning_nn
from data.data_generator import get_data, preprocess

In [3]:

def save_results(results, filename="experiment_results.json", save_path='/results/'):
    """Serialise ``results`` as JSON to ``save_path/filename``.

    Parameters
    ----------
    results : JSON-serialisable object
        Passed unchanged to ``json.dump``.
    filename : str
        Name of the file to write inside ``save_path``.
    save_path : str
        Target directory. Defaults to the absolute path ``'/results/'`` used
        by the original code; pass a different directory to write elsewhere.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened.
    TypeError
        If ``results`` contains objects ``json.dump`` cannot serialise.
    """
    # Create the directory on first use; previously a missing directory made
    # open() fail with FileNotFoundError.
    os.makedirs(save_path, exist_ok=True)
    full_path = os.path.join(save_path, filename)
    with open(full_path, 'w') as file:
        json.dump(results, file)

def regression_func(x):
    """Target function of the experiment: ``exp(||x_i||)`` for every row ``x_i``.

    ``x`` must be at least two-dimensional because the norm is taken along
    axis 1; for a 2-D input the result has one entry per row.
    """
    row_norms = np.linalg.norm(x, axis=1)
    return np.exp(row_norms)

def mean_error(base_seed, sigma, omega, num_samples=100, repeats=50):
    """Median MSE of the constant "predict the training mean" baseline.

    For each of ``repeats`` training draws, the mean of the training targets
    is used as a constant prediction for every point of one fixed evaluation
    set, and the resulting MSE is recorded. The median of those MSE values is
    returned.

    Parameters
    ----------
    base_seed : int
        Training draw ``i`` uses seed ``base_seed + i``; the evaluation set
        uses ``base_seed + repeats``.
    sigma, omega
        Forwarded to ``get_data`` for the training draws. The evaluation set
        is generated with ``sigma=0`` (presumably noise-free - confirm against
        ``get_data``).
    num_samples : int
        Size of each training draw (default 100, the original hard-coded value).
    repeats : int
        Number of training draws (default 50, the original hard-coded value).

    Returns
    -------
    float
        ``np.median`` of the per-draw baseline MSE values.
    """
    # Fixed evaluation set, drawn once. Its seed lies just past the range
    # used by the training draws below. The original code read the loop
    # variable `seed` here, before it was ever assigned, which raised
    # UnboundLocalError on every call.
    _, y_val = get_data(regression_func,
                        num_samples=10**5,
                        seed=base_seed + repeats,
                        x_dim=7,
                        sigma=0,
                        omega=omega)

    mse = []
    for i in range(repeats):
        # Only the targets are needed for a constant predictor.
        _, y = get_data(regression_func,
                        num_samples=num_samples,
                        seed=base_seed + i,
                        x_dim=7,
                        sigma=sigma,
                        omega=omega)

        # Constant prediction with the same shape (and dtype) as y_val.
        y_pred = np.full_like(y_val, fill_value=np.mean(y))
        mse.append(mean_squared_error(y_val, y_pred))

    return np.median(mse)



def _median_iqr(values):
    """Return ``{"Median", "IQR"}`` for a sequence of scores (NaN if empty)."""
    if len(values) == 0:
        nan = float("nan")
        return {"Median": nan, "IQR": nan}
    # np.percentile expects q on a 0-100 scale, so the quartiles are 25 and 75.
    q25, q75 = np.percentile(values, [25, 75])
    return {"Median": float(np.median(values)), "IQR": float(q75 - q25)}


def run_experiment(sigma, omega, num_samples, repeats=50, save_interval=1, base_seed=42):
    """Compare KNN, random forest and three neural nets over repeated draws.

    For each of ``repeats`` iterations a fresh data set is drawn, split 80/20
    into a training part and a tuning part, every model is tuned via its
    ``parameter_tuning_*`` helper, and the tuned model is scored on a separate
    evaluation set of 10**5 points. Each score is the evaluation MSE divided
    by ``mse_avg``, the value returned by ``mean_error``.

    Parameters
    ----------
    sigma, omega
        Forwarded to ``get_data`` (the evaluation set always uses ``sigma=0``).
    num_samples : int
        Size of each data set before the train/tune split.
    repeats : int
        Number of independent iterations.
    save_interval : int
        Intermediate results are saved every ``save_interval`` iterations and
        after the last one. ``0`` / ``None`` saves only after the last one.
    base_seed : int
        Iteration ``i`` uses seed ``base_seed + i`` for the data and the split
        and ``base_seed + i + repeats`` for its evaluation set.

    Returns
    -------
    dict
        ``{model_name: {"Median": ..., "IQR": ...}}`` for all five models,
        plus ``"mse_avg"``.
    """
    mse_avg = mean_error(base_seed=base_seed,
                         sigma=sigma,
                         omega=omega)

    # (result key, model builder, candidate layer widths) for each network.
    nn_setups = [("neural-1", create_neural_1, [5, 10, 25, 50, 75]),
                 ("neural-3", create_neural_3, [3, 6, 9, 12, 15]),
                 ("neural-6", create_neural_6, [3, 6, 9, 12, 15])]
    models = ["KNN", "RandomForest"] + [name for name, _, _ in nn_setups]

    results = {model: [] for model in models}
    results["mse_avg"] = mse_avg

    for i in range(repeats):
        # Seed is adjusted each iteration to ensure different datasets
        # but is based on a consistent base to ensure reproducibility
        seed = base_seed + i

        x, y = get_data(regression_func,
                        num_samples=num_samples,
                        seed=seed,
                        x_dim=7,
                        sigma=sigma,
                        omega=omega)
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.2,
                                                            random_state=seed)
        x_val, y_val = get_data(regression_func,
                                num_samples=10**5,
                                seed=seed + repeats,
                                x_dim=7,
                                sigma=0,
                                omega=omega)

        # K-NN Regression
        units = generate_neighbors(n=x_train.shape[0])
        knn_model, _ = parameter_tuning_knn(units=units,
                                            train_data=(x_train, y_train),
                                            test_data=(x_test, y_test))
        results["KNN"].append(
            mean_squared_error(y_val, knn_model.predict(x_val)) / mse_avg)

        # Random Forest Regression
        units = [100 + 30 * j for j in range(int(num_samples / 4))]
        forest, _ = parameter_tuning_forest(units,
                                            train_data=(x_train, y_train),
                                            test_data=(x_test, y_test))
        results["RandomForest"].append(
            mean_squared_error(y_val, forest.predict(x_val)) / mse_avg)

        # Neural networks with one, three and six hidden layers
        train_data = preprocess(x_train, y_train)
        test_data = preprocess(x_test, y_test)
        for name, build_model, units in nn_setups:
            network, _ = parameter_tuning_nn(build_model,
                                             units=units,
                                             train_data=train_data,
                                             test_data=test_data,
                                             input_dim=7,
                                             epochs=1000)
            # NOTE(review): the networks are tuned on preprocess()-ed data but
            # predict on the raw x_val - confirm preprocess() does not rescale
            # the inputs.
            results[name].append(
                mean_squared_error(y_val, network.predict(x_val)) / mse_avg)

        # `save_interval and ...` avoids a ZeroDivisionError for save_interval=0.
        periodic_save = bool(save_interval) and (i + 1) % save_interval == 0
        if periodic_save or i == repeats - 1:
            # NOTE(review): omega is not part of the file name, so runs that
            # differ only in omega overwrite each other; ':' is also not a
            # valid file-name character on Windows.
            save_results(results, filename=f"results:_{sigma}_{num_samples}.json")
            print(f"Results saved at iteration {i+1}")

    # Build one summary entry per model; the original rebound the dict on
    # every pass and so returned only the last model.
    median_iqr = {model: _median_iqr(results[model]) for model in models}
    median_iqr["mse_avg"] = mse_avg
    return median_iqr