In [1]:
from src.data_preprocessor import DataProcessor
from data_configs.configs import *
from models.neural_networks import *
from src.cross_validation import CrossValidation
import numpy as np

config = albalone_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)

In [2]:
raw_data = data_processor.load_data()
data_1 = data_processor.impute_missing_values(raw_data)
data_2 = data_processor.encode_nominal_features(data_1)
data_3 = data_processor.encode_ordinal_features(data_2)
data_4 = data_processor.standardize_data(data_3,data_3,features=['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight'])

In [3]:
data_4 = data_4[['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight','Viscera weight', 'Shell weight','Sex_F', 'Sex_I', 'Sex_M','Rings']]

In [4]:
data_4

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex_F,Sex_I,Sex_M,Rings
0,-0.574489,-0.432097,-1.064297,-0.641821,-0.607613,-0.726125,-0.638140,0,0,1,15
1,-1.448812,-1.439757,-1.183837,-1.230130,-1.170770,-1.205077,-1.212842,0,0,1,7
2,0.050027,0.122116,-0.107978,-0.309432,-0.463444,-0.356647,-0.207114,1,0,0,9
3,-0.699393,-0.432097,-0.347058,-0.637743,-0.648160,-0.607527,-0.602222,0,0,1,10
4,-1.615350,-1.540523,-1.422916,-1.271933,-1.215822,-1.287183,-1.320599,0,1,0,7
...,...,...,...,...,...,...,...,...,...,...,...
4172,0.341468,0.424414,0.609261,0.118799,0.047902,0.532836,0.073053,1,0,0,11
4173,0.549640,0.323648,-0.107978,0.279896,0.358765,0.309325,0.155666,0,0,1,10
4174,0.632909,0.676328,1.565580,0.708127,0.748470,0.975296,0.496895,0,0,1,9
4175,0.841081,0.777094,0.250642,0.541933,0.773248,0.733540,0.410690,1,0,0,10


In [5]:
data_train, data_val = cross_validator.random_partition(data_4, random_state=42)

In [7]:
import random

iterations = 15

param_space = {
    'lr': [0.01,0.001,0.0001,0.00001],
    'epochs': np.linspace(3000, 20000, num=20).astype(int).tolist()
}

best_score = float('inf')
best_params_linear = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
    
        train_data = train_set.to_numpy()
        X_train = train_data[:,:-1]
        y_train = train_data[:,-1:]

        data_test = data_val.to_numpy()
        X_val = data_test[:,:-1]
        y_val = data_test[:,-1:]
        
        linear = LinearNetwork(config)

        _, val_losses = linear.linear_regression(X_train,y_train,X_val,y_val,epochs=params['epochs'],lr=params['lr'],patience=500)

        score = np.min(val_losses)
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 20:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop
        
    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_linear = params
        

print(f"Best parameters: {best_params_linear}, Best score: {best_score}")


Tested params: {'lr': 0.0001, 'epochs': 11947}, Score: 6.224881536986364
Tested params: {'lr': 0.0001, 'epochs': 9263}, Score: 6.926692583308901
Tested params: {'lr': 0.001, 'epochs': 11052}, Score: 5.130409488778847
Tested params: {'lr': 0.001, 'epochs': 5684}, Score: 5.209099016971225
Skipping params: {'lr': 1e-05, 'epochs': 17315} due to high score: 45.78044724513928
Tested params: {'lr': 1e-05, 'epochs': 17315}, Score: 45.78044724513928
Tested params: {'lr': 0.01, 'epochs': 3894}, Score: 5.0560583252168865
Skipping params: {'lr': 1e-05, 'epochs': 3000} due to high score: 92.09659052283982
Tested params: {'lr': 1e-05, 'epochs': 3000}, Score: 92.09659052283982
Tested params: {'lr': 0.001, 'epochs': 19105}, Score: 5.104042898387968
Tested params: {'lr': 0.01, 'epochs': 3000}, Score: 5.075658899157532
Skipping params: {'lr': 1e-05, 'epochs': 7473} due to high score: 73.29338909174757
Tested params: {'lr': 1e-05, 'epochs': 7473}, Score: 73.29338909174757
Tested params: {'lr': 0.0001, 'e

In [8]:
import random

iterations = 15

param_space = {
    'lr': [0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(2000, 20000, 2000).tolist(),
    'n_hidden': np.arange(2*X_val.shape[1], 60, 15)
}

best_score = float('inf')
best_params_ffn = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
           
        train_data = train_set.to_numpy()
        X_train = train_data[:,:-1]
        y_train = train_data[:,-1:]

        data_test = data_val.to_numpy()
        X_val = data_test[:,:-1]
        y_val = data_test[:,-1:]

        ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=params['n_hidden'],n_hidden_2=params['n_hidden'],n_output=y_train.shape[1])

        _, val_losses, _ = ffn.train(X_train,y_train,X_val,y_val,epochs=params['epochs'],lr=params['lr'],patience=500)

        score = np.min(val_losses)
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 20:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_ffn = params
        

print(f"Best parameters: {best_params_ffn}, Best score: {best_score}")


Tested params: {'lr': 1e-06, 'epochs': 10000, 'n_hidden': 20}, Score: 4.932624596982651
Tested params: {'lr': 1e-07, 'epochs': 12000, 'n_hidden': 50}, Score: 5.406511466471694
Tested params: {'lr': 1e-05, 'epochs': 16000, 'n_hidden': 35}, Score: 4.634804633538947
Tested params: {'lr': 1e-06, 'epochs': 14000, 'n_hidden': 35}, Score: 4.854718897912445
Tested params: {'lr': 1e-05, 'epochs': 10000, 'n_hidden': 50}, Score: 4.703881600510603
Tested params: {'lr': 1e-06, 'epochs': 2000, 'n_hidden': 35}, Score: 5.278860637245122
Tested params: {'lr': 0.0001, 'epochs': 6000, 'n_hidden': 35}, Score: 4.536369881095922
Tested params: {'lr': 0.0001, 'epochs': 16000, 'n_hidden': 20}, Score: 4.530217503837951
Tested params: {'lr': 1e-05, 'epochs': 12000, 'n_hidden': 35}, Score: 4.620793099844055
Tested params: {'lr': 1e-07, 'epochs': 8000, 'n_hidden': 35}, Score: 6.134717690319052
Tested params: {'lr': 1e-06, 'epochs': 18000, 'n_hidden': 35}, Score: 4.802601369555132
Tested params: {'lr': 1e-06, 'epo

In [9]:
import random

iterations = 15

param_space = {
    'lr': [0.001,0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(5000, 20000, 2000).tolist(),
    'n_encoder': np.arange(2,X_val.shape[1]-1,1).tolist()
}

best_score = float('inf')
best_params_auto = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
    
        train_data = train_set.to_numpy()
        X_train = train_data[:,:-1]
        y_train = train_data[:,-1:]

        data_test = data_val.to_numpy()
        X_val = data_test[:,:-1]
        y_val = data_test[:,-1:]

        # Check if the shapes of X_train and X_val are not equal
        if X_train.shape[1] != X_val.shape[1]:
            # print(f"Shape mismatch between training and validation sets, skipping params: {params}")
            continue

        autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=params['n_encoder'])

        losses = autoE.train(X_train, max_epochs=params['epochs'], lr=params['lr'])

        score = np.min(losses)
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score.")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_auto = params
        

print(f"Best parameters: {best_params_auto}, Best score: {best_score}")


Tested params: {'lr': 0.0001, 'epochs': 7000, 'n_encoder': 7}, Score: 0.011142300926712954
Skipping params: {'lr': 1e-07, 'epochs': 17000, 'n_encoder': 3} due to high score.
Tested params: {'lr': 1e-07, 'epochs': 17000, 'n_encoder': 3}, Score: 0.6414673409349017
Tested params: {'lr': 1e-06, 'epochs': 19000, 'n_encoder': 5}, Score: 0.14016937194371948
Skipping params: {'lr': 1e-07, 'epochs': 17000, 'n_encoder': 6} due to high score.
Tested params: {'lr': 1e-07, 'epochs': 17000, 'n_encoder': 6}, Score: 0.6476254300443668
Skipping params: {'lr': 1e-07, 'epochs': 7000, 'n_encoder': 2} due to high score.
Tested params: {'lr': 1e-07, 'epochs': 7000, 'n_encoder': 2}, Score: 0.7591108680629965
Skipping params: {'lr': 1e-07, 'epochs': 13000, 'n_encoder': 5} due to high score.
Tested params: {'lr': 1e-07, 'epochs': 13000, 'n_encoder': 5}, Score: 0.7116709988461631
Tested params: {'lr': 0.0001, 'epochs': 15000, 'n_encoder': 2}, Score: 0.09298157896431988
Tested params: {'lr': 0.0001, 'epochs': 50

In [10]:
autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=best_params_auto['n_encoder'])
losses = autoE.train(X_train, max_epochs=best_params_auto['epochs'], lr=best_params_auto['lr'])
losses[-1]

0.008197955587910236

In [11]:
import random

iterations = 15

param_space = {
    'lr': [0.001,0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(5000, 20000, 2000).tolist(),
    'n_hidden_2': np.arange(X_val.shape[1], 60, 15)
}

best_score = float('inf')
best_params_combined = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []
    print(f"Testing params: {params}")

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):
    
        train_data = train_set.to_numpy()
        X_train = train_data[:,:-1]
        y_train = train_data[:,-1:]

        data_test = data_val.to_numpy()
        X_val = data_test[:,:-1]
        y_val = data_test[:,-1:]

        # Check if the shapes of X_train and X_val are not equal
        if X_train.shape[1] != X_val.shape[1]:
            # print(f"Shape mismatch between training and validation sets, skipping params: {params}")
            continue
        
        combined = CombinedModel(autoE,n_hidden_2=params['n_hidden_2'],n_output=y_val.shape[1])

        _, val_losses, _ = combined.train(X_train,y_train,X_val,y_val,epochs=params['epochs'], lr=params['lr'],patience=500)


        score = np.min(val_losses)
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 20:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_combined = params
        

print(f"Best parameters: {best_params_combined}, Best score: {best_score}")


Testing params: {'lr': 1e-05, 'epochs': 5000, 'n_hidden_2': 40}
Tested params: {'lr': 1e-05, 'epochs': 5000, 'n_hidden_2': 40}, Score: 4.535415198682274
Testing params: {'lr': 1e-07, 'epochs': 5000, 'n_hidden_2': 10}
Tested params: {'lr': 1e-07, 'epochs': 5000, 'n_hidden_2': 10}, Score: 7.827619121448042
Testing params: {'lr': 1e-06, 'epochs': 13000, 'n_hidden_2': 55}
Tested params: {'lr': 1e-06, 'epochs': 13000, 'n_hidden_2': 55}, Score: 4.812818983545966
Testing params: {'lr': 0.0001, 'epochs': 9000, 'n_hidden_2': 55}
Tested params: {'lr': 0.0001, 'epochs': 9000, 'n_hidden_2': 55}, Score: 4.514683362912136
Testing params: {'lr': 1e-05, 'epochs': 7000, 'n_hidden_2': 25}
Tested params: {'lr': 1e-05, 'epochs': 7000, 'n_hidden_2': 25}, Score: 4.502164767784976
Testing params: {'lr': 0.0001, 'epochs': 5000, 'n_hidden_2': 40}
Tested params: {'lr': 0.0001, 'epochs': 5000, 'n_hidden_2': 40}, Score: 4.519822238339893
Testing params: {'lr': 0.0001, 'epochs': 13000, 'n_hidden_2': 40}
Tested par

  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  train_metric = np.mean((y_train - A_output)**2)
  val_metric = np.mean((y_val - val_output)**2)
  error_hidden_2 = np.dot(error_output, self.W_output.T) * (1 - np.power(A2, 2))  # Derivative of tanh is (1 - tanh^2)


Tested params: {'lr': 0.001, 'epochs': 7000, 'n_hidden_2': 10}, Score: nan
Testing params: {'lr': 1e-05, 'epochs': 9000, 'n_hidden_2': 25}
Tested params: {'lr': 1e-05, 'epochs': 9000, 'n_hidden_2': 25}, Score: 4.488700173434539
Testing params: {'lr': 0.001, 'epochs': 7000, 'n_hidden_2': 40}


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  train_metric = np.mean((y_train - A_output)**2)
  val_metric = np.mean((y_val - val_output)**2)
  error_hidden_2 = np.dot(error_output, self.W_output.T) * (1 - np.power(A2, 2))  # Derivative of tanh is (1 - tanh^2)


Tested params: {'lr': 0.001, 'epochs': 7000, 'n_hidden_2': 40}, Score: nan
Testing params: {'lr': 1e-05, 'epochs': 17000, 'n_hidden_2': 55}
Tested params: {'lr': 1e-05, 'epochs': 17000, 'n_hidden_2': 55}, Score: 4.485420908101411
Testing params: {'lr': 1e-06, 'epochs': 9000, 'n_hidden_2': 55}
Tested params: {'lr': 1e-06, 'epochs': 9000, 'n_hidden_2': 55}, Score: 4.9228314424000486
Testing params: {'lr': 1e-07, 'epochs': 15000, 'n_hidden_2': 40}
Tested params: {'lr': 1e-07, 'epochs': 15000, 'n_hidden_2': 40}, Score: 6.370845794432813
Testing params: {'lr': 1e-06, 'epochs': 11000, 'n_hidden_2': 10}
Tested params: {'lr': 1e-06, 'epochs': 11000, 'n_hidden_2': 10}, Score: 4.928120295467215
Best parameters: {'lr': 1e-05, 'epochs': 17000, 'n_hidden_2': 55}, Best score: 4.485420908101411


## Model Performance ##

In [12]:
linear_scores = []
ffn_scores = []
combined_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=False)):

    train_data = train_set.to_numpy()
    X_train = train_data[:,:-1]
    y_train = train_data[:,-1:]

    test_data = test_set.to_numpy()
    X_test = test_data[:,:-1]
    y_test = test_data[:,-1:]

    # Check if the shapes of X_train and X_val are not equal
    if X_train.shape[1] != X_test.shape[1]:
        # print(f"Shape mismatch between training and validation sets, skipping params: {params}")
        continue
    
    linear = LinearNetwork(config)
    _, linear_val_losses = linear.linear_regression(X_train,y_train,X_test,y_test,epochs=best_params_linear['epochs'],lr=best_params_linear['lr'],patience=np.inf)

    ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=best_params_ffn['n_hidden'],n_hidden_2=best_params_ffn['n_hidden'],n_output=y_train.shape[1])
    _, ffn_val_losses, _ = ffn.train(X_train,y_train,X_test,y_test,epochs=best_params_ffn['epochs'],lr=best_params_ffn['lr'],patience=np.inf)

    autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=best_params_auto['n_encoder'])
    losses = autoE.train(X_train, max_epochs=best_params_auto['epochs'], lr=best_params_auto['lr'])
    combined = CombinedModel(autoE,n_hidden_2=best_params_combined['n_hidden_2'],n_output=y_test.shape[1])
    _, combined_val_losses, _ = combined.train(X_train,y_train,X_test,y_test,epochs=best_params_combined['epochs'], lr=best_params_combined['lr'],patience=np.inf)


    linear_score = np.min(linear_val_losses)
    ffn_score = np.min(ffn_val_losses)
    combined_score = np.min(combined_val_losses)
    
    linear_scores.append(linear_score)
    ffn_scores.append(ffn_score)
    combined_scores.append(combined_score)

avg_score_linear = np.mean(linear_scores)
avg_score_ffn = np.mean(ffn_scores)
avg_score_combined = np.mean(combined_scores)

print(f"Linear Model Tested params: {best_params_linear}, Average Score: {avg_score_linear}")
print(f"FFN Model Tested params: {best_params_ffn}, Average Score: {avg_score_ffn}")
print(f"Combined Model Tested params: {best_params_combined}, Average Score: {avg_score_combined}")

print(f"Linear Model Scores: {linear_scores}")
print(f"FFN Model Scores: {ffn_scores}")
print(f"Combined Model Scores: {combined_scores}")



Linear Model Tested params: {'lr': 0.01, 'epochs': 3894}, Average Score: 5.081909324580722
FFN Model Tested params: {'lr': 0.0001, 'epochs': 10000, 'n_hidden': 35}, Average Score: 4.394401392631373
Combined Model Tested params: {'lr': 1e-05, 'epochs': 17000, 'n_hidden_2': 55}, Average Score: 4.344777063982498
Linear Model Scores: [5.809406116884592, 4.793012210467181, 5.465858876551785, 4.666771027663747, 5.043882064901638, 5.1621634050926115, 4.726841476488124, 5.190187318642598, 5.232252486084321, 4.728718263030621]
FFN Model Scores: [4.643064184452969, 4.166576754780411, 4.569919154360123, 4.2192206778044685, 4.199261079256107, 4.581815574283167, 4.269606010838749, 4.398800396333017, 4.595445152367934, 4.300304941836788]
Combined Model Scores: [4.62771837604953, 4.094526039509127, 4.504686190503922, 4.149422573405508, 4.3154868136913676, 4.477113102003675, 4.17220430229352, 4.324080173840717, 4.501118467294102, 4.281414601233508]


In [21]:
from scipy import stats
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Combine all scores into a single array, and create an array of labels
scores = np.concatenate([linear_scores, ffn_scores, combined_scores])
labels = ['Linear'] * len(linear_scores) + ['FFN'] * len(ffn_scores) + ['Combined'] * len(combined_scores)

# Conduct ANOVA
anova_result = stats.f_oneway(linear_scores, ffn_scores, combined_scores)
print(f"ANOVA result: F-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}")

# If ANOVA shows significant differences, conduct post-hoc testing with Tukey's HSD
if anova_result.pvalue < 0.05:
    tukey = pairwise_tukeyhsd(endog=scores, groups=labels, alpha=0.05)
    print(tukey)


ANOVA result: F-statistic = 25.151350953823872, p-value = 6.804288513426419e-07
 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj   lower  upper  reject
-----------------------------------------------------
Combined    FFN   0.0496 0.9047 -0.2384 0.3377  False
Combined Linear   0.7371    0.0  0.4491 1.0252   True
     FFN Linear   0.6875    0.0  0.3994 0.9756   True
-----------------------------------------------------


In [22]:
from scipy import stats

t_stat, p_val = stats.ttest_ind(linear_scores, ffn_scores)
print(f"Linear Model vs. FFN Model: t-statistic = {t_stat}, p-value = {p_val}")

# Comparing Linear Model vs. Combined Model
t_stat, p_val = stats.ttest_ind(linear_scores, combined_scores)
print(f"Linear Model vs. Combined Model: t-statistic = {t_stat}, p-value = {p_val}")

# Comparing FFN Model vs. Combined Model
t_stat, p_val = stats.ttest_ind(ffn_scores, combined_scores)
print(f"FFN Model vs. Combined Model: t-statistic = {t_stat}, p-value = {p_val}")

Linear Model vs. FFN Model: t-statistic = 5.257840946704737, p-value = 5.3316086515416334e-05
Linear Model vs. Combined Model: t-statistic = 5.692834416500847, p-value = 2.126862990358088e-05
FFN Model vs. Combined Model: t-statistic = 0.6094214396589686, p-value = 0.5498609458488894


## Archive ##

In [13]:
# autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=4)

# autoE.train(X_train, max_epochs=10000, lr=0.0001)

In [14]:
# combined = CombinedModel(autoE,n_hidden_2=24,n_output=1)

# MSEs, val_metrics, final_mse = combined.train(X_train,y_train,X_val,y_val,epochs=3000,lr=0.00001)

In [15]:
# import matplotlib.pyplot as plt

# plt.plot(MSEs)
# plt.plot(val_metrics)

In [16]:
# ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=24,n_hidden_2=24,n_output=1)

# MSEs, val_metrics, final_mse = ffn.train(X_train,y_train,X_val,y_val,3000,0.000001)

In [17]:
# import matplotlib.pyplot as plt

# plt.plot(MSEs)
# plt.plot(val_metrics)

In [18]:
# linear = LinearNetwork(config)

# losses = linear.linear_regression(X_train,y_train,X_val,y_val)

In [19]:
# autoE.W_encoder