In [1]:
from src.data_preprocessor import DataProcessor
from data_configs.configs import *
from models.neural_networks import *
from src.cross_validation import CrossValidation
import numpy as np

config = car_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)

In [2]:
raw_data = data_processor.load_data()
data_1 = data_processor.impute_missing_values(raw_data)
data_2 = data_processor.encode_ordinal_features(data_1)
data_3 = data_processor.standardize_data(data_2,data_2,features=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)


In [3]:
data_3

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
0,-1.341253,-1.341253,-1.341253,-1.22439,-1.22439,-1.22439,unacc
1,-1.341253,-1.341253,-1.341253,-1.22439,-1.22439,0.00000,unacc
2,-1.341253,-1.341253,-1.341253,-1.22439,-1.22439,1.22439,unacc
3,-1.341253,-1.341253,-1.341253,-1.22439,0.00000,-1.22439,unacc
4,-1.341253,-1.341253,-1.341253,-1.22439,0.00000,0.00000,unacc
...,...,...,...,...,...,...,...
1723,1.341253,1.341253,1.341253,1.22439,0.00000,0.00000,good
1724,1.341253,1.341253,1.341253,1.22439,0.00000,1.22439,vgood
1725,1.341253,1.341253,1.341253,1.22439,1.22439,-1.22439,unacc
1726,1.341253,1.341253,1.341253,1.22439,1.22439,0.00000,good


In [4]:
data_train, data_val = cross_validator.random_partition(data_3, random_state=42)

In [5]:
data_val = data_processor.encode_nominal_features(data_val)

In [6]:
data_val

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class_acc,Class_good,Class_unacc,Class_vgood
599,-0.447084,-0.447084,0.447084,-1.22439,0.00000,1.22439,0,0,1,0
1201,0.447084,1.341253,-1.341253,0.00000,0.00000,0.00000,1,0,0,0
628,-0.447084,-0.447084,1.341253,-1.22439,1.22439,0.00000,0,0,1,0
1498,1.341253,-0.447084,1.341253,0.00000,0.00000,0.00000,1,0,0,0
1263,0.447084,1.341253,0.447084,1.22439,0.00000,-1.22439,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
100,-1.341253,-1.341253,1.341253,1.22439,-1.22439,0.00000,0,0,1,0
274,-1.341253,0.447084,0.447084,-1.22439,0.00000,0.00000,0,0,1,0
1206,0.447084,1.341253,-1.341253,1.22439,-1.22439,-1.22439,0,0,1,0
101,-1.341253,-1.341253,1.341253,1.22439,-1.22439,1.22439,0,0,1,0


In [7]:
data_test = data_val.to_numpy()
X_val = data_test[:,:-4]
y_val = data_test[:,-4:]

In [8]:
data_train

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Class
107,-1.341253,-1.341253,1.341253,1.22439,1.22439,1.22439,unacc
901,0.447084,-1.341253,-0.447084,0.00000,-1.22439,0.00000,unacc
1709,1.341253,1.341253,1.341253,-1.22439,1.22439,1.22439,unacc
706,-0.447084,0.447084,0.447084,-1.22439,0.00000,0.00000,unacc
678,-0.447084,0.447084,-0.447084,-1.22439,0.00000,-1.22439,unacc
...,...,...,...,...,...,...,...
1130,0.447084,0.447084,-0.447084,1.22439,0.00000,1.22439,vgood
1294,0.447084,1.341253,1.341253,1.22439,1.22439,0.00000,good
860,-0.447084,1.341253,1.341253,1.22439,0.00000,1.22439,acc
1459,1.341253,-0.447084,0.447084,-1.22439,-1.22439,0.00000,unacc


In [9]:
import random

iterations = 15

param_space = {
    'lr': [0.001,0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.linspace(5000, 20000, num=20).astype(int).tolist()
}

best_score = float('inf')
best_params_linear = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    print(f"Testing params: {params}")

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-4]
        y_train = train_data[:,-4:]

        linear = LinearNetwork(config)

        _, val_losses = linear.logistic_regression(X_train,y_train,X_val,y_val,epochs=params['epochs'],lr=params['lr'],patience=np.inf)

        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score.")
            break  # Exit the current for-loop
        
    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_linear = params
        

print(f"Best parameters: {best_params_linear}, Best score: {best_score}")


Testing params: {'lr': 0.0001, 'epochs': 15263}
Skipping params: {'lr': 0.0001, 'epochs': 15263} due to high score.
Tested params: {'lr': 0.0001, 'epochs': 15263}, Score: 0.41081176080693943
Testing params: {'lr': 0.0001, 'epochs': 8947}
Skipping params: {'lr': 0.0001, 'epochs': 8947} due to high score.
Tested params: {'lr': 0.0001, 'epochs': 8947}, Score: 0.41770325635835276
Testing params: {'lr': 0.001, 'epochs': 16842}
Skipping params: {'lr': 0.001, 'epochs': 16842} due to high score.
Tested params: {'lr': 0.001, 'epochs': 16842}, Score: 0.40158658154749993
Testing params: {'lr': 0.001, 'epochs': 17631}
Skipping params: {'lr': 0.001, 'epochs': 17631} due to high score.
Tested params: {'lr': 0.001, 'epochs': 17631}, Score: 0.40154822284753333
Testing params: {'lr': 1e-06, 'epochs': 11315}
Skipping params: {'lr': 1e-06, 'epochs': 11315} due to high score.
Tested params: {'lr': 1e-06, 'epochs': 11315}, Score: 0.6253265810638988
Testing params: {'lr': 0.001, 'epochs': 8947}
Skipping par

In [10]:
import random

iterations = 15

param_space = {
    'lr': [0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(5000, 20000, 2000).tolist(),
    'n_hidden': np.arange(X_val.shape[1], 60, 15)
}

best_score = float('inf')
best_params_ffn = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    print(f"Testing params: {params}")

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-4]
        y_train = train_data[:,-4:]

        ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=params['n_hidden'],n_hidden_2=params['n_hidden'],n_output=y_train.shape[1])

        _, val_losses, _ = ffn.train(X_train,y_train,X_val,y_val,epochs=params['epochs'],lr=params['lr'],patience=500)

        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score.")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_ffn = params
        

print(f"Best parameters: {best_params_ffn}, Best score: {best_score}")


Testing params: {'lr': 1e-05, 'epochs': 9000, 'n_hidden': 6}
Skipping params: {'lr': 1e-05, 'epochs': 9000, 'n_hidden': 6} due to high score.
Tested params: {'lr': 1e-05, 'epochs': 9000, 'n_hidden': 6}, Score: 0.8678054958021445
Testing params: {'lr': 1e-05, 'epochs': 15000, 'n_hidden': 51}
Skipping params: {'lr': 1e-05, 'epochs': 15000, 'n_hidden': 51} due to high score.
Tested params: {'lr': 1e-05, 'epochs': 15000, 'n_hidden': 51}, Score: 0.45562835589304573
Testing params: {'lr': 0.0001, 'epochs': 15000, 'n_hidden': 51}
Tested params: {'lr': 0.0001, 'epochs': 15000, 'n_hidden': 51}, Score: 0.09680569367472854
Testing params: {'lr': 0.0001, 'epochs': 5000, 'n_hidden': 51}
Skipping params: {'lr': 0.0001, 'epochs': 5000, 'n_hidden': 51} due to high score.
Tested params: {'lr': 0.0001, 'epochs': 5000, 'n_hidden': 51}, Score: 0.232982181600254
Testing params: {'lr': 1e-06, 'epochs': 9000, 'n_hidden': 51}
Skipping params: {'lr': 1e-06, 'epochs': 9000, 'n_hidden': 51} due to high score.
Te

In [11]:
import random

iterations = 15

param_space = {
    'lr': [0.001,0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(5000, 20000, 2000).tolist(),
    'n_encoder': np.arange(2,X_val.shape[1]-1,1).tolist()
}

best_score = float('inf')
best_params_auto = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-4]
        y_train = train_data[:,-4:]

        autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=params['n_encoder'])

        losses = autoE.train(X_train, max_epochs=params['epochs'], lr=params['lr'])

        score = losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score.")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_auto = params
        

print(f"Best parameters: {best_params_auto}, Best score: {best_score}")


Skipping params: {'lr': 1e-07, 'epochs': 9000, 'n_encoder': 2} due to high score.
Tested params: {'lr': 1e-07, 'epochs': 9000, 'n_encoder': 2}, Score: 0.9988959734291747
Skipping params: {'lr': 1e-05, 'epochs': 11000, 'n_encoder': 2} due to high score.
Tested params: {'lr': 1e-05, 'epochs': 11000, 'n_encoder': 2}, Score: 0.6683539166262461
Skipping params: {'lr': 0.0001, 'epochs': 17000, 'n_encoder': 4} due to high score.
Tested params: {'lr': 0.0001, 'epochs': 17000, 'n_encoder': 4}, Score: 0.318924853703842
Skipping params: {'lr': 0.001, 'epochs': 5000, 'n_encoder': 4} due to high score.
Tested params: {'lr': 0.001, 'epochs': 5000, 'n_encoder': 4}, Score: 0.3442933670017246
Skipping params: {'lr': 0.0001, 'epochs': 9000, 'n_encoder': 2} due to high score.
Tested params: {'lr': 0.0001, 'epochs': 9000, 'n_encoder': 2}, Score: 0.6461132372907222
Skipping params: {'lr': 1e-06, 'epochs': 19000, 'n_encoder': 4} due to high score.
Tested params: {'lr': 1e-06, 'epochs': 19000, 'n_encoder': 4

In [12]:
autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=best_params_auto['n_encoder'])
losses = autoE.train(X_train, max_epochs=best_params_auto['epochs'], lr=best_params_auto['lr'])
losses[-1]

0.3111830550283207

In [13]:
import random

iterations = 15

param_space = {
    'lr': [0.001,0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(5000, 20000, 2000).tolist(),
    'n_hidden_2': np.arange(X_val.shape[1], 60, 15)
}

best_score = float('inf')
best_params_combined = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []
    print(f"Testing params: {params}")

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-4]
        y_train = train_data[:,-4:]

        combined = CombinedModel(autoE,n_hidden_2=params['n_hidden_2'],n_output=y_val.shape[1])

        _, val_losses, _ = combined.train(X_train,y_train,X_val,y_val,epochs=params['epochs'], lr=params['lr'],patience=500)


        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_combined = params
        

print(f"Best parameters: {best_params_combined}, Best score: {best_score}")


Testing params: {'lr': 1e-07, 'epochs': 15000, 'n_hidden_2': 21}
Skipping params: {'lr': 1e-07, 'epochs': 15000, 'n_hidden_2': 21} due to high score: 1.1610906645825398
Tested params: {'lr': 1e-07, 'epochs': 15000, 'n_hidden_2': 21}, Score: 1.1610906645825398
Testing params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 6}
Skipping params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 6} due to high score: 0.8805642774982249
Tested params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 6}, Score: 0.8805642774982249
Testing params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 6}
Skipping params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 6} due to high score: 0.8804484655709752
Tested params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 6}, Score: 0.8804484655709752
Testing params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 21}
Skipping params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 21} due to high score: 0.8795834328435462
Tested params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2

## Model Performance ##

In [14]:
linear_scores = []
ffn_scores = []
combined_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):

    train_set = data_processor.encode_nominal_features(train_set)
    test_set = data_processor.encode_nominal_features(test_set)

    train_data = train_set.to_numpy()
    X_train = train_data[:,:-4]
    y_train = train_data[:,-4:]

    test_data = test_set.to_numpy()
    X_test = test_data[:,:-4]
    y_test = test_data[:,-4:]

    linear = LinearNetwork(config)
    _, linear_val_losses = linear.logistic_regression(X_train,y_train,X_test,y_test,epochs=best_params_linear['epochs'],lr=best_params_linear['lr'],patience=np.inf)

    ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=best_params_ffn['n_hidden'],n_hidden_2=best_params_ffn['n_hidden'],n_output=y_train.shape[1])
    _, ffn_val_losses, _ = ffn.train(X_train,y_train,X_test,y_test,epochs=best_params_ffn['epochs'],lr=best_params_ffn['lr'],patience=np.inf)

    autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=best_params_auto['n_encoder'])
    losses = autoE.train(X_train, max_epochs=best_params_auto['epochs'], lr=best_params_auto['lr'])
    combined = CombinedModel(autoE,n_hidden_2=best_params_combined['n_hidden_2'],n_output=y_test.shape[1])
    _, combined_val_losses, _ = combined.train(X_train,y_train,X_test,y_test,epochs=best_params_combined['epochs'], lr=best_params_combined['lr'],patience=np.inf)


    linear_score = np.min(linear_val_losses)
    ffn_score = np.min(ffn_val_losses)
    combined_score = np.min(combined_val_losses)
    
    linear_scores.append(linear_score)
    ffn_scores.append(ffn_score)
    combined_scores.append(combined_score)

avg_score_linear = np.mean(linear_scores)
avg_score_ffn = np.mean(ffn_scores)
avg_score_combined = np.mean(combined_scores)

print(f"Linear Model Tested params: {best_params_linear}, Average Score: {avg_score_linear}")
print(f"FFN Model Tested params: {best_params_ffn}, Average Score: {avg_score_ffn}")
print(f"Combined Model Tested params: {best_params_combined}, Average Score: {avg_score_combined}")

print(f"Linear Model Scores: {linear_scores}")
print(f"FFN Model Scores: {ffn_scores}")
print(f"Combined Model Scores: {combined_scores}")



Linear Model Tested params: {'lr': 0.001, 'epochs': 17631}, Average Score: 0.3973078572142352
FFN Model Tested params: {'lr': 0.0001, 'epochs': 15000, 'n_hidden': 51}, Average Score: 0.07246082076067989
Combined Model Tested params: {'lr': 0.001, 'epochs': 19000, 'n_hidden_2': 36}, Average Score: 0.10001335266412095
Linear Model Scores: [0.3986343388674607, 0.4041138755568258, 0.3848313992905454, 0.4167407095898887, 0.3802121430335066, 0.40762362921082995, 0.40725013280011935, 0.37885617086436985, 0.4025237252336589, 0.39229244769514704]
FFN Model Scores: [0.07682131200474152, 0.07063944627187413, 0.08969222970421535, 0.07518292713970606, 0.05707371343739654, 0.07887621356307986, 0.056863501508397654, 0.0831171840295786, 0.055184485504538236, 0.08115719444327095]
Combined Model Scores: [0.0955876646569639, 0.12608108467117493, 0.13471493406796306, 0.06294310293858896, 0.06872114685291315, 0.09097500268679101, 0.09417760912482494, 0.10049405767327041, 0.08863619864823079, 0.137802725320

In [28]:
from scipy import stats

t_stat, p_val = stats.ttest_ind(linear_scores, ffn_scores)
print(f"Linear Model vs. FFN Model: t-statistic = {t_stat}, p-value = {p_val}")

# Comparing Linear Model vs. Combined Model
t_stat, p_val = stats.ttest_ind(linear_scores, combined_scores)
print(f"Linear Model vs. Combined Model: t-statistic = {t_stat}, p-value = {p_val}")

# Comparing FFN Model vs. Combined Model
t_stat, p_val = stats.ttest_ind(ffn_scores, combined_scores)
print(f"FFN Model vs. Combined Model: t-statistic = {t_stat}, p-value = {p_val}")

Linear Model vs. FFN Model: t-statistic = 58.16474758665112, p-value = 6.05544773243553e-22
Linear Model vs. Combined Model: t-statistic = 32.78207132920138, p-value = 1.669517911184232e-17
FFN Model vs. Combined Model: t-statistic = -3.0665096959370204, p-value = 0.006648539731434936


In [29]:
from scipy import stats
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Combine all scores into a single array, and create an array of labels
scores = np.concatenate([linear_scores, ffn_scores, combined_scores])
labels = ['Linear'] * len(linear_scores) + ['FFN'] * len(ffn_scores) + ['Combined'] * len(combined_scores)

# Conduct ANOVA
anova_result = stats.f_oneway(linear_scores, ffn_scores, combined_scores)
print(f"ANOVA result: F-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}")

# If ANOVA shows significant differences, conduct post-hoc testing with Tukey's HSD
if anova_result.pvalue < 0.05:
    tukey = pairwise_tukeyhsd(endog=scores, groups=labels, alpha=0.05)
    print(tukey)


ANOVA result: F-statistic = 1002.5949860886334, p-value = 4.633324440573921e-26
 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2 meandiff p-adj   lower   upper  reject
------------------------------------------------------
Combined    FFN  -0.0276 0.0054 -0.0475 -0.0076   True
Combined Linear   0.2973    0.0  0.2773  0.3172   True
     FFN Linear   0.3248    0.0  0.3049  0.3448   True
------------------------------------------------------


## Archive ##

In [16]:
# data_train = data_processor.encode_nominal_features(data_train)
# # data_val = data_processor.encode_nominal_features(data_val)     

In [17]:
# data = data_train.to_numpy()
# X_train = data[:,:-4]
# y_train = data[:,-4:]

In [18]:
# data_test = data_val.to_numpy()
# X_val = data_test[:,:-4]
# y_val = data_test[:,-4:]

In [19]:
# X_train.shape

In [20]:
# autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=4)

# autoE.train(X_train, max_epochs=20000, lr=0.0001)

In [21]:
# combined = CombinedModel(autoE,n_hidden_2=20,n_output=y_val.shape[1])

# loss, val_metrics, final_loss = combined.train(X_train,y_train,X_val,y_val,epochs=15000,lr=0.0001)

In [22]:
# val_metrics

In [23]:
# import matplotlib.pyplot as plt

# plt.plot(loss)
# plt.plot(val_metrics)

In [24]:
# ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=20,n_hidden_2=20,n_output=y_train.shape[1])

# loss, val_metrics, final_mse = ffn.train(X_train,y_train,X_val,y_val,10000,0.0001)

In [25]:
# plt.plot(loss)
# plt.plot(val_metrics)

In [26]:
# linear = LinearNetwork(config)

# losses, val_losses = linear.logistic_regression(X_train,y_train,X_val,y_val,epochs=20000,lr=0.0001)
# val_losses[-1]

In [27]:
# import matplotlib.pyplot as plt
# plt.plot(losses)
# plt.plot(val_losses)