In [9]:
from src.data_preprocessor import DataProcessor
from data_configs.configs import *
from models.neural_networks import *
from src.cross_validation import CrossValidation
import numpy as np

config = breast_cancer_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)

In [10]:
raw_data = data_processor.load_data()
data_1 = data_processor.impute_missing_values(raw_data)
data_2 = data_1.drop(columns=['Sample code number'])
data_3 = data_processor.encode_ordinal_features(data_2)
data_4 = data_processor.standardize_data(data_3,data_3,features=['Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses'])

In [11]:
data_4

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,0.206788,-0.699494,-0.742767,-0.632794,-0.549168,-0.706485,-0.179534,-0.611387,-0.343666,2
1,0.206788,0.283642,0.266684,0.768071,1.708882,1.792229,-0.179534,-0.283909,-0.343666,2
2,-0.503505,-0.699494,-0.742767,-0.632794,-0.549168,-0.428851,-0.179534,-0.611387,-0.343666,2
3,0.561934,1.594490,1.612618,-0.632794,-0.097558,0.126419,-0.179534,1.353485,-0.343666,2
4,-0.148359,-0.699494,-0.742767,0.067638,-0.549168,-0.706485,-0.179534,-0.611387,-0.343666,2
...,...,...,...,...,...,...,...,...,...,...
694,-0.503505,-0.699494,-0.742767,-0.632794,-0.097558,-0.428851,-0.999756,-0.611387,-0.343666,2
695,-0.858651,-0.699494,-0.742767,-0.632794,-0.549168,-0.706485,-0.999756,-0.611387,-0.343666,2
696,0.206788,2.249915,2.285586,0.067638,1.708882,-0.151216,1.871021,2.335921,0.239398,4
697,-0.148359,1.594490,0.939651,0.417854,-0.097558,0.126419,2.691243,1.026006,-0.343666,4


In [12]:
data_train, data_val = cross_validator.random_partition(data_4, random_state=42)

In [13]:
data_val = data_processor.encode_nominal_features(data_val)

In [14]:
data_test = data_val.to_numpy()
X_val = data_test[:,:-2]
y_val = data_test[:,-2:]

In [15]:
import random

iterations = 15

param_space = {
    'lr': [0.01,0.001,0.0001,0.00001,0.000001],
    'epochs': np.linspace(1000, 20000, num=20).astype(int).tolist()
}

best_score = float('inf')
best_params_linear = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-2]
        y_train = train_data[:,-2:]

        linear = LinearNetwork(config)

        _, val_losses = linear.logistic_regression(X_train,y_train,X_val,y_val,epochs=params['epochs'],lr=params['lr'],patience=500)

        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop
        
    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_linear = params
        

print(f"Best parameters: {best_params_linear}, Best score: {best_score}")


Tested params: {'lr': 0.001, 'epochs': 3000}, Score: 0.09679556054785152
Tested params: {'lr': 1e-06, 'epochs': 11000}, Score: 0.11760015907610939
Tested params: {'lr': 0.0001, 'epochs': 19000}, Score: 0.08120419794483275
Tested params: {'lr': 0.01, 'epochs': 18000}, Score: 0.11034167774443762
Tested params: {'lr': 1e-05, 'epochs': 20000}, Score: 0.07915639845216639
Tested params: {'lr': 0.0001, 'epochs': 8000}, Score: 0.08122361785860831
Tested params: {'lr': 0.001, 'epochs': 11000}, Score: 0.09679498720293966
Tested params: {'lr': 1e-06, 'epochs': 14000}, Score: 0.10771810473704266
Tested params: {'lr': 0.01, 'epochs': 1000}, Score: 0.11034122412734101
Tested params: {'lr': 1e-06, 'epochs': 9000}, Score: 0.12753615690063147
Tested params: {'lr': 0.01, 'epochs': 11000}, Score: 0.11033965936946608
Tested params: {'lr': 0.01, 'epochs': 11000}, Score: 0.1103397932298309
Tested params: {'lr': 1e-06, 'epochs': 6000}, Score: 0.15365715739282887
Tested params: {'lr': 0.01, 'epochs': 9000}, S

In [16]:
import random

iterations = 15

param_space = {
    'lr': [0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(1000, 20000, 2000).tolist(),
    'n_hidden': np.linspace(X_val.shape[1],5*X_val.shape[1],num=20).astype(int).tolist()
}

best_score = float('inf')
best_params_ffn = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-2]
        y_train = train_data[:,-2:]

        ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=params['n_hidden'],n_hidden_2=params['n_hidden'],n_output=y_train.shape[1])

        _, val_losses, _ = ffn.train(X_train,y_train,X_val,y_val,epochs=params['epochs'],lr=params['lr'],patience=500)

        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_ffn = params
        

print(f"Best parameters: {best_params_ffn}, Best score: {best_score}")


Skipping params: {'lr': 1e-06, 'epochs': 7000, 'n_hidden': 39} due to high score: 0.6419493248450852
Tested params: {'lr': 1e-06, 'epochs': 7000, 'n_hidden': 39}, Score: 0.6419493248450852
Skipping params: {'lr': 1e-06, 'epochs': 9000, 'n_hidden': 27} due to high score: 0.6386984333448966
Tested params: {'lr': 1e-06, 'epochs': 9000, 'n_hidden': 27}, Score: 0.6386984333448966
Tested params: {'lr': 1e-05, 'epochs': 5000, 'n_hidden': 29}, Score: 0.08194692698383096
Skipping params: {'lr': 1e-07, 'epochs': 5000, 'n_hidden': 31} due to high score: 0.6861643016724251
Tested params: {'lr': 1e-07, 'epochs': 5000, 'n_hidden': 31}, Score: 0.6861643016724251
Skipping params: {'lr': 1e-07, 'epochs': 5000, 'n_hidden': 41} due to high score: 0.6860510919757659
Tested params: {'lr': 1e-07, 'epochs': 5000, 'n_hidden': 41}, Score: 0.6860510919757659
Skipping params: {'lr': 1e-06, 'epochs': 3000, 'n_hidden': 39} due to high score: 0.6612321895052478
Tested params: {'lr': 1e-06, 'epochs': 3000, 'n_hidden

In [17]:
import random

iterations = 15

param_space = {
    'lr': [0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(1000, 20000, 2000).tolist(),
    'n_encoder': np.arange(2,X_val.shape[1]-1,1).tolist()
}

best_score = float('inf')
best_params_auto = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-2]
        y_train = train_data[:,-2:]

        autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=params['n_encoder'])

        losses = autoE.train(X_train, max_epochs=params['epochs'], lr=params['lr'])

        score = losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_auto = params
        

print(f"Best parameters: {best_params_auto}, Best score: {best_score}")


Skipping params: {'lr': 1e-07, 'epochs': 17000, 'n_encoder': 7} due to high score: 1.0101647121849406
Tested params: {'lr': 1e-07, 'epochs': 17000, 'n_encoder': 7}, Score: 1.0101647121849406
Skipping params: {'lr': 1e-07, 'epochs': 9000, 'n_encoder': 7} due to high score: 1.0106960978590305
Tested params: {'lr': 1e-07, 'epochs': 9000, 'n_encoder': 7}, Score: 1.0106960978590305
Tested params: {'lr': 0.0001, 'epochs': 5000, 'n_encoder': 6}, Score: 0.09806402884934455
Skipping params: {'lr': 0.0001, 'epochs': 9000, 'n_encoder': 3} due to high score: 0.20287381796934215
Tested params: {'lr': 0.0001, 'epochs': 9000, 'n_encoder': 3}, Score: 0.20287381796934215
Skipping params: {'lr': 1e-06, 'epochs': 1000, 'n_encoder': 4} due to high score: 1.010840020454489
Tested params: {'lr': 1e-06, 'epochs': 1000, 'n_encoder': 4}, Score: 1.010840020454489
Skipping params: {'lr': 1e-05, 'epochs': 7000, 'n_encoder': 4} due to high score: 0.3580866439154864
Tested params: {'lr': 1e-05, 'epochs': 7000, 'n_e

In [18]:
autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=best_params_auto['n_encoder'])
losses = autoE.train(X_train, max_epochs=best_params_auto['epochs'], lr=best_params_auto['lr'])
losses[-1]

0.09248448445211395

In [19]:
import random

iterations = 15

param_space = {
    'lr': [0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(1000, 20000, 2000).tolist(),
    'n_hidden_2': np.arange(2,X_val.shape[1]-1,1).tolist()
}

best_score = float('inf')
best_params_combined = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-2]
        y_train = train_data[:,-2:]

        combined = CombinedModel(autoE,n_hidden_2=params['n_hidden_2'],n_output=y_val.shape[1])

        _, val_losses, _ = combined.train(X_train,y_train,X_val,y_val,epochs=params['epochs'], lr=params['lr'],patience=500)


        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_combined = params
        

print(f"Best parameters: {best_params_combined}, Best score: {best_score}")


Tested params: {'lr': 1e-05, 'epochs': 13000, 'n_hidden_2': 4}, Score: 0.09524552305479583
Tested params: {'lr': 1e-05, 'epochs': 17000, 'n_hidden_2': 5}, Score: 0.08943902898977531
Skipping params: {'lr': 1e-06, 'epochs': 13000, 'n_hidden_2': 5} due to high score: 0.6343483148762368
Tested params: {'lr': 1e-06, 'epochs': 13000, 'n_hidden_2': 5}, Score: 0.6343483148762368
Tested params: {'lr': 0.0001, 'epochs': 9000, 'n_hidden_2': 7}, Score: 0.09147129630081938
Skipping params: {'lr': 1e-07, 'epochs': 15000, 'n_hidden_2': 5} due to high score: 0.6746543037651868
Tested params: {'lr': 1e-07, 'epochs': 15000, 'n_hidden_2': 5}, Score: 0.6746543037651868
Skipping params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 7} due to high score: 0.6315978948827793
Tested params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 7}, Score: 0.6315978948827793
Tested params: {'lr': 0.0001, 'epochs': 13000, 'n_hidden_2': 2}, Score: 0.0919710234722924
Tested params: {'lr': 0.0001, 'epochs': 13000, 'n_hidden_

## Model Performance ##

In [35]:
linear_scores = []
ffn_scores = []
combined_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):

    train_set = data_processor.encode_nominal_features(train_set)
    test_set = data_processor.encode_nominal_features(test_set)

    train_data = train_set.to_numpy()
    X_train = train_data[:,:-2]
    y_train = train_data[:,-2:]

    test_data = test_set.to_numpy()
    X_test = test_data[:,:-2]
    y_test = test_data[:,-2:]

    linear = LinearNetwork(config)
    _, linear_val_losses = linear.logistic_regression(X_train,y_train,X_test,y_test,epochs=best_params_linear['epochs'],lr=best_params_linear['lr'],patience=500)

    ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=best_params_ffn['n_hidden'],n_hidden_2=best_params_ffn['n_hidden'],n_output=y_train.shape[1])
    _, ffn_val_losses, _ = ffn.train(X_train,y_train,X_test,y_test,epochs=best_params_ffn['epochs'],lr=best_params_ffn['lr'],patience=500)

    autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=best_params_auto['n_encoder'])
    losses = autoE.train(X_train, max_epochs=best_params_auto['epochs'], lr=best_params_auto['lr'])
    combined = CombinedModel(autoE,n_hidden_2=best_params_combined['n_hidden_2'],n_output=y_test.shape[1])
    _, combined_val_losses, _ = combined.train(X_train,y_train,X_test,y_test,epochs=best_params_combined['epochs'], lr=best_params_combined['lr'],patience=500)


    linear_score = linear_val_losses[-1]
    ffn_score = ffn_val_losses[-1]
    combined_score = combined_val_losses[-1]
    
    linear_scores.append(linear_score)
    ffn_scores.append(ffn_score)
    combined_scores.append(combined_score)

avg_score_linear = np.mean(linear_scores)
avg_score_ffn = np.mean(ffn_scores)
avg_score_combined = np.mean(combined_scores)

print(f"Linear Model Tested params: {best_params_linear}, Average Score: {avg_score_linear}")
print(f"FFN Model Tested params: {best_params_ffn}, Average Score: {avg_score_ffn}")
print(f"Combined Model Tested params: {best_params_combined}, Average Score: {avg_score_combined}")

print(f"Linear Model Scores: {linear_scores}")
print(f"FFN Model Scores: {ffn_scores}")
print(f"Combined Model Scores: {combined_scores}")



Tested params: {'lr': 1e-05, 'epochs': 20000}, Average Score: 0.09566483027171763
Tested params: {'lr': 1e-05, 'epochs': 9000, 'n_hidden': 16}, Average Score: 0.09671512283573097
Tested params: {'lr': 1e-05, 'epochs': 17000, 'n_hidden_2': 5}, Average Score: 0.10181484506005731


In [38]:
print(f"Linear Model Tested params: {best_params_linear}, Average Score: {avg_score_linear}")
print(f"FFN Model Tested params: {best_params_ffn}, Average Score: {avg_score_ffn}")
print(f"Combined Model Tested params: {best_params_combined}, Average Score: {avg_score_combined}")

print(f"Linear Model Scores: {linear_scores}")
print(f"FFN Model Scores: {ffn_scores}")
print(f"Combined Model Scores: {combined_scores}")


Linear Model Tested params: {'lr': 1e-05, 'epochs': 20000}, Average Score: 0.09566483027171763
FFN Model Tested params: {'lr': 1e-05, 'epochs': 9000, 'n_hidden': 16}, Average Score: 0.09671512283573097
Combined Model Tested params: {'lr': 1e-05, 'epochs': 17000, 'n_hidden_2': 5}, Average Score: 0.10181484506005731
Linear Model Scores: [0.11848689379877886, 0.06855733122936343, 0.0914292595740916, 0.08845363592663304, 0.08788965255481519, 0.10184645296921152, 0.10972672240754154, 0.09235595708561394, 0.10226829522597905, 0.09563410194514825]
FFN Model Scores: [0.1214222374639577, 0.06857891292044042, 0.09670960713225625, 0.08908281584332961, 0.09379335933795958, 0.10085272479665836, 0.11127096356472, 0.08643346311188324, 0.10175157191111493, 0.09725557227498964]
Combined Model Scores: [0.12466373478670287, 0.073320915089253, 0.10039047958257587, 0.09566831503995853, 0.09366246710267205, 0.11513349672814732, 0.11584861014671317, 0.09133245161222694, 0.10776527968709641, 0.100362700825227

In [42]:
from scipy import stats

t_stat, p_val = stats.ttest_ind(linear_scores, ffn_scores)
print(f"Linear Model vs. FFN Model: t-statistic = {t_stat}, p-value = {p_val}")

# Comparing Linear Model vs. Combined Model
t_stat, p_val = stats.ttest_ind(linear_scores, combined_scores)
print(f"Linear Model vs. Combined Model: t-statistic = {t_stat}, p-value = {p_val}")

# Comparing FFN Model vs. Combined Model
t_stat, p_val = stats.ttest_ind(ffn_scores, combined_scores)
print(f"FFN Model vs. Combined Model: t-statistic = {t_stat}, p-value = {p_val}")

Linear Model vs. FFN Model: t-statistic = -0.1682820775506725, p-value = 0.8682378089240967
Linear Model vs. Combined Model: t-statistic = -0.9672393254197901, p-value = 0.34624170755070816
FFN Model vs. Combined Model: t-statistic = -0.7857394607428007, p-value = 0.44224118353419584


## ARCHIVED ##

In [21]:
# data = data_train.to_numpy()
# X_train = data[:,:-2]
# y_train = data[:,-2:]

In [22]:
# data_test = data_val.to_numpy()
# X_val = data_test[:,:-2]
# y_val = data_test[:,-2:]

In [23]:
# X_train.shape

In [24]:
# autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=5)

# autoE.train(X_train, max_epochs=20000, lr=0.0001)

In [25]:
# combined = CombinedModel(autoE,n_hidden_2=50,n_output=y_val.shape[1])

# loss, val_metrics, final_loss = combined.train(X_train,y_train,X_val,y_val,epochs=10000,lr=0.00001)

In [26]:
# np.min(loss)

In [27]:
# import matplotlib.pyplot as plt

# plt.plot(loss)
# plt.plot(val_metrics)

In [28]:
# ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=24,n_hidden_2=24,n_output=y_train.shape[1])

# loss, val_metrics, final_mse = ffn.train(X_train,y_train,X_val,y_val,5000,0.00001)

In [29]:
# plt.plot(loss)
# plt.plot(val_metrics)

In [30]:
# linear = LinearNetwork(config)

# losses, val_losses = linear.logistic_regression(X_train,y_train,X_val,y_val,epochs=1000,lr=0.001)

In [31]:
# plt.plot(losses)
# plt.plot(val_losses)

## Tuning ##