In [1]:
from src.data_preprocessor import DataProcessor
from data_configs.configs import *
from models.neural_networks import *
from src.cross_validation import CrossValidation
import numpy as np

config = breast_cancer_config
data_processor = DataProcessor(config=config)
cross_validator = CrossValidation(config=config)

In [2]:
raw_data = data_processor.load_data()
data_1 = data_processor.impute_missing_values(raw_data)
data_2 = data_1.drop(columns=['Sample code number'])
data_3 = data_processor.encode_ordinal_features(data_2)
data_4 = data_processor.standardize_data(data_3,data_3,features=['Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

In [3]:
data_4

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,0.206788,-0.699494,-0.742767,-0.632794,-0.549168,-0.706485,-0.179534,-0.611387,-0.343666,2
1,0.206788,0.283642,0.266684,0.768071,1.708882,1.792229,-0.179534,-0.283909,-0.343666,2
2,-0.503505,-0.699494,-0.742767,-0.632794,-0.549168,-0.428851,-0.179534,-0.611387,-0.343666,2
3,0.561934,1.594490,1.612618,-0.632794,-0.097558,0.126419,-0.179534,1.353485,-0.343666,2
4,-0.148359,-0.699494,-0.742767,0.067638,-0.549168,-0.706485,-0.179534,-0.611387,-0.343666,2
...,...,...,...,...,...,...,...,...,...,...
694,-0.503505,-0.699494,-0.742767,-0.632794,-0.097558,-0.428851,-0.999756,-0.611387,-0.343666,2
695,-0.858651,-0.699494,-0.742767,-0.632794,-0.549168,-0.706485,-0.999756,-0.611387,-0.343666,2
696,0.206788,2.249915,2.285586,0.067638,1.708882,-0.151216,1.871021,2.335921,0.239398,4
697,-0.148359,1.594490,0.939651,0.417854,-0.097558,0.126419,2.691243,1.026006,-0.343666,4


In [4]:
data_train, data_val = cross_validator.random_partition(data_4, random_state=42)

In [5]:
data_val = data_processor.encode_nominal_features(data_val)

In [6]:
data_test = data_val.to_numpy()
X_val = data_test[:,:-2]
y_val = data_test[:,-2:]

In [7]:
import random

iterations = 15

param_space = {
    'lr': [0.01,0.001,0.0001,0.00001,0.000001],
    'epochs': np.linspace(1000, 20000, num=20).astype(int).tolist()
}

best_score = float('inf')
best_params_linear = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-2]
        y_train = train_data[:,-2:]

        linear = LinearNetwork(config)

        _, val_losses = linear.logistic_regression(X_train,y_train,X_val,y_val,epochs=params['epochs'],lr=params['lr'],patience=500)

        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop
        
    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_linear = params
        

print(f"Best parameters: {best_params_linear}, Best score: {best_score}")


Tested params: {'lr': 1e-05, 'epochs': 18000}, Score: 0.07918000111135785
Tested params: {'lr': 1e-06, 'epochs': 4000}, Score: 0.18913727945278405
Tested params: {'lr': 1e-06, 'epochs': 5000}, Score: 0.16839151626177334
Tested params: {'lr': 1e-05, 'epochs': 1000}, Score: 0.12206445143418995
Tested params: {'lr': 0.001, 'epochs': 5000}, Score: 0.09679476797677339
Tested params: {'lr': 0.0001, 'epochs': 6000}, Score: 0.08121199357103256
Tested params: {'lr': 0.001, 'epochs': 18000}, Score: 0.09678505557768542
Tested params: {'lr': 0.01, 'epochs': 4000}, Score: 0.1103418767293333
Tested params: {'lr': 0.01, 'epochs': 3000}, Score: 0.11033846231402203
Tested params: {'lr': 0.0001, 'epochs': 5000}, Score: 0.08119164057885364
Tested params: {'lr': 0.01, 'epochs': 17000}, Score: 0.11034082098896121
Tested params: {'lr': 0.0001, 'epochs': 1000}, Score: 0.08055305914197848
Tested params: {'lr': 1e-06, 'epochs': 11000}, Score: 0.11752301620999712
Tested params: {'lr': 1e-06, 'epochs': 15000}, S

In [8]:
import random

iterations = 15

param_space = {
    'lr': [0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(1000, 20000, 2000).tolist(),
    'n_hidden': np.linspace(X_val.shape[1],5*X_val.shape[1],num=20).astype(int).tolist()
}

best_score = float('inf')
best_params_ffn = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-2]
        y_train = train_data[:,-2:]

        ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=params['n_hidden'],n_hidden_2=params['n_hidden'],n_output=y_train.shape[1])

        _, val_losses, _ = ffn.train(X_train,y_train,X_val,y_val,epochs=params['epochs'],lr=params['lr'],patience=500)

        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_ffn = params
        

print(f"Best parameters: {best_params_ffn}, Best score: {best_score}")


Skipping params: {'lr': 1e-05, 'epochs': 3000, 'n_hidden': 18} due to high score: 0.6272058086162202
Tested params: {'lr': 1e-05, 'epochs': 3000, 'n_hidden': 18}, Score: 0.6272058086162202
Tested params: {'lr': 0.0001, 'epochs': 1000, 'n_hidden': 16}, Score: 0.07799183867071817
Skipping params: {'lr': 1e-07, 'epochs': 3000, 'n_hidden': 35} due to high score: 0.6888789387687329
Tested params: {'lr': 1e-07, 'epochs': 3000, 'n_hidden': 35}, Score: 0.6888789387687329
Skipping params: {'lr': 1e-06, 'epochs': 11000, 'n_hidden': 41} due to high score: 0.631694338789376
Tested params: {'lr': 1e-06, 'epochs': 11000, 'n_hidden': 41}, Score: 0.631694338789376
Skipping params: {'lr': 1e-07, 'epochs': 11000, 'n_hidden': 29} due to high score: 0.6788966268274266
Tested params: {'lr': 1e-07, 'epochs': 11000, 'n_hidden': 29}, Score: 0.6788966268274266
Skipping params: {'lr': 1e-06, 'epochs': 7000, 'n_hidden': 12} due to high score: 0.6438606983164123
Tested params: {'lr': 1e-06, 'epochs': 7000, 'n_hid

In [9]:
import random

iterations = 15

param_space = {
    'lr': [0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(1000, 20000, 2000).tolist(),
    'n_encoder': np.arange(2,X_val.shape[1]-1,1).tolist()
}

best_score = float('inf')
best_params_auto = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-2]
        y_train = train_data[:,-2:]

        autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=params['n_encoder'])

        losses = autoE.train(X_train, max_epochs=params['epochs'], lr=params['lr'])

        score = losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_auto = params
        

print(f"Best parameters: {best_params_auto}, Best score: {best_score}")


Skipping params: {'lr': 1e-07, 'epochs': 3000, 'n_encoder': 4} due to high score: 1.011108004191568
Tested params: {'lr': 1e-07, 'epochs': 3000, 'n_encoder': 4}, Score: 1.011108004191568
Skipping params: {'lr': 1e-06, 'epochs': 19000, 'n_encoder': 6} due to high score: 0.4728657953227713
Tested params: {'lr': 1e-06, 'epochs': 19000, 'n_encoder': 6}, Score: 0.4728657953227713
Skipping params: {'lr': 1e-07, 'epochs': 11000, 'n_encoder': 5} due to high score: 1.01066547851833
Tested params: {'lr': 1e-07, 'epochs': 11000, 'n_encoder': 5}, Score: 1.01066547851833
Skipping params: {'lr': 1e-07, 'epochs': 17000, 'n_encoder': 4} due to high score: 1.010321610686602
Tested params: {'lr': 1e-07, 'epochs': 17000, 'n_encoder': 4}, Score: 1.010321610686602
Skipping params: {'lr': 1e-07, 'epochs': 1000, 'n_encoder': 7} due to high score: 1.011273146741658
Tested params: {'lr': 1e-07, 'epochs': 1000, 'n_encoder': 7}, Score: 1.011273146741658
Skipping params: {'lr': 1e-07, 'epochs': 3000, 'n_encoder':

In [10]:
autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=best_params_auto['n_encoder'])
losses = autoE.train(X_train, max_epochs=best_params_auto['epochs'], lr=best_params_auto['lr'])
losses[-1]

0.1706938365331789

In [11]:
import random

iterations = 15

param_space = {
    'lr': [0.0001,0.00001,0.000001,0.0000001],
    'epochs': np.arange(1000, 20000, 2000).tolist(),
    'n_hidden_2': np.arange(X_val.shape[1]+1, 60, 15)
}

best_score = float('inf')
best_params_combined = {}

for _ in range(iterations):

    # Randomly select parameters
    params = {key: random.choice(value) for key, value in param_space.items()}
    scores = []

    for i, (train_set, _) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):
    
        train_set = data_processor.encode_nominal_features(train_set)

        train_data = train_set.to_numpy()
        X_train = train_data[:,:-2]
        y_train = train_data[:,-2:]

        combined = CombinedModel(autoE,n_hidden_2=params['n_hidden_2'],n_output=y_val.shape[1])

        _, val_losses, _ = combined.train(X_train,y_train,X_val,y_val,epochs=params['epochs'], lr=params['lr'],patience=500)


        score = val_losses[-1]
        scores.append(score)

        # Skip to the next parameter set if score > 0.2
        if score > 0.2:
            print(f"Skipping params: {params} due to high score: {score}")
            break  # Exit the current for-loop

    avg_score = np.mean(scores)

    print(f"Tested params: {params}, Score: {avg_score}")
    
    if avg_score < best_score:
        best_score = avg_score
        best_params_combined = params
        

print(f"Best parameters: {best_params_combined}, Best score: {best_score}")


Skipping params: {'lr': 1e-05, 'epochs': 3000, 'n_hidden_2': 40} due to high score: 0.6185642916115307
Tested params: {'lr': 1e-05, 'epochs': 3000, 'n_hidden_2': 40}, Score: 0.6185642916115307
Tested params: {'lr': 0.0001, 'epochs': 3000, 'n_hidden_2': 25}, Score: 0.0909688673222744
Tested params: {'lr': 0.0001, 'epochs': 11000, 'n_hidden_2': 10}, Score: 0.09024419182337626
Skipping params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 25} due to high score: 0.6313254634054578
Tested params: {'lr': 1e-06, 'epochs': 17000, 'n_hidden_2': 25}, Score: 0.6313254634054578
Skipping params: {'lr': 1e-06, 'epochs': 13000, 'n_hidden_2': 55} due to high score: 0.6333929824355566
Tested params: {'lr': 1e-06, 'epochs': 13000, 'n_hidden_2': 55}, Score: 0.6333929824355566
Skipping params: {'lr': 1e-06, 'epochs': 9000, 'n_hidden_2': 40} due to high score: 0.6390195697651001
Tested params: {'lr': 1e-06, 'epochs': 9000, 'n_hidden_2': 40}, Score: 0.6390195697651001
Skipping params: {'lr': 1e-06, 'epochs':

## Model Performance ##

In [12]:
linear_scores = []
ffn_scores = []
combined_scores = []

for i, (train_set, test_set) in enumerate(cross_validator.cross_validation(data_train, n_splits=2, n_repeats=5, random_state=42, stratify=True)):

    train_set = data_processor.encode_nominal_features(train_set)
    test_set = data_processor.encode_nominal_features(test_set)

    train_data = train_set.to_numpy()
    X_train = train_data[:,:-2]
    y_train = train_data[:,-2:]

    test_data = test_set.to_numpy()
    X_test = test_data[:,:-2]
    y_test = test_data[:,-2:]

    linear = LinearNetwork(config)
    _, linear_val_losses = linear.logistic_regression(X_train,y_train,X_test,y_test,epochs=best_params_linear['epochs'],lr=best_params_linear['lr'],patience=np.inf)

    ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=best_params_ffn['n_hidden'],n_hidden_2=best_params_ffn['n_hidden'],n_output=y_train.shape[1])
    _, ffn_val_losses, _ = ffn.train(X_train,y_train,X_test,y_test,epochs=best_params_ffn['epochs'],lr=best_params_ffn['lr'],patience=np.inf)

    autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=best_params_auto['n_encoder'])
    losses = autoE.train(X_train, max_epochs=best_params_auto['epochs'], lr=best_params_auto['lr'])
    combined = CombinedModel(autoE,n_hidden_2=best_params_combined['n_hidden_2'],n_output=y_test.shape[1])
    _, combined_val_losses, _ = combined.train(X_train,y_train,X_test,y_test,epochs=best_params_combined['epochs'], lr=best_params_combined['lr'],patience=np.inf)


    linear_score = linear_val_losses[-1]
    ffn_score = ffn_val_losses[-1]
    combined_score = combined_val_losses[-1]
    
    linear_scores.append(linear_score)
    ffn_scores.append(ffn_score)
    combined_scores.append(combined_score)

avg_score_linear = np.mean(linear_scores)
avg_score_ffn = np.mean(ffn_scores)
avg_score_combined = np.mean(combined_scores)

print(f"Linear Model Tested params: {best_params_linear}, Average Score: {avg_score_linear}")
print(f"FFN Model Tested params: {best_params_ffn}, Average Score: {avg_score_ffn}")
print(f"Combined Model Tested params: {best_params_combined}, Average Score: {avg_score_combined}")

print(f"Linear Model Scores: {linear_scores}")
print(f"FFN Model Scores: {ffn_scores}")
print(f"Combined Model Scores: {combined_scores}")



Linear Model Tested params: {'lr': 1e-05, 'epochs': 18000}, Average Score: 0.09809413994911045
FFN Model Tested params: {'lr': 1e-05, 'epochs': 19000, 'n_hidden': 12}, Average Score: 0.1030250682605847
Combined Model Tested params: {'lr': 0.0001, 'epochs': 11000, 'n_hidden_2': 10}, Average Score: 0.130989453441213
Linear Model Scores: [0.1185287374993094, 0.06864945211059292, 0.09171165434482872, 0.08849421768965238, 0.08810409285273223, 0.11793364841383407, 0.11311571399844014, 0.09297990841031414, 0.10225246500658963, 0.09917150916481103]
FFN Model Scores: [0.12444202753050522, 0.0642392203406292, 0.09724921209556625, 0.08712223912576633, 0.08892320689871766, 0.15386638773255762, 0.11090618101110572, 0.09241723607885126, 0.10756072215220035, 0.10352424963994729]
Combined Model Scores: [0.1771090263377562, 0.10981703045794589, 0.09633452171992318, 0.10861374350258855, 0.09062366962739594, 0.20350910669208547, 0.15949422890460724, 0.09690454514275289, 0.14452946184303198, 0.12295920018

In [14]:
from scipy import stats

t_stat, p_val = stats.ttest_ind(linear_scores, ffn_scores)
print(f"Linear Model vs. FFN Model: t-statistic = {t_stat}, p-value = {p_val}")

# Comparing Linear Model vs. Combined Model
t_stat, p_val = stats.ttest_ind(linear_scores, combined_scores)
print(f"Linear Model vs. Combined Model: t-statistic = {t_stat}, p-value = {p_val}")

# Comparing FFN Model vs. Combined Model
t_stat, p_val = stats.ttest_ind(ffn_scores, combined_scores)
print(f"FFN Model vs. Combined Model: t-statistic = {t_stat}, p-value = {p_val}")

Linear Model vs. FFN Model: t-statistic = -0.5434423049075543, p-value = 0.5934925289940808
Linear Model vs. Combined Model: t-statistic = -2.5010920470186218, p-value = 0.022257435969579537
FFN Model vs. Combined Model: t-statistic = -1.9443903676965, p-value = 0.0676453571673575


In [27]:
from scipy import stats
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Combine all scores into a single array, and create an array of labels
scores = np.concatenate([linear_scores, ffn_scores, combined_scores])
labels = ['Linear'] * len(linear_scores) + ['FFN'] * len(ffn_scores) + ['Combined'] * len(combined_scores)

# Conduct ANOVA
anova_result = stats.f_oneway(linear_scores, ffn_scores, combined_scores)
print(f"ANOVA result: F-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}")

# If ANOVA shows significant differences, conduct post-hoc testing with Tukey's HSD
if anova_result.pvalue < 0.05:
    tukey = pairwise_tukeyhsd(endog=scores, groups=labels, alpha=0.05)
    print(tukey)


ANOVA result: F-statistic = 4.08609733111498, p-value = 0.02816553445288122
 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2 meandiff p-adj   lower   upper  reject
------------------------------------------------------
Combined    FFN   -0.028 0.0803 -0.0587  0.0028  False
Combined Linear  -0.0329 0.0344 -0.0637 -0.0021   True
     FFN Linear  -0.0049 0.9169 -0.0357  0.0258  False
------------------------------------------------------


## ARCHIVED ##

In [15]:
# data = data_train.to_numpy()
# X_train = data[:,:-2]
# y_train = data[:,-2:]

In [16]:
# data_test = data_val.to_numpy()
# X_val = data_test[:,:-2]
# y_val = data_test[:,-2:]

In [17]:
# X_train.shape

In [18]:
# autoE = AutoEncoder(config,n_input=X_train.shape[1],n_encoder=5)

# autoE.train(X_train, max_epochs=20000, lr=0.0001)

In [19]:
# combined = CombinedModel(autoE,n_hidden_2=50,n_output=y_val.shape[1])

# loss, val_metrics, final_loss = combined.train(X_train,y_train,X_val,y_val,epochs=10000,lr=0.00001)

In [20]:
# np.min(loss)

In [21]:
# import matplotlib.pyplot as plt

# plt.plot(loss)
# plt.plot(val_metrics)

In [22]:
# ffn = FeedForwardNetwork(config,n_input=X_train.shape[1],n_hidden_1=24,n_hidden_2=24,n_output=y_train.shape[1])

# loss, val_metrics, final_mse = ffn.train(X_train,y_train,X_val,y_val,5000,0.00001)

In [23]:
# plt.plot(loss)
# plt.plot(val_metrics)

In [24]:
# linear = LinearNetwork(config)

# losses, val_losses = linear.logistic_regression(X_train,y_train,X_val,y_val,epochs=1000,lr=0.001)

In [25]:
# plt.plot(losses)
# plt.plot(val_losses)

## Tuning ##