<h1> Import Libraries </h1>
We import all the necessary libraries, including Optuna, PyTorch, and other utilities.

In [13]:
# !conda install anaconda::mysql-python  

In [14]:
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np
import time
from tqdm import tqdm
from prettytable import PrettyTable

import mysql.connector

# Ensure reproducibility
torch.manual_seed(0)
np.random.seed(0)

<h1> Read Data </h1>
We read the data saved in `data_processing.ipynb`.

In [15]:
# Locations of the arrays saved by data_processing.ipynb.
# The cartesian stick-input encoding is used here; polar / non-1024 variants
# were previously loaded from '../data/training_inputs_polar_numpy_binary.npy',
# '../data/training_inputs_cart_numpy_binary.npy' and
# '../data/labes_is_sheik_numpy_binary.npy'.
inputs_path = '../../data/training_inputs_cart_numpy_binary_1024.npy'
labels_path = '../../data/labes_is_sheik_numpy_binary_1024.npy'

X = np.load(inputs_path)  # Stick input as cartesian coordinates.
y = np.load(labels_path)  # Labels.

# Print the shapes to make sure we have what we want.
for loaded_array in (X, y):
    print(loaded_array.shape)

(42768, 9, 1024)
(42768,)


<h1> Data Splitting </h1>

In [16]:
# Two-stage stratified split with a shared seed:
#   1) carve off 20% of everything as the holdout set,
#   2) take 25% of the remainder as validation (0.25 * 0.8 = 0.2 of the total).
split_seed = 42

X_train_val, X_holdout, y_train_val, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=split_seed,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=split_seed,
)

# Report the size of each label split: holdout, train, validation.
for split_labels in (y_holdout, y_train, y_val):
    print(split_labels.shape)

(8554,)
(25660,)
(8554,)


<h1> Data Loader </h1>

In [17]:
def _as_float_dataset(features, labels):
    """Wrap a pair of NumPy arrays as a TensorDataset of float32 tensors."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )


# Dataset objects for each split
train_dataset = _as_float_dataset(X_train, y_train)
val_dataset = _as_float_dataset(X_val, y_val)
holdout_dataset = _as_float_dataset(X_holdout, y_holdout)

# Loader settings
num_batches = 32  # Can be tuned
num_workers = 1  # Can be tuned

# NOTE(review): the batch size is derived from the size of the *full* dataset X,
# not from the training split, so an epoch over train_loader has fewer than
# num_batches batches.
batch_size = X.shape[0] // num_batches  # Can be tuned
print(batch_size)

# Only the training loader shuffles; validation and holdout keep a fixed order.
shared_loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_kwargs)
holdout_loader = DataLoader(holdout_dataset, shuffle=False, **shared_loader_kwargs)

1336


In [18]:
def calculate_accuracy(outputs, labels):
    """Return the fraction of samples whose thresholded prediction matches its label.

    Args:
        outputs: Raw model logits; they are passed through a sigmoid and
            thresholded at 0.5 to obtain 0/1 predictions.
        labels: Ground-truth labels, compared after casting to integers.

    Returns:
        A scalar tensor: number of correct predictions divided by
        ``labels.shape[0]``.
    """
    probabilities = torch.sigmoid(outputs)
    predicted = (probabilities >= 0.5).squeeze().long()
    is_correct = predicted == labels.long()
    return is_correct.float().sum() / labels.shape[0]

# class myLoss(torch.nn.Module):

#     def __init__(self, pos_weight=1):
#       super().__init__()
#       self.pos_weight = pos_weight

#     def forward(self, input, target):
#       epsilon = 10 ** -44
#       input = input.sigmoid().clamp(epsilon, 1 - epsilon)

#       my_bce_loss = -1 * (self.pos_weight * target * torch.log(input)
#                           + (1 - target) * torch.log(1 - input))
#       add_loss = (target - 0.5) ** 2 * 4
#       mean_loss = (my_bce_loss * add_loss).mean()
#       return mean_loss

<h1> Define the Customizable Network </h1>
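We define the neural network architecture. Optuna suggests the hyperparameters: the number of convolutional layers and, for each one, its channel count, kernel size, dilation, stride (length reduction), and whether to apply batch normalization and a ReLU activation; the size of the final max-pooling layer; and the widths of the linear layers.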

In [19]:
class CustomNet(nn.Module):
    """1-D CNN binary classifier whose architecture is sampled from an Optuna trial.

    The network is a stack of 3-5 ``Conv1d`` blocks (each optionally followed by
    batch normalization and ReLU), one final max-pooling layer and three linear
    layers with fixed-rate dropout. The last layer emits a single logit per
    sample, intended for ``BCEWithLogitsLoss``.

    The constructor assumes an input of 9 channels and length 2 ** 10; every
    layer keeps the sequence length a power of two so the input size of the
    first linear layer can be computed exactly.

    Args:
        trial: An object exposing Optuna's ``suggest_int`` and
            ``suggest_categorical`` methods.
    """

    def __init__(self, trial):
        super().__init__()

        # Fixed dropout rate (not tuned by Optuna)
        dropout_rate = 0.5

        # Per-block components. "Disabled" slots hold nn.Identity rather than None
        # so that every entry is a real, registered module and forward() needs no
        # truthiness checks on modules.
        self.conv_layers = nn.ModuleList()
        self.bns = nn.ModuleList()
        self.poolings = nn.ModuleList()
        self.activations = []  # plain bools: apply ReLU after block i?

        num_layers = trial.suggest_int("num_conv_layers", 3, 5)
        in_channels = 9  # Fixed input channel size

        ######################################################################################################
        # Length bookkeeping
        #   * The input length is 2 ** 10.
        #   * Each conv block divides the length by 2 ** out_length_reduction_exponent (0, 1 or 2).
        #   * A kernel spans dilation * (kernel_size - 1) + 1 samples; with kernel_size = 2k + 1 that is
        #     2 * dilation * k + 1 (at most 41 for k = 5, dilation = 4). Padding grows with dilation * k,
        #     so the padded input is always long enough for the kernel.
        #   * The conv blocks together may reduce the length by at most 2 ** 5, i.e. never below 32;
        #     length_reduction_exponent_remaining tracks how much of that budget is left.
        ######################################################################################################
        length_reduction_exponent_remaining = 5
        in_length_exponent = 10
        for i in range(num_layers):
            if i == 0:
                # First layer is grouped per input channel, so out_channels must be a multiple of 9.
                out_channels = trial.suggest_int(f"conv_{i}_out_channels", 9, 9 * 64, step=9)  # 9 * 64 = 576
                groups = 9
            else:
                out_channels = trial.suggest_int(f"conv_{i}_out_channels", 512, 512)
                groups = 1

            k = trial.suggest_int(f"conv_{i}_kernel_size_power", 1, 5)  # upper bound can safely be changed
            kernel_size = 2 * k + 1
            dilation = trial.suggest_int(f"conv_{i}_dilation", 1, 4)
            out_length_reduction_exponent = trial.suggest_int(
                f"conv_{i}_out_length_reduction_exponent",
                0,
                min(2, length_reduction_exponent_remaining),
            )
            # The whole reduction is currently done by the convolution stride.
            conv_stride_length_exponent = out_length_reduction_exponent

            # Keep track of how much reducing we can still do
            length_reduction_exponent_remaining -= out_length_reduction_exponent
            in_length_exponent -= out_length_reduction_exponent

            stride = 2 ** conv_stride_length_exponent
            # Conv1d output length is floor((L + 2 * padding - 2 * dilation * k - 1) / stride) + 1.
            # Both padding choices below give exactly L / stride for the strides used here (1, 2, 4).
            # The stride-4 special case is kept unchanged so existing trials rebuild the same network.
            # Reduction exponents above 2 would need their own case analysis.
            if conv_stride_length_exponent == 2 and (dilation * k) % 2 == 1:
                padding = dilation * k - 1
            else:
                padding = dilation * k

            self.conv_layers.append(
                nn.Conv1d(
                    in_channels,
                    out_channels,
                    kernel_size,
                    stride=stride,
                    padding=padding,
                    dilation=dilation,
                    groups=groups,
                )
            )
            in_channels = out_channels  # Update in_channels for the next layer

            # Pooling only takes over the part of the reduction not done by the stride.
            # With the stride doing all of it (see above) this branch is currently never taken.
            if conv_stride_length_exponent < out_length_reduction_exponent:
                pooling_type = trial.suggest_int(f"layer_{i}_pooling_type", 0, 1)  # 1: max, 0: avg
                pool_kernel_size = 2 ** (out_length_reduction_exponent - conv_stride_length_exponent)
                if pooling_type == 1:
                    self.poolings.append(nn.MaxPool1d(pool_kernel_size))
                else:
                    self.poolings.append(nn.AvgPool1d(pool_kernel_size))
            else:
                self.poolings.append(nn.Identity())  # No pooling in current layer

            # Optional batch normalization
            use_bn = trial.suggest_categorical(f"conv_{i}_bn", [True, False])
            self.bns.append(nn.BatchNorm1d(out_channels) if use_bn else nn.Identity())

            # Optional ReLU activation
            use_activation = trial.suggest_categorical(f"conv_{i}_activation", [True, False])
            self.activations.append(use_activation)

        # Final max pooling layer.
        # Its kernel is a power of two between "whatever reduction budget is left" and the full
        # remaining length, so the output length is 2 ** in_length_exponent, one of 1, 2, 4, 8, 16, 32.
        kernel_exponent = trial.suggest_int(
            "maxpool_kernel_exponent",
            length_reduction_exponent_remaining,
            in_length_exponent,
        )
        in_length_exponent -= kernel_exponent
        self.pool1 = nn.MaxPool1d(kernel_size=2 ** kernel_exponent)

        # The length is now exactly 2 ** in_length_exponent, so the first linear layer can be sized exactly.
        self.fc1 = nn.Linear(out_channels * 2 ** in_length_exponent, trial.suggest_int("fc1_out_features", 32, 256))
        self.fc1_dropout = nn.Dropout(dropout_rate)  # Dropout after fc1
        self.fc2 = nn.Linear(self.fc1.out_features, trial.suggest_int("fc2_out_features", 32, 128))
        self.fc2_dropout = nn.Dropout(dropout_rate)  # Dropout after fc2
        self.fc3 = nn.Linear(self.fc2.out_features, 1)  # Output layer with 1 unit for binary classification

    def forward(self, x):
        """Map a batch of inputs to one raw logit per sample (no sigmoid applied)."""
        # Each block: convolution -> (batch norm) -> (pooling) -> (ReLU)
        blocks = zip(self.conv_layers, self.bns, self.poolings, self.activations)
        for conv_layer, batch_norm, pooling, use_activation in blocks:
            x = pooling(batch_norm(conv_layer(x)))
            if use_activation:
                x = F.relu(x)

        x = self.pool1(x)

        # Flatten for fully connected layers
        x = torch.flatten(x, 1)
        x = self.fc1_dropout(F.relu(self.fc1(x)))
        x = self.fc2_dropout(F.relu(self.fc2(x)))
        return self.fc3(x)  # Output without activation for BCEWithLogitsLoss


<h1> Define the Objective Function </h1>
We define the objective function for Optuna, which involves training and validating the model with the suggested hyperparameters to minimize the validation loss.

In [20]:
def objective(trial):
    """Optuna objective: train a ``CustomNet`` sampled from ``trial`` and score it.

    The model is trained with Adam and ``BCEWithLogitsLoss`` on ``train_loader``
    and evaluated on ``val_loader`` after every epoch. Training stops early when
    the validation loss has not improved by more than ``min_delta`` for
    ``patience`` consecutive epochs, or when the train/validation loss gap has
    been at least ``min_overfit`` for ``patience`` consecutive epochs.

    The holdout loss and accuracy are printed for information only; they are
    not part of the returned value and must not be used to pick hyperparameters.

    Args:
        trial: The Optuna trial used to sample the architecture.

    Returns:
        The best validation loss recorded during training (a value only
        replaces the previous best when it is lower by more than ``min_delta``).
    """
    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize the model with hyperparameters suggested by Optuna
    model = CustomNet(trial).to(device)

    # Define the optimizer and criterion
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()

    def run_epoch(dataloader, training):
        """Run one pass over ``dataloader``; update weights only when ``training`` is True.

        Returns the sample-weighted mean loss and accuracy as Python floats.
        """
        model.train(training)
        running_loss = 0.0
        running_accuracy = 0.0
        with torch.set_grad_enabled(training):
            for inputs, labels in dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
                if training:
                    optimizer.zero_grad()
                outputs = model(inputs)
                # The criterion expects raw logits (it applies the sigmoid itself), so the
                # same call is used for training, validation and holdout. Only the trailing
                # unit dimension is squeezed so a batch of size 1 keeps shape (1,).
                loss = criterion(outputs.squeeze(-1), labels)
                if training:
                    loss.backward()
                    optimizer.step()

                num_samples = inputs.size(0)
                running_loss += loss.item() * num_samples
                running_accuracy += calculate_accuracy(outputs, labels).item() * num_samples

        dataset_size = len(dataloader.dataset)
        return running_loss / dataset_size, running_accuracy / dataset_size

    # Training loop settings
    patience = 10
    best_val_loss = float('inf')
    epochs_no_improve = 0
    epochs_overfit = 0
    epochs = 250
    min_delta = 0.001
    min_overfit = .3

    # The context manager guarantees the progress bar is closed exactly once,
    # whether training runs to completion, stops early or raises.
    with tqdm(total=epochs, desc="Epochs", position=0, leave=True) as pbar:
        for epoch in range(epochs):
            train_loss, train_accuracy = run_epoch(train_loader, training=True)
            val_loss, val_accuracy = run_epoch(val_loader, training=False)

            # Count epochs without a meaningful validation improvement
            if (val_loss + min_delta) < best_val_loss:
                best_val_loss = val_loss
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            # Count consecutive epochs with a large train/validation loss gap
            if abs(train_loss - val_loss) < min_overfit:
                epochs_overfit = 0
            else:
                epochs_overfit += 1

            # Update progress bar
            pbar.set_postfix_str(f"Training Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")
            pbar.update(1)  # Move the progress bar by one epoch

            # Check early stopping condition
            if epochs_no_improve >= patience or epochs_overfit >= patience:
                break

    # Evaluate the final model on the holdout set (reporting only)
    holdout_loss, holdout_accuracy = run_epoch(holdout_loader, training=False)
    print(f'Holdout Loss: {holdout_loss:.4f}, Accuracy: {holdout_accuracy:.4f}')

    return best_val_loss


<h1> Define Callback Function </h1>
We define a callback function that Optuna calls after each trial. It checks whether the trial that just finished is the best one in the study so far and, if so, reports its number and objective value.

In [21]:
def save_params_if_best(study, trial):
    """Optuna callback: announce when the trial that just finished is the best so far.

    Despite its name, this only prints the trial number and value.

    Args:
        study: The running study; its ``best_trial`` is compared with ``trial``.
        trial: The trial that has just finished.
    """
    try:
        best_trial = study.best_trial
    except ValueError:
        # Optuna raises ValueError while the study has no completed trial
        # (e.g. every trial so far failed or was pruned). There is nothing to
        # report, and raising from a callback would abort the optimization.
        return

    if best_trial.number == trial.number:
        print(f"New best trial at trial {trial.number}: {trial.value}")


<h1> Run the Optimization </h1>
We create an Optuna study whose trials are stored in a MySQL database, and then run the optimization in a separate cell.

In [22]:
import os
from datetime import datetime
from getpass import getpass
from urllib.parse import quote_plus

import pymysql  # driver named in the "mysql+pymysql" storage URL below

# Get the current date and time; it prefixes the study name
current_datetime = datetime.now()
current_datetime_string = current_datetime.strftime("%Y-%m-%d %H:%M:%S ")

# Database connection settings come from the environment so that no credentials
# are stored in the notebook. The password is prompted for when it is not set.
db_user = os.environ.get("OPTUNA_DB_USER", "root")
db_password = os.environ.get("OPTUNA_DB_PASSWORD") or getpass("MySQL password for the Optuna storage: ")
db_host = os.environ.get("OPTUNA_DB_HOST", "localhost")
db_port = os.environ.get("OPTUNA_DB_PORT", "3306")
db_name = os.environ.get("OPTUNA_DB_NAME", "optuna_trials")

# User and password are URL-quoted so special characters cannot break the URL.
storage_url = (
    f"mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)

study = optuna.create_study(study_name = current_datetime_string + "Classical CNN",
                            direction="minimize",
                            storage = storage_url)

# To resume an existing study instead of creating a new one:
# study = optuna.create_study(study_name="your_study_name", storage=storage_url, load_if_exists=True)

# To browse the results, run `optuna-dashboard <storage URL>` from a shell.


[I 2024-02-19 09:43:53,067] A new study created in RDB with name: 2024-02-19 09:43:53 Classical CNN


In [23]:
# Run the search: up to 1000 trials, with a 10-hour timeout.
study.optimize(
    objective,
    n_trials=1000,
    timeout=3600*10,
    callbacks=[save_params_if_best],
    show_progress_bar=True,
)

# Print the overall best hyperparameters
print("Best trial overall:")
trial = study.best_trial
report_lines = [f"  Value: {trial.value}", "  Params: "]
report_lines += [f"    {key}: {value}" for key, value in trial.params.items()]
print("\n".join(report_lines))


  0%|          | 0/1000 [00:00<?, ?it/s]

Epochs:   3%|▎         | 7/250 [07:44<42:40, 10.54s/it, Training Loss: 0.6621, Accuracy: 0.6182, Validation Loss: 0.7215, Accuracy: 0.6002]

[W 2024-02-19 09:43:53,202] Trial 0 failed with parameters: {'num_conv_layers': 5} because of the following error: TypeError("Trial.suggest_int() missing 1 required positional argument: 'high'").
Traceback (most recent call last):
  File "c:\Users\jaspa\.conda\envs\pytorch\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\grant\AppData\Local\Temp\ipykernel_35800\1826214111.py", line 6, in objective
    model = CustomNet(trial).to(device)
            ^^^^^^^^^^^^^^^^
  File "C:\Users\grant\AppData\Local\Temp\ipykernel_35800\1479774075.py", line 37, in __init__
    out_channels = trial.suggest_int(f"conv_{i}_out_channels", 9 * 64, step=9)   # 9 * 64 = 576
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jaspa\.conda\envs\pytorch\Lib\site-packages\optuna\_convert_positional_args.py", line 83, in converter_wrapper
    return func(**kwargs)


TypeError: Trial.suggest_int() missing 1 required positional argument: 'high'

In [None]:
def count_parameters(model):
    """Print a table of trainable parameter counts per tensor and return the total.

    Parameters with ``requires_grad`` set to False are left out of both the
    table and the total.
    """
    table = PrettyTable(['Modules', 'Parameters'])
    trainable_counts = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]
    for name, params in trainable_counts:
        table.add_row([name, params])
    total_params = sum(params for _, params in trainable_counts)
    print(table)
    print(f'Total Trainable Params: {total_params}')
    return total_params

# Rebuild the best architecture from the study and report its size
count_parameters(CustomNet(study.best_trial))
