In [10]:
import argparse


parser = argparse.ArgumentParser()

parser.add_argument(
    "--dataset",
    type=str,
    default="training_sequences_noC.csv",
    help="dataset file for training, a csv file",
)
parser.add_argument(
    "--random_seed", type=int, default=123, help="rando seed for reproducibility"
)
parser.add_argument(
    "--batch_size", type=int, default=64, help="the batch size to update network"
)
parser.add_argument(
    "--block_size",
    type=int,
    default=40,
    help="portion of characters used for training batch",
)
parser.add_argument(
    "--vocab_size",
    type=int,
    default=20,
    help="number of unique characters or words used used as input features",
)
parser.add_argument(
    "--embed_dim",
    type=int,
    default=100,
    help="number of embedding demensions aka dimension vectors to represent entire vocab as input features",
)
parser.add_argument(
    "--hidden_size",
    type=int,
    default=200,
    help="number of neurons to be projects from the input feature size",
)
parser.add_argument(
    "--num_layers", type=int, default=2, help="number of hidden layers"
)
parser.add_argument(
    "--output_size",
    type=int,
    default=20,
    help="number of unique characters or words as output features",
)
parser.add_argument(
    "--dropout_rate",
    type=float,
    default=0.5,
    help="probability of an element to be zeroed in dropout layer",
)
parser.add_argument(
    "--l2_reg",
    type=float,
    default=0.03,
    help="l2 regularization coefficient",
)
parser.add_argument(
    "--gamma",
    type=float,
    default=0.59,
    help="gamma for learning rate scheduler",
)
parser.add_argument(
    "--momentum",
    type=float,
    default=0.5,
    help="momentum for optimizer learning rate used to update weights",
)
parser.add_argument(
    "--alpha",
    type=float,
    default=0.79,
    help="alpha for optimizer learning rate used to update weights",
)
parser.add_argument(
    "--epsilon",
    type=float,
    default=1e-08,
    help="epsilon for optimizer learning rate used to update weights",
)
parser.add_argument(
    "--gradient_accumulation_steps",
    type=int,
    default=8,
    help="number of steps to accumulate gradients before performing a backward/update pass",
)
parser.add_argument(
    "--betas",
    type=float,
    default=(0.9, 0.999),
    help="beta1 for optimizer learning rate used to update weights",
)
parser.add_argument(
    "--step_size",
    type=float,
    default=50,
    help="step size for learning rate scheduler to decay learning rate",
)
parser.add_argument(
    "--weight_decay",
    type=float,
    default=0.001,
    help="weight decay for optimizer learning rate used to update weights",
)

parser.add_argument(
    "--num_epochs",
    type=int,
    default=50,
    help="number of epochs. Number of full passes through the training examples.",
)
parser.add_argument(
    "--learning_rate",
    type=float,
    default=0.01,
    help="learning rate for model training",
)

parser.add_argument(
    "--k_folds",
    type=int,
    default=2,
    help="cross validation folds for training",
)
parser.add_argument(
    "--loss_plot_name",
    type=str,
    default="loss.png",
    help="loss plot name to save the plot",
)
parser.add_argument(
    "--accuracy_plot_name",
    type=str,
    default="accuracy.png",
    help="accuracy plot name to save the plot",
)
parser.add_argument(
    "--sampled_text_name",
    type=str,
    default="sampled_text.txt",
    help="sampled text name to save the text",
)
parser.add_argument(
    "--mode",
    choices=["pretrain", "cross_validate", "finetune", "sample"],
    default="pretrain",
    help="Mode: pretrain, cross-validate, finetune, sample",
)
parser.add_argument(
    "--session_name",
    choices=["pretrain", "crossval", "finetune", "sample"],
    default="pretrain",
    help="Mode: pretrain, cross-validate, finetune, sample",
)
parser.add_argument(
    "--model_name",
    type=str,
    default="model",
    help="model name to save the model",
)
parser.add_argument(
    "--start_char",
    type=str,
    default="B",
    help="start character to begin sampling",
)
parser.add_argument(
    "--sample_len",
    type=int,
    default=100,
    help="number of sequences to sample training",
)
parser.add_argument(
    "--temp",
    type=float,
    default=1.0,
    help="temperature used to sample text",
)
parser.add_argument(
    "--checkpoint",
    type=str,
    # default=None,
    help="filename of the pretrained model to used for sampling if train=False",
)

args = parser.parse_args("")

In [11]:
import os
import random

import numpy as np
import torch

torch.manual_seed(args.random_seed)
torch.backends.cudnn.deterministic = True


def reset_random_seeds(seed):
    """
    reset random seeds for reproducibility
    :param seed: {int} random seed
    Returns:
        -None
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


reset_random_seeds(args.random_seed)


def load_text(file):
    """
    load text from a file
    :param file: {str} file of the text file to be read
    Returns:
        -text : {str} text
    """

    with open(file, "r", encoding="utf8") as f:
        text = f.read()
        # uppercase all text
        text = text.upper()
    return text


ROOT_DIR = "../data/training_sequences_noC.csv"
text = load_text(ROOT_DIR)

chars = sorted(list(set(text)))
vocab_size = len(chars)
vocab = "".join(chars)


def string_to_int(text):
    """
    encode a given text/char into integers (pytorch tensors)
    :param text: {str} text to be encoded
    Returns:
        -tensor: {torch.tensor} encoded text
    """
    chars = sorted(list(set(text)))
    stoi = {cha: i for i, cha in enumerate(chars)}
    encode = [stoi[cha] for cha in text]
    tensor = torch.tensor(encode).long()
    return tensor


split_idx = int(0.8 * len(text))
train_data = text[:split_idx]
val_data = text[split_idx:]


def text_chunks(text):
    """
    create text chunks consisting of [text length] number of character
    each. They will then be used to construct input and targert text, both
    with [text length] number of elements.
    param: text: {str} text to be chunked
    Returns:
        -text_chunks: {list} list of text length
    """
    block_size = args.block_size + 1
    encoded_text = string_to_int(text)
    text_chunks = [
        encoded_text[i : i + block_size]
        for i in range(len(encoded_text) - block_size + 1)
    ]
    random.shuffle(text_chunks)
    return text_chunks


train_chunks = text_chunks(train_data)
val_chunks = text_chunks(val_data)


class TextDataset(torch.utils.data.Dataset):
    """
    create a dataset of text
    Attributes: chunks: {list} list of text chunks
    Returns:
        -inpt: {torch.tensor} input text
        -target: {torch.tensor} target text
    """

    def __init__(self, chunks):
        self.chunks = chunks

    def __len__(self):
        return len(self.chunks)

    def __getitem__(self, idx):
        chunks = self.chunks[idx]
        inpt = chunks[:-1].long()
        target = chunks[1:].long()
        return inpt, target


train_dataset = TextDataset(train_chunks)
val_dataset = TextDataset(val_chunks)

train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True
)

val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True
)

In [12]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def train(
    model,
    train_dl,
    optimizer,
    criterion,
    scheduler,
    weight_decay,
    batch_size,
    block_size,
):
    """
    train the model
    :param model: {torch.nn.Module} model to be trained
    :param train_dl: {torch.utils.data.DataLoader} training data loader
    :param optimizer: {torch.optim} optimizer
    :param criterion: {torch.nn} loss function
    :param scheduler: {torch.optim.lr_scheduler} learning rate scheduler
    :param weight_decay: {float} weight decay
    :param batch_size: {int} batch size
    :param block_size: {int} block size
    Returns:
        -train_loss: {float} training loss
        -train_acc: {float} training accuracy
    """

    model.train()
    train_running_loss = 0.0
    train_running_acc = 0

    text_batch, target_batch = next(iter(train_dl))
    # for text_batch, target_batch in train_dl:
    text_batch.to(device)
    target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    # forward pass
    hidden, cell = model.init_hidden(batch_size)
    for c in range(block_size):
        pred, hidden, cell = model(text_batch[:, c], hidden, cell)
        loss += criterion(pred, target_batch[:, c])

        # L2 regularization
        l2_loss = 0.0
        for param in model.parameters():
            l2_loss += torch.norm(param, p=2)
        loss += weight_decay * l2_loss

    # backward pass
    optimizer.zero_grad()
    loss.backward()
    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
    # update parameters
    optimizer.step()
    loss = loss.item() / block_size
    train_running_loss += loss
    # accuracy
    pred = torch.argmax(pred, dim=1)
    target = target_batch[:, c]
    acc = (pred == target).sum().item() / len(target)
    train_running_acc += acc
    scheduler.step()

    return train_running_loss, train_running_acc


def validate(model, val_dl, optimizer, criterion, batch_size, block_size):
    """
    validate the model
    :param model: {torch.nn.Module} model to be trained
    :param val_dl: {torch.utils.data.DataLoader} validation data loader
    :param optimizer: {torch.optim} optimizer
    :param criterion: {torch.nn} loss function
    :param batch_size: {int} batch size
    :param block_size: {int} block size
    Returns:
        -val_loss: {float} validation loss
        -val_acc: {float} validation accuracy
    """

    model.eval()
    with torch.no_grad():
        val_running_loss = 0.0
        val_running_acc = 0

        text_batch, target_batch = next(iter(val_dl))
        # for text_batch, target_batch in val_dl:
        text_batch.to(device)
        target_batch.to(device)
        optimizer.zero_grad()
        loss = 0
        # forward pass
        hidden, cell = model.init_hidden(batch_size)
        for c in range(block_size):
            pred, hidden, cell = model(text_batch[:, c], hidden, cell)
            loss += criterion(pred, target_batch[:, c])
        loss = loss.item() / block_size
        val_running_loss += loss
        # accuracy
        pred = torch.argmax(pred, dim=1)
        target = target_batch[:, c]
        acc = (pred == target).sum().item() / len(target)
        val_running_acc += acc

    return val_running_loss, val_running_acc

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class LSTM(nn.Module):
    """
    LSTM model
    """

    def __init__(
        self,
        vocab_size,
        embed_dim,
        hidden_size,
        num_layers,
        output_size,
        dropout_rate,
    ):
        """
        Initialize the model
        :param vocab_size {int}: size of vocabulary
        :param embed_dim {int}: embedding dimension
        :param hidden_size {int}: hidden size
        :param num_layers {int}: number of layers
        :param output_size {int}: output size
        :param dropout_rate {float}: dropout rate
        """

        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc1 = nn.Linear(hidden_size, output_size)

    def forward(self, character, hidden, cell):
        """
        Forward pass
        :param character: input character
        :param hidden: hidden state
        :param cell: cell state
        Returns:
            output (torch.Tensor): output
            hidden (torch.Tensor): hidden state
            cell (torch.Tensor): cell state
        """

        output = self.embedding(character).unsqueeze(
            1
        )  # reshape to batch_size * 1 * embed_dim
        output, (hidden, cell) = self.lstm(output, (hidden, cell))
        output = self.dropout(output)  # applying dropout to the output
        output = F.relu(output)
        output = self.fc1(output).reshape(
            output.size(0), -1
        )  # reshape to batch_size * output_size

        return output, hidden, cell

    def init_hidden(self, batch_size):
        """
        Initialize hidden state
        :param batch_size: batch size
        Returns:
            hidden (torch.Tensor): hidden state
            cell (torch.Tensor): cell state
        """
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        cell = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        return hidden, cell

In [None]:
import itertools
import os
import sys
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from tqdm.auto import tqdm


def main():
    reset_random_seeds(args.random_seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print("\nDATA STATISTICS\n")
    print(f"Total samples: {len(text.split()):,}")
    print(f"Mean sample length: {np.mean([len(s) for s in text.split()]):.2f}")
    print(f"Standard deviation: {np.std([len(s) for s in text.split()]):.2f}")
    print(f"Max sample length: {np.max([len(s) for s in text.split()]):,}")
    print(f"Min sample length: {np.min([len(s) for s in text.split()]):,}")
    print(f"Vocabulary size: {len(chars):,}")
    print(f"Vocabulary: {vocab}")
    print("=" * 80)

    model = LSTM(
        vocab_size=args.vocab_size,
        embed_dim=args.embed_dim,
        hidden_size=args.hidden_size,
        num_layers=args.num_layers,
        output_size=args.vocab_size,
        dropout_rate=args.dropout_rate,
    )
    model.to(device)

    optimizer = optim.RMSprop(
        model.parameters(),
        lr=args.learning_rate,
        weight_decay=args.weight_decay,
        alpha=args.alpha,
        eps=args.epsilon,
        momentum=args.momentum,
        centered=False,
    )
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=args.step_size, gamma=args.gamma
    )

    print("\nTRAINING INFO\n")
    print(f"Computing on: {device} device")
    print(f"Model architecture: {model}")
    print(f"Total samples: {len(text):,}")
    print(f"Total training samples: {len(train_data):,}")
    print(f"Total validation samples: {len(val_data):,}")
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")
    total_trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad
    )
    print(f"Training parameters: {total_trainable_params:,}")
    print("=" * 80)

    print("\nCROSS VALIDATION")

    train_dl = train_dataloader
    train_loss, train_acc = [], []
    val_loss, val_acc = [], []

    k = args.k_folds
    kf = KFold(n_splits=k, shuffle=True)

    start_time = time.time()

    hidden = [100, 200]
    layers = [1, 2]
    dropout = [0.1, 0.2]

    hyperparam_combinations = list(
        itertools.product(layers, hidden, dropout)
    )
    total_combinations = len(hyperparam_combinations)
    total_fits = total_combinations * k

    print("\nHYPERPARAMETER SEARCH\n")
    print(f"Total folds: {k}")
    print(f"Total hyperparameter combinations: {total_combinations}")
    print(f"Total fits: {total_fits}")
    print("=" * 80)

    all_best_loss = np.inf
    all_best_epoch = 0
    all_best_hyperparams = {}

    for fold, (_, _) in enumerate(kf.split(train_dl.dataset)):
        best_hyperparams = {}
        fold_best_loss = np.inf
        fold_best_epoch = 0
        for combination in hyperparam_combinations:
            l, h, d = combination
            print(f"\nFold {fold + 1} of {k}.")
            print(
                f"Candidate {hyperparam_combinations.index(combination) + 1} of {total_combinations}."
            )
            print("Hyperparameters:")
            print(f"Layers: {l}, hidden: {h}, Dropout: {d}")

            model = LSTM(
                vocab_size=args.vocab_size,
                embed_dim=args.embed_dim,
                hidden_size=args.hidden_size,
                num_layers=l,
                output_size=args.vocab_size,
                dropout_rate=d,
            )

            model = model
            model = model.to(device)
            optimizer = optim.RMSprop(
                model.parameters(),
                lr=args.learning_rate,
                weight_decay=args.weight_decay,
                alpha=args.alpha,
                eps=args.epsilon,
                momentum=args.momentum,
                centered=False,
            )

            criterion = nn.CrossEntropyLoss()
            scheduler = optim.lr_scheduler.StepLR(
                optimizer, step_size=args.step_size, gamma=args.gamma
            )

            best_loss = np.inf
            best_epoch = 0
            # for step, (_, _) in enumerate(train_dataloader):
            for epoch in tqdm(
                range(args.num_epochs),
                desc="Epochs",
                unit="epoch",
                leave=True,
                position=0,
            ):
                train_epoch_loss, train_epoch_acc = train(
                    model=model,
                    train_dl=train_dataloader,
                    optimizer=optimizer,
                    criterion=criterion,
                    scheduler=scheduler,
                    weight_decay=args.weight_decay,
                    batch_size=args.batch_size,
                    block_size=args.block_size,
                )

                val_epoch_loss, val_epoch_acc = validate(
                    model=model,
                    val_dl=val_dataloader,
                    optimizer=optimizer,
                    criterion=criterion,
                    batch_size=args.batch_size,
                    block_size=args.block_size,
                )

                train_loss.append(train_epoch_loss)
                train_acc.append(train_epoch_acc)
                val_loss.append(val_epoch_loss)
                val_acc.append(val_epoch_acc)

                # print(
                #     f"Epoch: {epoch:03d} | "
                #     f"Val Acc : {val_epoch_acc:.3f} | "
                #     f"Val Loss: {val_epoch_loss:.3f}"
                # )

                # get the best hyperparameters
                if val_epoch_loss < best_loss:
                    best_loss = val_epoch_loss
                    best_epoch = epoch

            # save fold best loss
            if best_loss < fold_best_loss:
                fold_best_loss = best_loss
                fold_best_epoch = best_epoch
                best_hyperparams = {
                    "layers": l,
                    "hidden": h,
                    "dropout": d,
                }

            # Print the best loss
            print(f"Best Loss: {best_loss:.3f} at epoch: {best_epoch:.3f}")
            print(f"Time elapsed: {(time.time() - start_time) / 60:.2f} min")
            print("=" * 80)

        # save all best loss
        if fold_best_loss < all_best_loss:
            all_best_loss = fold_best_loss
            all_best_epoch = fold_best_epoch
            all_best_hyperparams = best_hyperparams

        # Print the best hyperparameters per epoch
        print("=" * 80)
        print(f"\nBest Fold {fold + 1} Hyperparameters:")
        print(best_hyperparams)
        print(f"Best Loss: {fold_best_loss:.3f} at epoch: {fold_best_epoch:.3f}")
        print("=" * 80)
        print("=" * 80)

    # Print the best hyperparameters for all folds
    print("=" * 80)
    # overall best hyperparameters
    print(f"\nBest Hyperparameters for {k} folds and {total_fits} fits :")
    print(all_best_hyperparams)
    print(f"Best Loss: {all_best_loss:.3f} at epoch: {all_best_epoch:.3f}")
    print(f"Time elapsed: {(time.time() - start_time)/60:.2f} min")
    print("=" * 80)
    print("=" * 80)
    print("=" * 80)

    # Save the best hyperparameters to a file
    with open("../reports/best_hyperparams.txt", "w") as f:
        f.write(f"\nBest Hyperparameters for {k} folds and {total_fits}fits:\n")
        f.write(str(all_best_hyperparams))
        f.write(f"\nBest Loss: {all_best_loss:.3f}")
        f.write(f"\nBest Epoch: {all_best_epoch:.3f}")
        f.write(f"\nTime elapsed: {(time.time() - start_time)/60:.2f} min")



if __name__ == "__main__":
    main()


DATA STATISTICS

Total samples: 1,554
Mean sample length: 20.82
Standard deviation: 7.72
Max sample length: 48
Min sample length: 7
Vocabulary size: 20
Vocabulary: 
ADEFGHIKLMNPQRSTVWY

TRAINING INFO

Computing on: cpu device
Model architecture: LSTM(
  (embedding): Embedding(20, 100)
  (lstm): LSTM(100, 200, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc1): Linear(in_features=200, out_features=20, bias=True)
)
Total samples: 33,913
Total training samples: 27,130
Total validation samples: 6,783
Total parameters: 569,220
Training parameters: 569,220

CROSS VALIDATION

HYPERPARAMETER SEARCH

Total folds: 2
Total hyperparameter combinations: 8
Total fits: 16

Fold 1 of 2.
Candidate 1 of 8.
Hyperparameters:
Layers: 1, hidden: 100, Dropout: 0.1


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.424 at epoch: 49.000
Time elapsed: 0.25 min

Fold 1 of 2.
Candidate 2 of 8.
Hyperparameters:
Layers: 1, hidden: 100, Dropout: 0.2


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.436 at epoch: 49.000
Time elapsed: 0.52 min

Fold 1 of 2.
Candidate 3 of 8.
Hyperparameters:
Layers: 1, hidden: 200, Dropout: 0.1


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.467 at epoch: 49.000
Time elapsed: 0.81 min

Fold 1 of 2.
Candidate 4 of 8.
Hyperparameters:
Layers: 1, hidden: 200, Dropout: 0.2


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.459 at epoch: 45.000
Time elapsed: 1.09 min

Fold 1 of 2.
Candidate 5 of 8.
Hyperparameters:
Layers: 2, hidden: 100, Dropout: 0.1


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.603 at epoch: 49.000
Time elapsed: 1.55 min

Fold 1 of 2.
Candidate 6 of 8.
Hyperparameters:
Layers: 2, hidden: 100, Dropout: 0.2


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.634 at epoch: 48.000
Time elapsed: 2.03 min

Fold 1 of 2.
Candidate 7 of 8.
Hyperparameters:
Layers: 2, hidden: 200, Dropout: 0.1


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.643 at epoch: 45.000
Time elapsed: 2.54 min

Fold 1 of 2.
Candidate 8 of 8.
Hyperparameters:
Layers: 2, hidden: 200, Dropout: 0.2


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.706 at epoch: 43.000
Time elapsed: 3.03 min

Best Fold 1 Hyperparameters:
{'layers': 1, 'hidden': 100, 'dropout': 0.1}
Best Loss: 1.424 at epoch: 49.000

Fold 2 of 2.
Candidate 1 of 8.
Hyperparameters:
Layers: 1, hidden: 100, Dropout: 0.1


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.518 at epoch: 43.000
Time elapsed: 3.33 min

Fold 2 of 2.
Candidate 2 of 8.
Hyperparameters:
Layers: 1, hidden: 100, Dropout: 0.2


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.397 at epoch: 47.000
Time elapsed: 3.59 min

Fold 2 of 2.
Candidate 3 of 8.
Hyperparameters:
Layers: 1, hidden: 200, Dropout: 0.1


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.484 at epoch: 49.000
Time elapsed: 3.87 min

Fold 2 of 2.
Candidate 4 of 8.
Hyperparameters:
Layers: 1, hidden: 200, Dropout: 0.2


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.455 at epoch: 46.000
Time elapsed: 4.15 min

Fold 2 of 2.
Candidate 5 of 8.
Hyperparameters:
Layers: 2, hidden: 100, Dropout: 0.1


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.597 at epoch: 49.000
Time elapsed: 4.65 min

Fold 2 of 2.
Candidate 6 of 8.
Hyperparameters:
Layers: 2, hidden: 100, Dropout: 0.2


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.583 at epoch: 45.000
Time elapsed: 5.15 min

Fold 2 of 2.
Candidate 7 of 8.
Hyperparameters:
Layers: 2, hidden: 200, Dropout: 0.1


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.510 at epoch: 44.000
Time elapsed: 5.67 min

Fold 2 of 2.
Candidate 8 of 8.
Hyperparameters:
Layers: 2, hidden: 200, Dropout: 0.2


Epochs:   0%|          | 0/50 [00:00<?, ?epoch/s]

Best Loss: 1.629 at epoch: 42.000
Time elapsed: 6.14 min

Best Fold 2 Hyperparameters:
{'layers': 1, 'hidden': 100, 'dropout': 0.2}
Best Loss: 1.397 at epoch: 47.000

Best Hyperparameters for 2 folds and 16 fits :
{'layers': 1, 'hidden': 100, 'dropout': 0.2}
Best Loss: 1.397 at epoch: 47.000
Time elapsed: 6.14 min
