In [1]:
# Notebook housekeeping. These are IPython magics: keep each on its own line,
# with comments on separate lines only.
# Autosave the notebook every 300 seconds.
%autosave 300
# Load the autoreload extension and switch it to mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension right after loading it looks redundant;
# confirm whether it is only here to avoid the "already loaded" message on re-runs.
%reload_ext autoreload
# Disable Jedi-based tab completion.
%config Completer.use_jedi = False

Autosaving every 300 seconds


In [2]:
import os
from pathlib import Path


def enter_project_root(marker: str = "src") -> Path:
    """Make the project root the working directory, at most once.

    The cells below import from the ``src`` package, so the kernel has to run
    from the directory that contains it. An unconditional ``os.chdir("..")``
    climbs one more level every time the cell is re-executed; this guard makes
    the cell idempotent by skipping the move when ``marker`` is already present
    in the current directory.

    Args:
        marker: Name of a directory expected to exist in the project root.

    Returns:
        The working directory after the (possible) move.
    """
    if not (Path.cwd() / marker).is_dir():
        # Same move as before: one level up from the notebook's folder.
        os.chdir(os.pardir)
    return Path.cwd()


print(enter_project_root())

/mnt/batch/tasks/shared/LS_root/mounts/clusters/soutrik-gpuvm/code/Users/Soutrik.Chowdhury/GeneCpGSequencing


In [15]:
from src.utils import prepare_data, dna2int, set_seed
import torch
from src.dataset import CPGDatasetPackPadding
from torch.utils.data import DataLoader
from src.model import CpGCounterAdvancedPackPadding
from src.trainer import (
    training_loop_pack_padded,
    validation_loop_pack_padded,
    train_model_pack_padded,
)
from src.utils import (
    predict_cpgs_from_dna_pack_padded,
    save_checkpoint,
    load_checkpoint,
    create_test_data
)
from src.tuner import objective, tune_hyperparameters

##### -------------------------------------------------------- Hyperparameters --------------------------------------------------------

In [4]:
# --- Runtime -----------------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Model architecture ------------------------------------------------------
vocab_size = len(dna2int)  # size of the embedding table, one row per dna2int entry
embedding_dim = 64
hidden_size = 256
num_layers = 2
dropout = 0.2

# --- Optimisation ------------------------------------------------------------
batch_size = 16
learning_rate = 1e-3
weight_decay = 1e-4
num_epochs = 100
stop_patience = 10  # forwarded as `patience` to the training helper

##### --------------------- Prepare data ---------------------

In [5]:
set_seed(42)  # presumably seeds the RNGs used by prepare_data; confirm in src.utils
train_x, train_y = prepare_data("variable", 2048)
test_x, test_y = prepare_data("variable", 512)

# Sequences are variable-length (the recorded run shows 16 to 128 tokens each);
# the second argument is the number of sequences in the split (2048 / 512).
print(
    min(len(seq) for seq in train_x),
    max(len(seq) for seq in train_x),
    min(len(seq) for seq in test_x),
    max(len(seq) for seq in test_x),
)
print(len(train_x), len(test_x))

16 128 16 128
2048 512


In [6]:
# Wrap the raw sequences and labels in the project's Dataset class.
# Note that the validation dataset is built from the test_x / test_y split.
train_dataset, val_dataset = (
    CPGDatasetPackPadding(train_x, train_y),
    CPGDatasetPackPadding(test_x, test_y),
)

# Peek at the first item: one (sequence, label) pair. In the recorded run the
# sequence is a 1-D tensor and the label a 0-D (scalar) tensor.
x, y = next(iter(train_dataset))
print(x.shape, y.shape)

torch.Size([97]) torch.Size([])


In [7]:
# Both loaders use the same batching setup, so it is defined once.
# NOTE(review): shuffle=False also applies to the training loader, so every
# epoch sees identical batches; confirm this is intentional.
loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=False,
    collate_fn=CPGDatasetPackPadding.collate_fn,
)
train_dataloader = DataLoader(train_dataset, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, **loader_kwargs)

# Sanity check on the first batch only: the collate function yields three
# tensors (sequences, labels, lengths). In the recorded run the sequences form
# a rectangular (batch, longest-sequence) tensor.
for x_batch, y_batch, lengths in train_dataloader:
    print(x_batch.shape, y_batch.shape, lengths.shape)
    break

torch.Size([16, 115]) torch.Size([16]) torch.Size([16])


##### ------------------- Model Training and Evaluation -------------------

In [8]:
# Architecture settings come straight from the hyperparameter cell.
model_kwargs = {
    "vocab_size": vocab_size,
    "embedding_dim": embedding_dim,
    "hidden_size": hidden_size,
    "num_layers": num_layers,
    "dropout": dropout,
}
new_model = CpGCounterAdvancedPackPadding(**model_kwargs)

# Show the layer structure for a quick visual check.
print(new_model)

CpGCounterAdvancedPackPadding(
  (embedding): Embedding(6, 64, padding_idx=0)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (batch_norm): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (relu): ReLU()
)


In [9]:
# Baseline training run with the hand-picked hyperparameters.
# Per the recorded log, the helper prints train/val loss, MAE and RMSE for each
# epoch, counts down `patience` while the validation loss does not improve, and
# writes a checkpoint to models/<save_path> on every new best validation loss.
# NOTE(review): assumes the helper moves the model and batches to `device`;
# confirm in src.trainer.
trained_model = train_model_pack_padded(
    new_model,
    train_dataloader,
    val_dataloader,
    device,
    epochs=num_epochs,
    patience=stop_patience,
    save_path="best_cpg_model_advanced_packpad.pth",
    lr=learning_rate,
    weight_decay=weight_decay,
)

Starting training from scratch.
Epoch 1/100, Train Loss: 8.2485, Train MAE: 2.2632, Train RMSE: 2.8213, Val Loss: 6.0834, Val MAE: 1.9195, Val RMSE: 2.4287
New best validation loss: 6.0834 (previous best: inf)
Model checkpoint saved at models/best_cpg_model_advanced_packpad.pth
Epoch 2/100, Train Loss: 6.3996, Train MAE: 1.9678, Train RMSE: 2.4737, Val Loss: 5.8931, Val MAE: 1.8903, Val RMSE: 2.3775
New best validation loss: 5.8931 (previous best: 6.0834)
Model checkpoint saved at models/best_cpg_model_advanced_packpad.pth
Epoch 3/100, Train Loss: 6.2039, Train MAE: 1.9421, Train RMSE: 2.4372, Val Loss: 5.9874, Val MAE: 1.8995, Val RMSE: 2.3938
No improvement, patience left: 9
Epoch 4/100, Train Loss: 5.3221, Train MAE: 1.7435, Train RMSE: 2.2303, Val Loss: 4.8355, Val MAE: 1.6738, Val RMSE: 2.1552
New best validation loss: 4.8355 (previous best: 5.8931)
Model checkpoint saved at models/best_cpg_model_advanced_packpad.pth
Epoch 5/100, Train Loss: 1.9347, Train MAE: 0.9590, Train RMSE: 

In [12]:
# Build one DNA string together with its reference CpG count, and show both
# (one per line) so the predictions below can be compared against it.
test_dna, count_cpg = create_test_data()
print(test_dna, count_cpg, sep="\n")

ACNTCANCNCNTNGCGANTNGGGNATGGGTTANNTNNGGGNCTTCGCTANTCATTTAANCTGCGATTGGNGNTGCCNTATTTNCGACAANCTGTGCACGCCTNCGNCTNA
6


In [14]:
# Predict the CpG count for the sample with the baseline checkpoint.
# The architecture arguments must match the ones the checkpoint was trained with.
# Keyword names mirror the call made for the tuned model further down.
predicted_cpgs = predict_cpgs_from_dna_pack_padded(
    model_path="best_cpg_model_advanced_packpad.pth",
    dna_sequence=test_dna,
    dna2int=dna2int,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
    device=device,
    model_class=CpGCounterAdvancedPackPadding,
)
# NOTE(review): this success message is printed unconditionally; the reference
# value is in `count_cpg` if a real comparison is wanted.
print("Voila! The model is working perfectly")
print(f"DNA: {test_dna} \n🔹 Predicted CpG Count: {predicted_cpgs}")

Voila! The model is working perfectly
DNA: ACNTCANCNCNTNGCGANTNGGGNATGGGTTANNTNNGGGNCTTCGCTANTCATTTAANCTGCGATTGGNGNTGCCNTATTTNCGACAANCTGTGCACGCCTNCGNCTNA 
🔹 Predicted CpG Count: 5.85


##### -------------- Hyperparameter Tuning ------------------ #####

In [17]:
# Hyperparameter search over 7 trials, reusing the loaders built above.
# Per the recorded log, a study named "cpg_optuna" is created in memory, each
# trial reports its validation value and sampled parameters, and a final model
# is then trained from scratch with the best parameters and checkpointed to
# models/best_cpg_model_optuna.pth.
# NOTE: this rebinds `trained_model`, replacing the baseline model returned by
# the earlier training cell.
best_hyperparams, trained_model = tune_hyperparameters(
    vocab_size,
    train_dataloader,
    val_dataloader,
    device,
    num_epochs,
    stop_patience,
    n_trials=7,
    save_best_model_path="best_cpg_model_optuna.pth",
    study_name="cpg_optuna",
)

[I 2025-02-03 17:01:49,665] A new study created in memory with name: cpg_optuna


[I 2025-02-03 17:02:37,317] Trial 0 finished with value: 0.28130437107756734 and parameters: {'embedding_dim': 128, 'hidden_size': 512, 'num_layers': 1, 'learning_rate': 0.00010051188891410282, 'weight_decay': 0.0011020350108512757, 'dropout': 0.18977875104643538}. Best is trial 0 with value: 0.28130437107756734.
[I 2025-02-03 17:03:20,907] Trial 1 finished with value: 0.03693105326965451 and parameters: {'embedding_dim': 256, 'hidden_size': 128, 'num_layers': 1, 'learning_rate': 0.00038663647849379747, 'weight_decay': 0.0003332445708937284, 'dropout': 0.15203537828834462}. Best is trial 1 with value: 0.03693105326965451.
[I 2025-02-03 17:05:18,349] Trial 2 finished with value: 0.029716629622271284 and parameters: {'embedding_dim': 64, 'hidden_size': 512, 'num_layers': 3, 'learning_rate': 0.0004204614036578065, 'weight_decay': 0.0002772933970987665, 'dropout': 0.2335641535286226}. Best is trial 2 with value: 0.029716629622271284.
[32m[I 2025-02-0

Best Hyperparameters: {'embedding_dim': 256, 'hidden_size': 256, 'num_layers': 2, 'learning_rate': 0.0012714184201257357, 'weight_decay': 0.0014472616603016003, 'dropout': 0.20122992106623538}
Starting training from scratch.
Epoch 1/100, Train Loss: 7.3238, Train MAE: 2.1066, Train RMSE: 2.6331, Val Loss: 5.0425, Val MAE: 1.8097, Val RMSE: 2.2140
New best validation loss: 5.0425 (previous best: inf)
Model checkpoint saved at models/best_cpg_model_optuna.pth
Epoch 2/100, Train Loss: 5.8061, Train MAE: 1.8643, Train RMSE: 2.3506, Val Loss: 5.1400, Val MAE: 1.8439, Val RMSE: 2.2400
No improvement, patience left: 9
Epoch 3/100, Train Loss: 5.4962, Train MAE: 1.8032, Train RMSE: 2.2798, Val Loss: 5.6980, Val MAE: 2.0648, Val RMSE: 2.3682
No improvement, patience left: 8
Epoch 4/100, Train Loss: 3.3540, Train MAE: 1.2412, Train RMSE: 1.7313, Val Loss: 7.0657, Val MAE: 2.2085, Val RMSE: 2.6395
No improvement, patience left: 7
Epoch 5/100, Train Loss: 1.0630, Train MAE: 0.7489, Train RMSE: 0.9

In [18]:
# Predict with the checkpoint produced by the tuning run. The architecture
# arguments are taken from the best trial so they match the saved weights.
tuned_arch = {
    key: best_hyperparams[key]
    for key in ("embedding_dim", "hidden_size", "num_layers", "dropout")
}
predicted_cpgs = predict_cpgs_from_dna_pack_padded(
    model_path="best_cpg_model_optuna.pth",
    dna_sequence=test_dna,
    dna2int=dna2int,
    device=device,
    model_class=CpGCounterAdvancedPackPadding,
    **tuned_arch,
)
# NOTE(review): this success message is printed unconditionally; the reference
# value is in `count_cpg` if a real comparison is wanted.
print("Voila! The hyper tuned model is working perfectly")
print(f"DNA: {test_dna} \n🔹 Predicted CpG Count: {predicted_cpgs}")

Voila! The hyper tuned model is working perfectly
DNA: ACNTCANCNCNTNGCGANTNGGGNATGGGTTANNTNNGGGNCTTCGCTANTCATTTAANCTGCGATTGGNGNTGCCNTATTTNCGACAANCTGTGCACGCCTNCGNCTNA 
🔹 Predicted CpG Count: 5.67


################################################################## END OF SETUP ################################################################