In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
import wandb
import pandas as pd
import pytorch_lightning as pl
import torch
from pathlib import Path
from src.utils.io import HDFReader
from src.data.preprocess import preprocess
from src.data.modules.paralog import ParalogousGeneDataModule
from src.models.baseline import ConvolutionalModel
from pytorch_lightning.loggers import WandbLogger

In [47]:
# preprocess(
#     "data/genome/gff_file.gff",
#     "data/genome/fasta_file.fsa",
#     "data/embeddings",
#     "data/waern_2013",
#     "data/samples.json",
#     "data/processed",
#     500,
# )

In [73]:
summary_path = Path("data/processed/summary.csv")
h5_path = Path("data/processed/genewise.h5")

assert summary_path.exists()
assert h5_path.exists()

In [49]:
h5_reader = HDFReader("data/processed/genewise.h5")

In [50]:
genes = pd.read_csv(summary_path)["gene"].to_list()
genes[:5]

['YAL001C', 'YAL002W', 'YAL003W', 'YAL004W', 'YAL005C']

In [51]:
# gene_expression = {
#     gene: h5_reader[gene][1] for gene in genes
# }
# df_expression = pd.DataFrame(gene_expression).T


In [52]:
df_summary = pd.read_csv(summary_path)
df_summary.head()

Unnamed: 0,gene,coordinates,chromosome,strand,paralog_group
0,YAL001C,"[(147594, 151006), (151097, 151166)]",chrI,-,YAL001C
1,YAL002W,"[(143707, 147531)]",chrI,+,YAL002W
2,YAL003W,"[(142174, 142253), (142620, 143160)]",chrI,+,YAL003W
3,YAL004W,"[(140760, 141407)]",chrI,+,YAL004W
4,YAL005C,"[(139503, 141431)]",chrI,-,YKL073W


In [65]:
df_summary_subset = df_summary.sample(1000)

In [71]:
lr = 1e-3
batch_size = 8
weight_decay = 1e-4
pooling_type = "max"
average_window = False
n_folds = 5

wandb.finish()

model_output_dir = Path("models/baseline")
model_output_dir.mkdir(exist_ok=True, parents=True)

for fold in range(n_folds):
    dm = ParalogousGeneDataModule(h5_reader, df_summary, fold, batch_size=batch_size, prefetch_factor=20, n_folds=n_folds)
    train_loader = dm.train_dataloader()
    test_loader = dm.test_dataloader()

    model = ConvolutionalModel(
        pooling_type=pooling_type,
        learning_rate=lr,
        weight_decay=weight_decay,
    )
    wandb_logger = WandbLogger(
        project="RNA_prediction",
        name=(
            f"fold_{fold}, {pooling_type} convolution, average_window={average_window}, "
            f"lr={lr:.1e}, batch_size={batch_size}, weight_decay={weight_decay:.1e}"
        )
    )
    trainer = pl.Trainer(max_epochs=20, logger=wandb_logger)
    trainer.fit(model, train_loader, test_loader)

    # Save model
    torch.save(model.state_dict(), model_output_dir / f"fold_{fold}.pth")

    wandb.finish()

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | conv    | Conv1d  | 13.8 K | train
1 | loss_fn | MSELoss | 0      | train
--------------------------------------------
13.8 K    Trainable params
0         Non-trainable params
13.8 K    Total params
0.055     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/nico/Software/uni/ML4RG/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


0,1
epoch,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇██
train_loss,█▂▅▃▃▁▃▂▂▃▂▃▂▁▃▁▂▁▁▂▂▄▃▅▂▃▁▁▂▂▂▂▃▂▁▂▁▁▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇█████
val_correlation,▁▅▆▇█████▇█▇▇██▇▇▇▇▇
val_explained_variance,▁▆▇████████▇▇█▇▇▇▇▆▆
val_loss,█▄▃▂▂▁▁▃▁▂▁▄▃▁▂▂▂▂▃▃

0,1
epoch,19.0
train_loss,1.75645
trainer/global_step,13499.0
val_correlation,0.64294
val_explained_variance,0.40755
val_loss,0.37037


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | conv    | Conv1d  | 13.8 K | train
1 | loss_fn | MSELoss | 0      | train
--------------------------------------------
13.8 K    Trainable params
0         Non-trainable params
13.8 K    Total params
0.055     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/nico/Software/uni/ML4RG/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


0,1
epoch,▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇▇███████
train_loss,▄▄▄▃▃▃▄█▄█▂▂▆▃▄▅▃▂█▃▃▃▃▄▃▂▃▁█▅▂▆▁▁▁▂▄▆▅█
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇███
val_correlation,▁▅▇██████████▇▇▇▇▇▆▆
val_explained_variance,▁▆▇████████▇▇▇▇▆▇▇▆▆
val_loss,█▃▁▁▁▁▁▂▁▁▁▁▂▁▂▃▂▂▂▂

0,1
epoch,19.0
train_loss,0.77909
trainer/global_step,13559.0
val_correlation,0.62768
val_explained_variance,0.39056
val_loss,0.42284


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | conv    | Conv1d  | 13.8 K | train
1 | loss_fn | MSELoss | 0      | train
--------------------------------------------
13.8 K    Trainable params
0         Non-trainable params
13.8 K    Total params
0.055     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/nico/Software/uni/ML4RG/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train_loss,▅▂▁▆█▁▁▂▂▃▆▂▃▃▂▂▂▁▃▁▂▃▂▂▄▁▂▁▃▂▂▂▂▂▂▂▂▁▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
val_correlation,▁▅▅▇▇▇▇▇▇▇█▇▇▇███▇▇▇
val_explained_variance,▁▅▅▇▇▇▇▇▆▆█▇▇▇█▇█▅▆▆
val_loss,█▆▅▃▂▃▆▃▃▃▁▂▂▃▅▂▂▄▃▃

0,1
epoch,19.0
train_loss,0.1157
trainer/global_step,13319.0
val_correlation,0.58206
val_explained_variance,0.3135
val_loss,0.41529


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | conv    | Conv1d  | 13.8 K | train
1 | loss_fn | MSELoss | 0      | train
--------------------------------------------
13.8 K    Trainable params
0         Non-trainable params
13.8 K    Total params
0.055     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/nico/Software/uni/ML4RG/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train_loss,▂▆▁▁▁▂▁▄▃▂▂▁▂▁▁▂▆▁█▃▂▁▂▁▁▅▃▂▆▂▃▂▃▂▁▁▂▂▂▁
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██
val_correlation,▁▄▆▆▇▇███▇█▇▇█▇▇▇▇▇▆
val_explained_variance,▁▅▆▆▇████▇██▇█▇▇▇▇▇▆
val_loss,█▄▂▃▂▁▁▁▁▂▂▂▁▁▂▂▂▂▂▃

0,1
epoch,19.0
train_loss,0.2049
trainer/global_step,13219.0
val_correlation,0.63629
val_explained_variance,0.40423
val_loss,0.44122


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



  | Name    | Type    | Params | Mode 
--------------------------------------------
0 | conv    | Conv1d  | 13.8 K | train
1 | loss_fn | MSELoss | 0      | train
--------------------------------------------
13.8 K    Trainable params
0         Non-trainable params
13.8 K    Total params
0.055     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/nico/Software/uni/ML4RG/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇█████
train_loss,▂▃▃▁▁▂▂▂█▁▂▃▂▃▂▂▂▂▂▁▂▃▃▂▂▁▁▄▃▃▂▅▂▂▇▂▂▁▂▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇█
val_correlation,▁▅▇▇▇▇▇█████▇███▇▇█▇
val_explained_variance,▁▅▇▇▇▇▇████▇▇███▇▇▇▇
val_loss,█▃▂▂▂▁▂▁▁▁▁▂▁▂▂▁▂▂▂▄

0,1
epoch,19.0
train_loss,0.26036
trainer/global_step,13499.0
val_correlation,0.62266
val_explained_variance,0.38485
val_loss,0.40896
