In [1]:
import lightning as L
import torch
import os

In [2]:
from catalytic_function.model import MPNNDimRed, TwoChannelFFN, TwoChannelLinear
from catalytic_function.nn import BondMessagePassing, LinDimRed
from chemprop.nn import MeanAggregation
from catalytic_function.nn import DotSig

In [5]:
# Hidden size shared by the message-passing encoder and the dimension-reduction layer.
d_h_encoder = 20
pred_head = DotSig(input_dim=d_h_encoder * 2)
mp = BondMessagePassing(d_v=67, d_e=7, d_h=d_h_encoder, depth=1)
agg = MeanAggregation()
embed_dim = 1280  # input width of the LinDimRed layer below
n_epochs = 4  # NOTE(review): not referenced anywhere else in this cell

model = MPNNDimRed(
    reduce_X_d=LinDimRed(d_in=embed_dim, d_out=d_h_encoder),
    message_passing=mp,
    agg=agg,
    predictor=pred_head,
)

# Locate the checkpoint directory for one (hp_idx, split) run.
res_dir = "/projects/p30041/spn1560/hiec/artifacts/model_evals/gnn"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
chkpt_idx = 8
split_idx = 0  # zero-based; the directory name uses split_idx + 1
n_splits = 5
chkpt_dir = f"{res_dir}/{chkpt_idx}_hp_idx_split_{split_idx+1}_of_{n_splits}/version_0/checkpoints"

# os.listdir returns entries in arbitrary order, so sort before picking one;
# otherwise the chosen file can change between runs when several checkpoints exist.
chkpt_files = sorted(os.listdir(chkpt_dir))
if not chkpt_files:
    raise FileNotFoundError(f"No checkpoint files found in {chkpt_dir}")
chkpt_file = chkpt_files[0]
chkpt_path = f"{chkpt_dir}/{chkpt_file}"

# map_location places the stored tensors on whichever device is available here.
chkpt = torch.load(chkpt_path, map_location=device)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Pass map_location so the stored tensors are remapped onto `device`; without it
# `device` was computed but never used, and a checkpoint saved from a GPU run
# cannot be loaded on a CPU-only machine.
chkpt = torch.load(
    '/projects/p30041/spn1560/hiec/artifacts/model_evals/gnn/88_hp_idx_split_5_of_5/version_0/checkpoints/epoch=18-step=26790.ckpt',
    map_location=device,
)
# Show the epoch counter stored in the checkpoint.
chkpt['epoch']

18

In [6]:
# Single-device Trainer with checkpoint writing disabled and the progress bar enabled.
# NOTE(review): max_epochs=0 presumably makes fit() return without running an epoch —
# confirm that is the intent of this experiment.
trainer = L.Trainer(
    accelerator="auto",
    devices=1,
    max_epochs=0,  # epoch budget for fit()
    enable_progress_bar=True,
    enable_checkpointing=False,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
# NOTE(review): this call fails — `torch` has no top-level `DataLoader` attribute
# (see the AttributeError recorded in this cell's output). A later cell in this
# notebook reaches the class as `utils.data.DataLoader` (with `utils` imported from
# torch) and passes it a dataset; no dataset is supplied here either. TODO: decide
# which dataset this fit is meant to use before fixing the call.
trainer.fit(
    model,
    torch.DataLoader(),
)

AttributeError: module 'torch' has no attribute 'DataLoader'

In [None]:
# Generic restore-from-checkpoint pattern: rebuild the model and optimizer, then load
# their saved states from a dict produced by torch.load.
# NOTE(review): this cell carries no execution count, and `Net`, `optim` and `PATH`
# are not defined in any cell above it (`optim` is only imported in a later cell).
# Running it as-is would also rebind `model`, which an earlier cell assigns to the
# MPNNDimRed instance.
model = Net()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# The keys read below ('model_state_dict', 'optimizer_state_dict', 'epoch', 'loss')
# must exist in the loaded dict, otherwise a KeyError is raised.
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# The two calls below are meant as alternatives; as written both run, so the model
# ends up in training mode.
model.eval()
# - or -
model.train()

In [18]:
# Display the whole checkpoint dict. The recorded output shows the keys 'epoch',
# 'global_step', 'pytorch-lightning_version' and 'state_dict', followed by full weight
# tensors. NOTE(review): this dumps every tensor; `checkpoint.keys()` would give a
# shorter overview.
checkpoint

{'epoch': 1,
 'global_step': 2,
 'pytorch-lightning_version': '2.2.5',
 'state_dict': OrderedDict([('message_passing.W_i.weight',
               tensor([[ 5.5946e-02, -5.2682e-02,  4.2552e-02,  ...,  6.1952e-02,
                        -5.2242e-02, -5.3086e-02],
                       [-2.1555e-02, -1.1180e-01,  7.3735e-02,  ..., -7.5020e-02,
                        -1.0554e-01, -7.9146e-02],
                       [ 6.0962e-02,  1.0429e-01, -2.6556e-02,  ..., -4.2485e-02,
                        -1.0189e-01, -5.6405e-02],
                       ...,
                       [ 4.1036e-02,  7.3391e-02, -8.3135e-02,  ..., -6.7459e-02,
                        -7.9029e-05, -4.1186e-02],
                       [-4.4437e-02,  3.6596e-02,  3.1334e-02,  ...,  3.8038e-02,
                         3.1880e-02,  3.9304e-02],
                       [ 1.1336e-01, -6.5759e-02,  6.2761e-02,  ...,  3.1353e-02,
                        -9.2325e-02,  7.6437e-02]])),
              ('message_passing.W_h.weigh

In [2]:
import pandas as pd
from src.cross_validation import BatchGridSearch, BatchScript, HyperHyperParams
from dataclasses import fields
from math import isnan

def fix_ints(hp_dict):
    """Cast the integer-valued hyperparameters in ``hp_dict`` to ``int``.

    Only a fixed set of keys is considered. A key is left untouched when it is
    absent from ``hp_dict`` or when its value is NaN; every other listed key has
    its value replaced by ``int(value)``.

    Args:
        hp_dict: mapping of hyperparameter name to value. It is modified in place.

    Returns:
        The same ``hp_dict`` object that was passed in.
    """
    int_valued_keys = (
        'encoder_depth',
        'embed_dim',
        'seed',
        'n_epochs',
        'd_h_encoder',
        'n_splits',
        'neg_multiple',
    )

    for key in int_valued_keys:
        # NaN marks a missing value and cannot be converted to int, so skip it.
        if key in hp_dict and not isnan(hp_dict[key]):
            hp_dict[key] = int(hp_dict[key])

    return hp_dict

# Batch job settings passed to BatchScript
allocation = 'b1039'
partition = 'b1039'
mem = '12G'  # memory request
time = '12'  # wall time, in hours
fit_script = 'two_channel_fit.py'
batch_script = BatchScript(
    allocation=allocation,
    partition=partition,
    mem=mem,
    time=time,
    script=fit_script,
)
res_dir = "/projects/p30041/spn1560/hiec/artifacts/model_evals/gnn"

# Maps each previously-run hp_idx to the total number of epochs to train it up to.
# Commented-out entries are currently excluded from the resume.
hp_idx_epochs = {
    # 29:50,
    # 28:50,
    # 45:50,
    8: 50,
    68: 50,
    # 46:50,
    # 47:50,
}

# Read the experiment table (tab-separated, first column as index) and keep only
# the rows whose index is one of the hp_idxs being resumed.
experiments = pd.read_csv(f"{res_dir}/experiments.csv", sep='\t', index_col=0)
to_resume = experiments.loc[hp_idx_epochs.keys()]

# Columns named after HyperHyperParams fields define the groups; every other
# column is treated as an ordinary hyperparameter.
by = [field.name for field in fields(HyperHyperParams)]
other_columns = [col for col in experiments.columns if col not in by]
gb = to_resume.groupby(by=by)

# One entry per group of rows sharing the same hyper-hyperparameter values:
hhp_args = []    # dict of hyper-hyperparameter values for the group
hps = []         # list of hyperparameter dicts, one per row in the group
chkpt_idxs = []  # list of hp_idxs to load model checkpoints from, aligned with hps
for hhp_vals, group in gb:
    hhp_args.append(dict(zip(by, hhp_vals)))

    hp_chunk = []
    chckpt_chunk = []
    for hp_idx, row in group.iterrows():
        tmp = {col: row[col] for col in other_columns}
        # A resumed fit keeps every hyperparameter except n_epochs, which is
        # replaced by the new total from hp_idx_epochs.
        tmp['n_epochs'] = hp_idx_epochs[hp_idx]
        hp_chunk.append(tmp)
        chckpt_chunk.append(hp_idx)

    hps.append(hp_chunk)
    chkpt_idxs.append(chckpt_chunk)

# # Run bsg
# for hhp, hp, chkpt in zip(hhp_args, hps, chkpt_idxs):
#     hp = [fix_ints(elt) for elt in hp]
#     hhp = fix_ints(hhp)
#     hhps = HyperHyperParams(**hhp)
#     gs = BatchGridSearch(hhps, res_dir)
#     gs.resume(hp, batch_script, chkpt)

In [19]:
# Pull the 'message_passing' entry from the first hyperparameter dict of the first group.
# NOTE(review): this rebinds `mp`, which an earlier cell assigns to a
# BondMessagePassing instance.
mp = hps[0][0]['message_passing']

In [21]:
# Check the Python type of the value taken from the experiments table (recorded output: float).
# NOTE(review): a float here presumably means the CSV cell was parsed as NaN — confirm.
type(mp)

float

In [9]:
import os
from torch import optim, nn, utils, Tensor
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
import lightning as L

# Two small fully-connected stacks: the encoder maps 28 * 28 inputs to 3 values
# through a 64-unit hidden layer, and the decoder mirrors it back to 28 * 28.
encoder = nn.Sequential(
    nn.Linear(28 * 28, 64),
    nn.ReLU(),
    nn.Linear(64, 3),
)
decoder = nn.Sequential(
    nn.Linear(3, 64),
    nn.ReLU(),
    nn.Linear(64, 28 * 28),
)


# define the LightningModule
class LitAutoEncoder(L.LightningModule):
    """Autoencoder LightningModule built from a caller-supplied encoder and decoder.

    Args:
        encoder: module applied to the flattened input batch.
        decoder: module applied to the encoder output to reconstruct the input.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def training_step(self, batch, batch_idx):
        """Return the MSE reconstruction loss for one batch and log it as "train_loss".

        This defines the training loop body and does not go through ``forward``.
        """
        inputs, _ = batch  # the second element of the batch is not used
        # Keep the leading dimension and collapse everything else into one axis.
        flat = inputs.view(inputs.size(0), -1)
        reconstruction = self.decoder(self.encoder(flat))
        loss = nn.functional.mse_loss(reconstruction, flat)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate 1e-3."""
        return optim.Adam(self.parameters(), lr=1e-3)


# Instantiate the LightningModule from the module-level `encoder` and `decoder`
autoencoder = LitAutoEncoder(encoder, decoder)

In [10]:
# MNIST is stored under the current working directory (downloaded when requested)
# and each sample is converted with ToTensor.
dataset = MNIST(root=os.getcwd(), transform=ToTensor(), download=True)
# The loader is left at the DataLoader defaults.
train_loader = utils.data.DataLoader(dataset=dataset)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to /home/spn1560/hiec/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 53905870.54it/s]


Extracting /home/spn1560/hiec/MNIST/raw/train-images-idx3-ubyte.gz to /home/spn1560/hiec/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to /home/spn1560/hiec/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 1434696.08it/s]


Extracting /home/spn1560/hiec/MNIST/raw/train-labels-idx1-ubyte.gz to /home/spn1560/hiec/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to /home/spn1560/hiec/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 12994129.25it/s]


Extracting /home/spn1560/hiec/MNIST/raw/t10k-images-idx3-ubyte.gz to /home/spn1560/hiec/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to /home/spn1560/hiec/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 1988988.18it/s]


Extracting /home/spn1560/hiec/MNIST/raw/t10k-labels-idx1-ubyte.gz to /home/spn1560/hiec/MNIST/raw



In [11]:
# Build a new Trainer (this rebinds `trainer`) limited to 100 training batches per epoch,
# then fit the autoencoder on the MNIST loader.
# NOTE(review): max_epochs=0 presumably means no epoch is actually run — confirm intended.
trainer = L.Trainer(max_epochs=0, limit_train_batches=100)
trainer.fit(autoencoder, train_dataloaders=train_loader)

/home/spn1560/.conda/envs/hiec/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/spn1560/.conda/envs/hiec/lib/python3.11/site-p ...
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/spn1560/hiec/lightning_logs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 50.4 K
1 | decoder | Sequential | 51.2 K
---------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)
/home/spn1560/.conda/envs/hiec/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_data