In [138]:
import torch
import numpy as np
import torch.nn as nn
import sys
import os
# Parent of the current working directory, resolved to an absolute path.
# The `src.*` imports in this cell rely on it being the repository root, so this
# depends on where the notebook kernel was started.
project_root = os.path.abspath("..")  # Adjust if needed
import pytorch_lightning as pl
# Add the project root to sys.path so `src` resolves as a package; the membership
# test keeps re-runs of this cell from appending duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.models.pointNetVae import PointNetVAE
from src.utils.data_utils import *
from src.dataset_classes.pointDataset import *
from proteinshake.datasets import ProteinFamilyDataset
from proteinshake.tasks import LigandAffinityTask
import random
from torch.utils.data import Dataset, Subset
from src.utils.data_utils import *
from src.dataset_classes.graphDataset import *
from torch_geometric.nn import TopKPooling
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
# Load the ProteinFamily data in two representations (graph and point cloud) and
# preprocess the graph view with the project's `load_graph_data` helper.
DATA_ROOT = '../data'

# Raw PyG graph view. It is kept under its own name instead of being overwritten,
# so the un-processed records stay reachable after preprocessing.
# NOTE(review): eps=8 is forwarded to `to_graph`; presumably a distance cutoff — confirm.
graph_dataset_raw = ProteinFamilyDataset(root=DATA_ROOT).to_graph(eps=8).pyg()

# Point-cloud view of the same dataset, converted with `.torch()`.
point_d = ProteinFamilyDataset(root=DATA_ROOT).to_point().torch()

# Processed graphs used by the loaders and models in the rest of the notebook.
# NOTE(review): assumes `load_graph_data` does not modify its argument in place — confirm.
dataset = load_graph_data(graph_dataset_raw)

In [91]:
# Inspect the first processed sample. The recorded output is a single PyG `Data`
# object: 277 nodes x 21 features, 2720 edges, one attribute per edge.
dataset[0]

Data(x=[277, 21], edge_index=[2, 2720], edge_attr=[2720, 1])

In [42]:
# NOTE(review): the recorded output of this cell is `KeyError: 1`. `dataset[0]` is
# displayed elsewhere in this notebook as a single PyG `Data` object, so the
# `[1]['protein']['sequence']` lookup presumably targets the dataset as it was
# before `load_graph_data` — confirm, then repoint or delete this cell.
one_hot_encode_seq(dataset[0][1]['protein']['sequence'], 500).shape

KeyError: 1

In [43]:
# NOTE(review): the recorded output of this cell is `KeyError: 0`. `dataset[0]` is
# displayed elsewhere in this notebook as a single PyG `Data` object whose `x` already
# has 21 columns, so both the `[0]` index and the one-hot step look stale — confirm,
# then fix or delete this cell.
torch.nn.functional.one_hot(dataset[0][0].x, 21)

KeyError: 0

In [145]:
from torch_geometric.loader import DataLoader
from torch_geometric.nn import InnerProductDecoder

# Mini-batch loader over `dataset` (indexing it yields torch_geometric `Data` objects).
# One shuffled batch is pulled out and kept as `test_batch` for the shape checks
# and smoke tests in the cells that follow.
batch_size = 16
loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
)
test_batch = next(iter(loader))

In [148]:
# Display the loader object itself (the recorded output is only its repr; no data is shown).
loader

<torch_geometric.loader.dataloader.DataLoader at 0x37d64afe0>

In [60]:
# For every graph in `test_batch`, recover the per-node class index from the
# 21-column node features and right-pad it to a fixed length with a mask value.
MAX_SEQ_LEN = 500  # fixed padded length; graphs longer than this are not handled explicitly
MASK_INDEX = 21    # one past the largest argmax over 21 columns (valid indices are 0..20)

indices_with_mask_val = []
# Iterate over the graphs actually present in the batch rather than the global
# `batch_size`: a short batch would otherwise contribute extra all-mask rows.
for i in range(test_batch.num_graphs):
    # `test_batch.batch` maps each node row to the index of the graph it belongs to.
    node_rows = test_batch.batch == i
    x_true_indices = test_batch.x[node_rows].argmax(dim=-1)
    pad_len = MAX_SEQ_LEN - x_true_indices.shape[0]
    x_true_indices = torch.nn.functional.pad(x_true_indices, (0, pad_len), value=MASK_INDEX)
    indices_with_mask_val.append(x_true_indices)

In [8]:
from torch_geometric.nn import GCNConv, dense_diff_pool, global_mean_pool, TopKPooling
hidden_dim = 32
latent_dim = 2
class encoder(nn.Module):
    """Two-layer GCN encoder that maps a batch of graphs to Gaussian parameters.

    Node features go through two `GCNConv` layers, are mean-pooled per graph, and
    are projected by two linear heads into `mu` and `logvar`.

    Layer sizes are read from the module-level `hidden_dim` and `latent_dim` at
    construction time, so rebinding those names changes later instances.
    """

    def __init__(self):
        super().__init__()
        # No `cached=True`: GCNConv caching stores the normalised adjacency from the
        # first call and reuses it, which is only valid when every forward pass sees
        # the same graph (transductive learning). Here each mini-batch is different.
        self.conv1 = GCNConv(1, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, 2 * hidden_dim)

        # Separate linear heads for the mean and log-variance of the latent code.
        self.fc_mu = nn.Linear(2 * hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(2 * hidden_dim, latent_dim)

    def forward(self, x):
        """Encode a batched graph object.

        Args:
            x: object exposing `.x` (node features), `.edge_index` and `.batch`.
                NOTE(review): `conv1` has 1 input channel and `.x` gets a trailing
                axis added, so this assumes `.x` holds one scalar per node. The
                dataset shown in this notebook has 21 columns per node — confirm.

        Returns:
            Tuple `(mu, logvar)` computed from the per-graph mean-pooled features.
        """
        x_f, x_edg, x_batch = x.x.unsqueeze(-1).float(), x.edge_index, x.batch
        # The two convolutions are applied back to back, with no activation between them.
        x_f = self.conv1(x_f, x_edg)
        x_f = self.conv2(x_f, x_edg)

        # Collapse node embeddings to one vector per graph using the batch assignment.
        pooled_x = global_mean_pool(x_f, x_batch)
        mu = self.fc_mu(pooled_x)
        logvar = self.fc_logvar(pooled_x)
        return mu, logvar

In [128]:
from src.models.graphVAE import GraphVAE
# Small GraphVAE instance used for the forward/ELBO smoke tests on `test_batch`.
# NOTE(review): the first positional argument is presumably the latent size — confirm
# against the GraphVAE constructor.
gvae = GraphVAE(
    16,
    torch.optim.Adam,
    dict(lr=0.001),
    conv_hidden_dim=32,
    k_pooling=64,
)

In [129]:
# Forward pass of the model on the single held-out batch. The result is an indexable
# sequence; later cells read positions 0, 1, 2 and -1 from it.
test_out = gvae(test_batch)

In [130]:
# Shape of the first model output (recorded as [16, 16]).
test_out[0].shape

torch.Size([16, 16])

In [131]:
# Evaluate the model's ELBO on the test batch from the forward-pass outputs.
# The recorded output is three scalar tensors, the first roughly the sum of the
# other two — presumably (total, reconstruction, KL); confirm in GraphVAE.ELBO.
gvae.ELBO(test_batch,test_out[-1],test_out[1],test_out[2])

logit shape torch.Size([16, 500, 21])


(tensor(3.0737, grad_fn=<DivBackward0>),
 tensor(3.0735, grad_fn=<DivBackward0>),
 tensor(0.0002, grad_fn=<DivBackward0>))

In [117]:
# Shape of the first element of the last model output (recorded as [16, 500, 21]).
test_out[-1][0].shape

torch.Size([16, 500, 21])

In [103]:
from torch_geometric.nn import global_mean_pool
# Sanity check: mean-pool raw node features per graph using the batch assignment;
# the recorded shape is [16, 21], i.e. one 21-dim row per graph in the batch.
global_mean_pool(test_batch.x, test_batch.batch).shape

torch.Size([16, 21])

In [153]:
# Hold out a random 10% of the graphs for validation and train on the remainder.
# NOTE: `random` is not seeded in this cell, so the split differs between runs.
idx_list = range(len(dataset))
subset_size = len(dataset) // 10
val_idx = random.sample(idx_list, subset_size)  # random validation indices
val_idx_set = set(val_idx)
train_idx = [i for i in idx_list if i not in val_idx_set]

# Pass the Subset objects themselves to the loaders. `Subset(...).dataset` is the
# full underlying dataset, which would make both loaders iterate over every graph.
# The validation loader uses `val_idx`, so the two splits are disjoint.
train_dataloader = DataLoader(Subset(dataset, train_idx), batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(Subset(dataset, val_idx), batch_size=batch_size, shuffle=False)

In [154]:
# Training hyper-parameters.
latent_dim, epochs = 128, 30
lr = 1e-4
batch_size = 256

# Pick the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch.cuda.current_device()
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [168]:
from src.models.graphVAE import GraphVAE
# Model instance that is handed to the Lightning trainer.
# NOTE(review): the first positional argument is presumably the latent size — confirm
# against the GraphVAE constructor.
gvae = GraphVAE(
    64,
    torch.optim.Adam,
    dict(lr=0.001),
    conv_hidden_dim=128,
)

In [169]:
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping
import pytorch_lightning as pl
# Optimiser class and its keyword arguments. They are only bound to names here;
# nothing in this cell passes them on to a model.
optimizer_param = dict(lr=0.001)
optimizer = torch.optim.Adam

# Lightning trainer: runs for `epochs` epochs, lets Lightning choose the
# accelerator and devices, and logs to TensorBoard under "logs/".
trainer = pl.Trainer(
    logger=TensorBoardLogger(save_dir="logs/"),
    devices="auto",
    accelerator="auto",
    max_epochs=epochs,
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [170]:
# Fit the model with the training loader and the validation loader (in that order).
# NOTE(review): the recorded run ends with a KeyboardInterrupt, so the saved outputs
# do not show a completed training run.
trainer.fit(gvae, train_dataloader, val_dataloader)


  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | conv1     | GCNConv | 2.8 K  | train
1 | conv2     | GCNConv | 33.0 K | train
2 | fc_mu     | Linear  | 16.4 K | train
3 | fc_logvar | Linear  | 16.4 K | train
4 | fc1_dec   | Linear  | 33.3 K | train
5 | fc3_dec   | Linear  | 5.4 M  | train
6 | relu      | ReLU    | 0      | train
7 | soft      | Softmax | 0      | train
----------------------------------------------
5.5 M     Trainable params
0         Non-trainable params
5.5 M     Total params
21.954    Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined