In [36]:
import hydra
from glob import  glob
from proteoscope.data import ProteoscopeDM
from proteoscope.modules import ProteoclipLM
import numpy as np
from omegaconf import OmegaConf


BASE_DIR = "/home/ec2-user/outputs/proteoclip/2023-09-20/01-06-01" # Best proteoscope - ESM-full
# BASE_DIR = "/home/ec2-user/outputs/proteoclip/2023-09-20/00-54-19" # Best proteoscope - ESM-full

# BASE_DIR = "/home/ec2-user/outputs-proteoscope/2023-08-04/16-34-38" # 25 x 25 cond latent
# BASE_DIR = "/home/ec2-user/outputs-proteoscope/2023-08-04/22-36-50" # 25 x 25 cond nuclei + latent
# BASE_DIR = "/home/ec2-user/outputs-proteoscope/2023-08-05/01-50-26" # 25 x 25 cond nuclei + latent

config_dir = BASE_DIR + "/.hydra"

with hydra.initialize_config_dir(config_dir=config_dir):
    config = hydra.compose(config_name="config", overrides=OmegaConf.load(config_dir + "/overrides.yaml"))

    chkpts = glob(BASE_DIR + "/checkpoints/*.ckpt")
    chkpts.sort()
    chkpt = chkpts[-1]
    print('   Using ', chkpt)


    pdm = ProteoscopeDM(
        images_path=config.data.images_path,
        labels_path=config.data.labels_path,
        trim=config.data.trim,
        sequences_path=config.data.sequences_path,
        batch_size=config.trainer.batch_size,
        num_workers=config.trainer.num_workers,
        sequence_embedding=config.data.sequence_embedding,
        splits=config.splits,
        sequence_dropout=config.data.sequence_dropout
    )
    pdm.setup()

    plm = ProteoclipLM.load_from_checkpoint(
        chkpt,
        module_config=config.module,
    )

    plm.eval()
    plm.cuda()

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with hydra.initialize_config_dir(config_dir=config_dir):


   Using  /home/ec2-user/outputs/proteoclip/2023-09-20/01-06-01/checkpoints/last.ckpt


  self.labels = pd.read_csv(self.labels_path, index_col=0)


loaded pretrained LPIPS loss from taming/modules/autoencoder/lpips/vgg.pth


## Generate samples

In [None]:
dl = pdm.test_dataloader()

In [1]:
from tqdm import  tqdm
import torch


true_labels = []
predicted_labels = []

for batch in tqdm(dl):
    batch['sequence_embed'] = batch['sequence_embed'].to('cuda')
    batch['sequence_mask'] = batch['sequence_mask'].to('cuda')    
    batch['truncation'] = batch['truncation'].to('cuda')    
    logits = plm(batch)
    prediction = torch.argmax(logits, -1)
    predicted_labels.append(prediction.detach().cpu().numpy())
    true_labels.append(batch['localization'].detach().cpu().numpy())
true_labels = np.concatenate(true_labels, axis=0)
predicted_labels = np.concatenate(predicted_labels, axis=0)

NameError: name 'dl' is not defined

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score


acc = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy {acc}')

cm = confusion_matrix(true_labels, predicted_labels)

In [None]:
import matplotlib.pyplot as plt

plt.imshow(cm / cm.sum(axis=1)[:, None])

## Run on OpenCell protein embeddings

In [None]:
# import pandas as pd

# seq_path = '/home/ec2-user/cytoself-data/sequences.csv'
# seq = pd.read_csv(seq_path, index_col=0)
# # seq['Length'] = seq["Peptide"].apply(lambda x: len(x.replace("*", "")))
# # seq.to_csv(seq_path)

In [38]:
from proteoscope.data import ProteolocDM


plds = ProteolocDM(
    labels_path='/home/ec2-user/cytoself-data/sequences.csv',
    sequences_path=None, #'/home/ec2-user/cytoself-data/esm2_t36_3B_UR50D.zarr', #None, '/home/ec2-user/cytoself-data/ESM_sequence_embeddings_full.zarr',
    batch_size=1,
    num_workers=0,
    sequence_embedding=None, #'ESM-full', # None, 'ESM-full',
)
plds.setup()

In [46]:
# plm.protein_projection = plm.protein_projection.half()

In [47]:
from tqdm import  tqdm
import torch


embeds = []
with torch.no_grad():
    for batch in tqdm(plds.predict_dataloader()):
        # batch['sequence_embed'] = batch['sequence_embed'].to('cuda')
        # batch['sequence_mask'] = batch['sequence_mask'].to('cuda')    
        batch['truncation'] = batch['truncation'].to('cuda')    
        seq_embeds = plm.embed(batch)
        embeds.append(seq_embeds.detach().cpu().numpy())

100%|██████████| 1311/1311 [01:47<00:00, 12.24it/s]


In [48]:
embeds = np.concatenate(embeds, axis=0)

### For protein embeddings

In [49]:
import zarr
import os

PROTEIN_EMBED_PATH = '/home/ec2-user/cytoself-data/ESM_sequence_embeddings_clip_lora.zarr'

z_embedding_prot = zarr.open(
    PROTEIN_EMBED_PATH,
    mode="w",
        shape=(len(embeds), config.module.model.projection_dims),
        chunks=(1, None),
    dtype="float32",
)

In [50]:
z_embedding_prot[:, :] = embeds

In [51]:
z_embedding_prot.shape

(1311, 1024)

### For Sequence Embeddings

In [None]:
import zarr
import os


# PROTEIN_EMBED_PATH = '/home/ec2-user/cytoself-data/ESM_sequence_embeddings_full_lora3.zarr'


z_embedding_prot = zarr.open(
    PROTEIN_EMBED_PATH,
    mode="w",
        shape=(len(embeds), config.module.model.truncation_seq_length + 1, config.module.model.d_model),
        chunks=(1, None, None),
    dtype="float32",
)

In [None]:
for i, zz in enumerate(embeds):
    ll = min(1024, len(zz[0]))
    z_embedding_prot[i, 1:1+ll, :] = zz[0][:ll]

## LoRA

In [1]:
from esm.esmfold.v1.esmfold import ESMFold

In [2]:
from peft import LoraConfig, TaskType
import esm
import torch

In [2]:
import sys
import pytorch_lightning

# Create a module for the old path
sys.modules['pytorch_lightning.utilities.seed'] = sys.modules['lightning_fabric.utilities.seed']


In [3]:
model = esm.pretrained.esmfold_v1()

In [6]:
sequence = "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG"

In [7]:
alphabet = esm.data.Alphabet.from_architecture("ESM-1b")

In [8]:
# model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()

In [8]:
converter = alphabet.get_batch_converter(1024)

In [9]:
model = model.cuda().half();

In [11]:
n = 4
a = ['a'] * n
b = ['M' * 1024] * n

result = list(zip(a, b))
labels, strs, toks = converter(result)
toks = toks.to('cuda')

In [12]:
res = model.esm(
    toks,
    repr_layers=range(model.esm.num_layers + 1),
)
esm_s = torch.stack(
    [v for _, v in sorted(res["representations"].items())], dim=2
)
# Drop BOS/EOS
esm_s = esm_s[:, 1:-1]  # B, L, nLayers,
esm_s = esm_s.to(model.esm_s_combine.dtype)
esm_s = esm_s.detach()

In [18]:
esm_sm = (model.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2)

s_s_0 = model.esm_s_mlp(esm_sm)

In [19]:
s_s_0.shape

torch.Size([4, 1024, 1024])

In [9]:
trainable_params = [p for p in model.parameters() if p.requires_grad]

In [10]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(trainable_params) 

In [11]:
optimizer.zero_grad()
out = model(toks, repr_layers=[33], return_contacts=False)
output = out['representations'][33]
loss = criterion(output, torch.zeros_like(output))
optimizer.step()

OutOfMemoryError: CUDA out of memory. Tried to allocate 162.00 MiB (GPU 0; 15.78 GiB total capacity; 14.53 GiB already allocated; 22.19 MiB free; 14.86 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

: 

In [None]:
# optimizer.zero_grad()  # Assuming you have defined an optimizer
# output = model(dummy_input)
# loss = criterion(output, dummy_target)
# loss.backward()
# optimizer.step()  # Opt

In [None]:
with torch.cuda.amp.autocast():


In [None]:
list(model.parameters())[0].numel()

In [None]:
trainable = 0
not_trainable = 0
for p in model.parameters():
    if p.requires_grad:
        trainable += p.numel()
    else:
        not_trainable += p.numel()
print(trainable, not_trainable, trainable / (trainable + not_trainable) * 100)

In [6]:
peft_config = LoraConfig(target_modules=["k_proj", "v_proj", "q_proj", "out_proj"], inference_mode=False, r=4, lora_alpha=4, lora_dropout=0.1)

In [7]:
from peft import get_peft_model

In [8]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,351,680 || all params: 652,394,934 || trainable%: 0.20718738444403678


In [None]:
trainable = 0
not_trainable = 0
for p in model.parameters():
    if p.requires_grad:
        trainable += p.numel()
    else:
        not_trainable += p.numel()
print(trainable, not_trainable, trainable / (trainable + not_trainable) * 100)

In [None]:
print(model)

In [None]:
import pandas as pd

data_path = '/home/ec2-user/cytoself-data/labels.csv'
data_path2 = '/home/ec2-user/cytoself-data/sequences.csv'
df = pd.read_csv(data_path, index_col=0)
df2 = pd.read_csv(data_path2, index_col=0)

In [None]:
df2['loc'] = df2['localization'].astype('category').cat.codes

In [None]:
df2.to_csv(data_path2)