# SELFIES embeddings precomputation

In this notebook, we precompute the SELFormer embeddings of the SELFIES strings of all the compounds in our train and test sets.

## Loading SELFormer

In [None]:
import os

# Both variables are set before `transformers` is imported further down.
# NOTE(review): TOKENIZERS_PARALLELISM is presumably read by the Hugging Face
# tokenizers backend; confirm it has any effect with the tokenizer class used
# in this notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# NOTE(review): WANDB_DISABLED presumably switches off Weights & Biases
# logging; nothing in this notebook logs to it explicitly.
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import joblib
import json

from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from torch.utils.data import TensorDataset, DataLoader
import torch

from tqdm import tqdm

# Run on the first CUDA GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device : {device}")

# Every relative path used below is resolved from this working directory.
os.chdir("/home/python/")

In [None]:
# Path of the pre-trained model checkpoint.
model_name = "./SELFormer/data/pretrained_models/modelO"

# The tokenizer files live in their own directory, separate from the checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("./SELFormer/data/RobertaFastTokenizer")

# Start from the checkpoint's own configuration and additionally request the
# hidden states in the model output.
config = RobertaConfig.from_pretrained(model_name)
config.output_hidden_states = True

# Instantiate the encoder with that configuration, then move it to `device`.
model = RobertaModel.from_pretrained(model_name, config=config)
model = model.to(device)

## Generating SELFIES embeddings

In [None]:
def compute_embeddings(selfies, *, max_length=256, batch_size=24, num_workers=2):
    """Compute token-level encoder embeddings for a list of SELFIES strings.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.
    Every string is tokenized, padded/truncated to ``max_length`` tokens and
    passed through the model in inference mode; the last hidden state of
    every token position (padding included) is kept.

    Args:
        selfies: Sequence of SELFIES strings to embed.
        max_length: Number of tokens each input is padded or truncated to.
        batch_size: Number of inputs sent through the model per forward pass.
        num_workers: Worker processes used by the ``DataLoader``.

    Returns:
        A NumPy array holding the concatenated ``last_hidden_state`` of all
        batches, in the same order as ``selfies``. The first axis has one
        entry per input string.
    """
    # Nothing to tokenize: return an empty array instead of failing later in
    # the tokenizer or in np.concatenate.
    # NOTE(review): float32 is assumed here as the model's output dtype.
    if len(selfies) == 0:
        return np.empty((0, max_length, model.config.hidden_size), dtype=np.float32)

    selfies_tokens = tokenizer(
        selfies,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    dataset = TensorDataset(selfies_tokens.input_ids, selfies_tokens.attention_mask)
    # shuffle=False keeps the output rows aligned with the input order.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

    # Switching to evaluation mode is loop-invariant, so do it once up front.
    model.eval()

    embeddings = []
    with torch.no_grad():
        for b_input_ids, b_input_mask in tqdm(dataloader):
            outputs = model(
                input_ids=b_input_ids.to(device),
                attention_mask=b_input_mask.to(device),
            )
            # Move each batch off the device immediately so only one batch of
            # activations has to live there at a time.
            embeddings.append(outputs.last_hidden_state.cpu().numpy())

    return np.concatenate(embeddings, axis=0)

In [None]:
# Load the dataset; the path is relative to the working directory set earlier.
# The "selfies" column of this table is the one used in the following cells.
df = pd.read_csv("./data/data/drp.csv") # path of the selfies data
# Preview the first rows (displayed as the cell output).
df.head()

In [None]:
# De-duplicate the SELFIES column and keep the distinct strings in sorted
# order, so each compound is embedded exactly once.
selfies = sorted(set(df["selfies"]))
# Number of distinct SELFIES strings (displayed as the cell output).
len(selfies)

In [None]:
# Embed every distinct SELFIES string; the rows of the result follow the
# order of `selfies`.
selfies_embeddings = compute_embeddings(selfies)

In [None]:
# Pair each SELFIES string with its row of the embeddings array. Iterating
# the array walks its first axis, i.e. one entry per input string.
compound_embeddings_dict = dict(zip(selfies, selfies_embeddings))
# Shape of a single compound's embedding (displayed as the cell output).
compound_embeddings_dict[selfies[0]].shape

In [None]:
# Persist the SELFIES -> embedding mapping next to the input data.
joblib.dump(compound_embeddings_dict, os.path.join("./data", "data", "selfies_embeddings_dict.joblib"))

In [None]:
# Sanity check: reload the file that was just written and display the shape
# of the first compound's embedding. The reloaded dict is not bound to a
# name, so it is not kept around after this expression.
joblib.load(os.path.join("./data", "data", "selfies_embeddings_dict.joblib"))[selfies[0]].shape