# SMILES embeddings precomputation

In this notebook, we precompute the embeddings for all of the SMILES strings in our train and test sets.

## Loading ChemBERTa

In [None]:
from transformers import RobertaModel, RobertaTokenizer
import pandas as pd
from tqdm import tqdm
import torch
import numpy as np
import joblib
import os

# Work from the data root so the relative paths used in later cells resolve.
os.chdir('/home/python/data')
# Flag read by the Hugging Face tokenizers library (presumably keeps its
# internal parallelism switched on -- confirm against the tokenizers docs).
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# The encoder weights and the tokenizer are loaded from the same pretrained
# checkpoint, so the identifier is written once.
checkpoint = "seyonec/PubChem10M_SMILES_BPE_450k"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)
model = model.to(device)

## Generating SMILES Embeddings

In [None]:
from torch.utils.data import TensorDataset, DataLoader

def compute_embeddings(smiles, batch_size=24, max_length=256, num_workers=2):
    """Compute model embeddings for a list of SMILES strings.

    Every SMILES is tokenized with the notebook-level ``tokenizer``, padded or
    truncated to ``max_length`` tokens, and run through the notebook-level
    ``model`` in batches on ``device``.

    Args:
        smiles: List of SMILES strings to embed.
        batch_size: Number of SMILES per forward pass.
        max_length: Token length every SMILES is padded/truncated to.
        num_workers: Worker processes used by the ``DataLoader``.

    Returns:
        numpy.ndarray: The model's ``last_hidden_state`` for all inputs,
        concatenated along axis 0 in the same order as ``smiles``. Padding
        positions are not removed. (Expected shape is
        ``(len(smiles), max_length, hidden_size)`` per the Hugging Face
        model docs -- confirm for the loaded checkpoint.)
    """
    encoded = tokenizer(
        smiles,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    dataset = TensorDataset(encoded.input_ids, encoded.attention_mask)
    # shuffle=False keeps the output rows aligned with the input order.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

    # Switch to inference behaviour once, instead of on every batch.
    model.eval()

    embeddings = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(dataloader):
            # Only the final layer is kept, so the per-layer hidden states
            # are not requested from the model.
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            # Move each batch off the device right away so results
            # accumulate in host memory rather than on the GPU.
            embeddings.append(outputs.last_hidden_state.cpu().numpy())

    return np.concatenate(embeddings, axis=0)

In [None]:
# Load the table that contains the 'smiles' column and preview its first rows.
drp_csv_path = './data/drp.csv'
df = pd.read_csv(drp_csv_path)
df.head()

In [None]:
# De-duplicate the SMILES column so each distinct string is processed once.
unique_smiles = np.unique(df['smiles'].values)
smiles = list(unique_smiles)
print(f"Number of unique smiles: {len(smiles)}")

In [None]:
# Embed every unique SMILES with the compute_embeddings helper defined above.
smiles_embeddings = compute_embeddings(smiles)

In [None]:
# Iterating the embeddings array walks its first axis, so zip pairs each
# SMILES string with the entry at the same position.
compound_embeddings_dict = dict(zip(smiles, smiles_embeddings))
# Sanity check: shape of a single stored embedding.
compound_embeddings_dict[smiles[0]].shape

In [None]:
# Persist the SMILES -> embedding mapping for later use.
embeddings_path = os.path.join("data", "smiles_embeddings_dict.joblib")
joblib.dump(compound_embeddings_dict, embeddings_path)

In [None]:
# Round-trip check: reload the saved file and inspect one entry's shape.
# The loaded dict is intentionally not bound to a name so it can be freed.
saved_embeddings_path = os.path.join("data", "smiles_embeddings_dict.joblib")
joblib.load(saved_embeddings_path)[smiles[0]].shape

## Generating SMILES Tokens as Embeddings

In [None]:
# Reload the source table so this section can be run on its own.
drp_csv_path = './data/drp.csv'
df = pd.read_csv(drp_csv_path)
df.head()

In [None]:
# De-duplicate the SMILES column so each distinct string is tokenized once.
unique_smiles = np.unique(df['smiles'].values)
smiles = list(unique_smiles)
print(f"Number of unique smiles: {len(smiles)}")

In [None]:
# Tokenize all SMILES to fixed-length NumPy arrays and keep only the token ids.
encoded_smiles = tokenizer(
    smiles,
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors="np",
)
smiles_tokens_ids = encoded_smiles['input_ids']
smiles_tokens_ids.shape

In [None]:
# Map each SMILES to its row of token ids. The variable name is shared with
# the embeddings section, but here the values are token-id arrays.
compound_embeddings_dict = dict(zip(smiles, smiles_tokens_ids))
# Sanity check: shape of a single stored token-id array.
compound_embeddings_dict[smiles[0]].shape

In [None]:
# Persist the SMILES -> token-id mapping for later use.
tokens_path = os.path.join("data", "smiles_tokens_embeddings_dict.joblib")
joblib.dump(compound_embeddings_dict, tokens_path)

In [None]:
# Round-trip check: reload the saved file and inspect one entry's shape.
# The loaded dict is intentionally not bound to a name so it can be freed.
saved_tokens_path = os.path.join("data", "smiles_tokens_embeddings_dict.joblib")
joblib.load(saved_tokens_path)[smiles[0]].shape