# Tutorial - Step 4: Molecular translation using MTMR

## 1. Import requirements

In [1]:
import os
import pandas as pd
import time
import tqdm
import torch
from torch.utils.data import DataLoader
from rdkit.Chem.rdmolfiles import MolFromSmiles

In [2]:
from MTMR.dataset import ValidationSmilesDataset
from MTMR.vae import SmilesAutoencoder

## 2. Configure GPU (if available)

In [3]:
# Use the first CUDA device when PyTorch reports one; otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


## 3. Specify a target property

In [4]:
# Name of the target property; it is used below to select the input data
# directory, the checkpoint directory, and the output sub-directory / file name.
PROPERTY_NAME = "qed"

## 4. Set directories (for inputs and outputs)

In [5]:
# Input data for the selected property: DATA/<PROPERTY_NAME>
input_data_dir = os.path.join("DATA", PROPERTY_NAME)
# Checkpoint directory for the selected property
# (presumably written by Tutorial 2, MTMR fine-tuning -- confirm it exists before running).
input_ckpt_dir = os.path.join("outputs_Tutorial_2_MTMR_finetuning", PROPERTY_NAME)

In [6]:
# Output location: outputs_Tutorial_4_MTMR_translation/<PROPERTY_NAME>
_output_dir = "outputs_Tutorial_4_MTMR_translation"
output_dir = os.path.join(_output_dir, PROPERTY_NAME)

# os.makedirs creates the missing parent directory as well, and exist_ok=True
# makes re-running the cell a no-op. This replaces the two check-then-mkdir
# pairs, which could fail if the directory appeared between check and creation.
os.makedirs(output_dir, exist_ok=True)

## 5. Set file names

In [7]:
# Test-set file inside the property's input data directory.
filepath_test = os.path.join(input_data_dir, "rdkit_test.txt")

In [8]:
# Files read from the checkpoint directory (input_ckpt_dir):
# model weights, model configuration, and the character-to-index table.
filepath_pretrain_ckpt     = os.path.join(input_ckpt_dir, "checkpoints.pt")
filepath_pretrain_configs  = os.path.join(input_ckpt_dir, "configs.csv")
filepath_pretrain_char2idx = os.path.join(input_ckpt_dir, "char2idx.csv")

In [9]:
# Result file: <output_dir>/MTMR_<PROPERTY_NAME in upper case>.csv (e.g. MTMR_QED.csv).
filepath_output = os.path.join(output_dir, f"MTMR_{PROPERTY_NAME.upper()}.csv")

## 6. Load datasets (for test)

In [10]:
# Build the test dataset from the test file and the char2idx table stored next
# to the checkpoint. `device` is forwarded to the dataset
# (presumably so its tensors are created on that device -- confirm in MTMR.dataset).
dataset_test = ValidationSmilesDataset(filepath_test, filepath_pretrain_char2idx, device=device)

## 7. Load a pretrained generator of MTMR

In [11]:
## Model configuration
## Every architecture field is passed as None together with the path of the
## saved configuration file (presumably SmilesAutoencoder fills the None
## fields from that file -- confirm in MTMR.vae).
_unset_fields = ("hidden_size", "latent_size", "num_layers", "vocab_size",
                 "sos_idx", "eos_idx", "pad_idx")
model_configs = dict.fromkeys(_unset_fields)  # each key maps to None
model_configs.update(device=device,
                     filepath_config=filepath_pretrain_configs)

## Model initialization
generator = SmilesAutoencoder(**model_configs)

## Load the saved weights from the checkpoint file
generator.load_model(filepath_pretrain_ckpt)

## 8. Perform molecular translation on the Test dataset

In [12]:
K = 20  # maximum number of translation attempts per source molecule

# One (source SMILES, translated SMILES) pair per test molecule;
# the translated entry is "" when none of the K attempts was accepted.
generated = []

# batch_size=1 is required: the loop below reads element [0] of every batch.
test_loader = DataLoader(dataset_test, batch_size=1, shuffle=False, drop_last=False, pin_memory=use_cuda)

# Translation is inference only, so autograd bookkeeping is disabled
# (harmless if generator.predict already does this internally).
with torch.no_grad():
    for batch in tqdm.tqdm(test_loader):
        batch_smiles = dataset_test.encode(batch["smiles_s"], batch["length_s"].max())
        batch_length = batch["length_s"]
        # Source string without its first and last character
        # (presumably the start/end tokens added by the dataset -- confirm).
        source = batch["smiles_s"][0][1:-1]

        ## translation: retry up to K times until a usable molecule is decoded
        for _ in range(K):
            seq = generator.predict(batch_smiles, batch_length)
            smi = dataset_test.decode(seq)[0]  # assumption: batch_size=1
            # RDKit parses "" into an empty molecule instead of returning None,
            # so an empty decode must be rejected explicitly; otherwise it
            # would end the retries early with a blank result.
            if smi and MolFromSmiles(smi) is not None:
                generated.append((source, smi))
                break
        else:
            # No attempt produced a parsable, non-empty SMILES.
            generated.append((source, ""))

df_generated = pd.DataFrame.from_records(generated)
df_generated.head()

100%|██████████| 800/800 [00:14<00:00, 54.04it/s]


Unnamed: 0,0,1
0,CC(=O)NCCNC(=O)C1=C(C2CC2)N(C2=CC=C(C)C(Cl)=C2...,CC(=O)NCC1CCN(C(=O)C2=CC=C(Cl)C(C)=C2)N=C1
1,CC(C(=O)C1=C2C=CC=CC2=[NH+]C1)[NH+]1CCCC1C1CC=CS1,CC(C(=O)NC1=CC=CC=C1)N1CCCC1C1CCCO1
2,CCN(CC1CCOC1)C(=O)C1=CC=NC(Cl)=C1,CCN(CC1CCOCC1)C(=O)NC1=CC=CC(Cl)=C1
3,CC1=CC=CC=C1CS(=O)CCCC1=CC=CC=C1,CC1=CC=CC=C1CC(=O)NCC1=CC=CC=C1
4,CSCC(=O)NNC(=O)C1=C(O)C=C(Cl)C=C1Cl,CSCC(=O)NC1=C(Cl)C=C(Cl)C=C1Cl


## 9. Save the results

In [13]:
# Write the (source, translated) pairs to filepath_output as plain CSV rows:
# no header line (header=None) and no index column (index=False).
df_generated.to_csv(filepath_output, header=None, index=False)