In [1]:
import sys
sys.path.append('/workspace/')

import sqlite3
import mols2grid
import importlib
import pickle
import itertools
import concurrent
import pandas as pd

import numpy as np
import pandas as pd

import cuml
import cupy as cp

from functools import partial
from subprocess import run
from rdkit import Chem
from rdkit.Chem import Draw, QED, Descriptors, Lipinski, rdDistGeom, rdmolfiles

from flow.pipeline.screening.pose_generate import score_molecule, generate_conformers
from flow.utils.megamolbart import smiles_to_embedding, embedding_to_smiles, sample

import warnings
# Suppress everything issued through the `warnings` module for the rest of the
# session so cell output stays compact.
# NOTE(review): this is a blanket filter; it also hides deprecation and
# numerical warnings that may matter. Consider narrowing it by category/module.
warnings.filterwarnings('ignore')

# Quiet RDKit: raise the RDKit logger threshold to ERROR and switch off the
# 'rdApp.error' log channel, so failed SMILES parses do not flood the output.
import rdkit.rdBase as rkrb
import rdkit.RDLogger as rkl

log = rkl.logger()
log.setLevel(rkl.ERROR)
rkrb.DisableLog('rdApp.error')

In [2]:
# Root directory of this run; the input paths below are resolved relative to it.
workspace = '/content/28b0e566-a15a-11ec-83ca-7de881115940'

# Files expected under <workspace>/inputs.
receptor_file = workspace + '/inputs/rec.pdbqt'
score_config = workspace + '/inputs/config'

#### Please start the MegaMolBART service before running the cells below
```
docker-compose --env-file .env\
                -f support/docker/megamolbart/docker-compose.yml\
                up  --scale megamolbart=2
```

In [3]:
%%time
# Counter initialised here; nothing in this cell reads it.
generation = 0

# Open the SQLite database file. The connection is kept open in `conn`.
db_url = '/data/chembl.db'
conn = sqlite3.connect(db_url, uri=True)

# Pull 100 randomly ordered SMILES strings into a one-column frame ('smiles').
# ORDER BY random() means every execution returns a different set of rows.
df = pd.read_sql(
    sql='''
    SELECT canonical_smiles as smiles
    FROM compound_structures 
    ORDER BY random()
    LIMIT 100
    ''',
    con=conn,
)

CPU times: user 324 ms, sys: 15.3 ms, total: 339 ms
Wall time: 338 ms


In [4]:
%%time

# Upper bound on sampling attempts per input molecule. Rejected samples are
# retried, so without a cap a molecule whose samples are always rejected would
# keep the loop spinning forever.
MAX_SAMPLE_ATTEMPTS = 20

# Index-aligned result lists: entry i of the x0_*/y0_* lists and entry i of the
# x1_*/y1_* lists belong to the same accepted (input, sample) pair.
x0_smis = []  # SMILES reported for the input record
x0_dims = []  # embedding dimensions as a plain list
x0_embs = []  # embedding as a cupy array reshaped to those dimensions
y0_scrs = []  # RDKit molecular weight of the input molecule

x1_smis = []  # same four fields, for the sampled record
x1_dims = []
x1_embs = []
y1_scrs = []

for smi in df['smiles'].tolist():
    embs = None
    for _attempt in range(MAX_SAMPLE_ATTEMPTS):
        try:
            embs = sample(smi,
                          num_sample=1,
                          padding_size=512,
                          service_port='localhost:50052')
        except Exception as ex:
            # Best effort: report the failure and move on to the next molecule.
            print(ex)
            break

        # The first returned record is used as the input, the second as the sample.
        ip = embs[0]
        op = embs[1]

        ip_emb = ip['embedding']
        op_emb = op['embedding']

        # MolFromSmiles returns None when a SMILES string cannot be parsed.
        # Both molecules are checked so MolWt is never called on None.
        ip_mol = Chem.MolFromSmiles(ip['smiles'])
        op_mol = Chem.MolFromSmiles(op['smiles'])
        if ip_mol is None or op_mol is None or ip['smiles'] == op['smiles']:
            # Unparseable, or the sample is identical to the input: try again.
            continue

        # The embedding arrives flat; restore the shape given by its `dim` field.
        x0_embs.append(cp.reshape(cp.array(ip_emb.embedding), ip_emb.dim))
        x0_dims.append(list(ip_emb.dim))
        x0_smis.append(ip['smiles'])
        y0_scrs.append(Descriptors.MolWt(ip_mol))

        x1_embs.append(cp.reshape(cp.array(op_emb.embedding), op_emb.dim))
        x1_dims.append(list(op_emb.dim))
        x1_smis.append(op['smiles'])
        y1_scrs.append(Descriptors.MolWt(op_mol))
        break
    else:
        # Reached only when every attempt was rejected (no break occurred).
        print(f'No usable sample for {smi} after {MAX_SAMPLE_ATTEMPTS} attempts; skipping')

CPU times: user 12.2 s, sys: 2.11 s, total: 14.3 s
Wall time: 11min 38s


In [5]:
# y1_scrs[:5], len(y1_scrs)

Filter the matching input and sample pairs.

In [6]:
# Bundle the index-aligned input/sample lists into one mapping keyed by name.
data = {
    'x0_smis': x0_smis,
    'x0_dims': x0_dims,
    'x0_embs': x0_embs,
    'y0_scrs': y0_scrs,
    'x1_smis': x1_smis,
    'x1_dims': x1_dims,
    'x1_embs': x1_embs,
    'y1_scrs': y1_scrs,
}

# Write the bundle to disk; 'wb' mode creates the file or overwrites an existing one.
with open('/workspace/test_data.pkl', 'wb') as file:
    pickle.dump(data, file)

In [7]:
# Reload the previously saved bundle from disk into `data`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that this notebook (or another trusted source) wrote.
with open('/workspace/test_data.pkl', 'rb') as file:
    # The file is opened read-only in binary mode; nothing is created here.
    data = pickle.load(file)

In [8]:
# data