In [None]:
import sys
sys.path.append('../../')

import os
import re
import math
import grpc
import sqlite3
import numpy as np
import pandas as pd
import cuml
import cupy as cp
import cudf as cd
from importlib import reload

from sklearn.linear_model import Ridge
from subprocess import run

from rdkit import Chem
from rdkit.Chem import Draw, QED, Descriptors, Lipinski, rdDistGeom, rdmolfiles
from flow.utils.megamolbart import sample, interpolate, smiles_to_embedding
from generativesampler_pb2 import GenerativeSpec, GenerativeModel, EmbeddingList
from generativesampler_pb2_grpc import GenerativeSamplerStub

In [None]:
%%time
db_file = '/content/6e6e2b68-9ef8-11ec-83ca-7de881115940/common.sqlite3'

# Load every generated molecule that already has a score. The connection is
# closed as soon as the frame is materialised so no handle is left open.
conn = sqlite3.connect(db_file, uri=True)
try:
    df = pd.read_sql(
        '''
        SELECT generated_smiles.smiles,
               generated_smiles.embedding,
               generated_smiles.embedding_dim,
               generated_smiles.score
        FROM generated_smiles
        WHERE score is not null
        ''',
        con=conn)
finally:
    conn.close()

# The cells below read `orig_embs`, `embs` and `scores`; they must be built
# here so the notebook survives Restart & Run All.
# NOTE(review): this re-embeds every SMILES through `smiles_to_embedding`
# instead of using the `embedding` column fetched above -- confirm that is the
# intended source.
orig_embs = []  # embedding objects as returned by `smiles_to_embedding`
embs = []       # one list of embedding values per molecule
dims = []       # one list of dimensions per molecule
for smiles in df['smiles'].tolist():
    emb = smiles_to_embedding(smiles)
    orig_embs.append(emb)
    embs.append(list(emb.embedding))
    dims.append(list(emb.dim))

scores = cp.array(df['score'])
df

In [None]:
# Bring the scores and embeddings to host memory as NumPy arrays.
y, X = cp.asnumpy(scores), cp.asnumpy(embs)

# Ridge regression (alpha=1.0) of score on embedding; `fit` returns the
# estimator itself, so it can be bound directly.
clf = Ridge(alpha=1.0).fit(X, y)

In [None]:
# Pick the molecule with the lowest score and keep its embedding object
# together with the `dim` and `pad_mask` fields needed to decode later.
min_idx = np.argmin(y)
orig_emb = orig_embs[min_idx]

dim, mask = orig_emb.dim, orig_emb.pad_mask

In [None]:
# Regression coefficients of the fitted model, used as the search direction.
direction = clf.coef_
# Standard deviation of the embeddings along axis 0 (across molecules).
emb_std = np.asarray(embs).std(axis=0)

In [None]:
# Positions of the lowest- and highest-scoring molecules.
min_idx, max_idx = np.argmin(y), np.argmax(y)

emb_min, emb_max = np.array(embs[min_idx]), np.array(embs[max_idx])

# Euclidean distance between the two embeddings, divided by the square root
# of the length of their first axis.
diff = np.linalg.norm(emb_max - emb_min) / math.sqrt(emb_max.shape[0])

In [None]:
# Draw one Gaussian sample per element: mean `direction`, std `emb_std`.
# The draw is unseeded, so it differs from run to run.
direction_sampled = np.random.normal(direction, emb_std, emb_std.shape)

In [None]:
# Display the scaled distance between the highest- and lowest-scoring
# embeddings that was computed in the previous cell.
diff

In [None]:
# Scale the sampled direction by the embedding distance; the literal 1 is the
# step-size multiplier.
step = direction_sampled * float(1 * diff)

In [None]:
# Candidate embedding: start at the lowest-scoring embedding and move
# against the sampled direction (note the subtraction).
next_emb = emb_min - step

In [None]:
# Inspect the step vector next to the sampled direction it was scaled from.
step, direction_sampled

In [None]:
# Debug check: show the Python type of `next_emb` before it is decoded.
type(next_emb)

In [None]:
def embedding_to_smiles(emb, dim, mask, service_port='localhost:50051'):
    """Decode an embedding into SMILES through the generative-sampler gRPC service.

    Args:
        emb: Embedding values, sent as ``EmbeddingList.embedding``.
        dim: Embedding dimensions, sent as ``EmbeddingList.dim``.
        mask: Padding mask, sent as ``EmbeddingList.pad_mask``.
        service_port: ``host:port`` address of the service. Defaults to
            ``localhost:50051``, the address that used to be hard-coded.

    Returns:
        The response returned by the service's ``EmbeddingToSmiles`` call.

    Note:
        Errors raised by the gRPC call are not handled here and propagate to
        the caller.
    """
    spec = EmbeddingList(embedding=emb,
                         dim=dim,
                         pad_mask=mask)
    # The channel is opened per call and closed when the block exits.
    with grpc.insecure_channel(service_port) as channel:
        stub = GenerativeSamplerStub(channel)
        return stub.EmbeddingToSmiles(spec)
# Decode the stepped embedding once, without jitter, and display the response.
first_smiles = embedding_to_smiles(next_emb, dim, mask)
first_smiles

In [None]:
def add_jitter(embedding, radius, cnt, shape):
    """Return ``cnt`` noisy copies of ``embedding``.

    Each copy is ``embedding`` plus independent Gaussian noise with mean 0 and
    standard deviation ``radius``, drawn with the same shape as ``embedding``.

    Args:
        embedding: NumPy array to perturb; it is not modified.
        radius: Standard deviation of the added noise.
        cnt: Number of perturbed copies to produce.
        shape: Accepted for call compatibility but not used.

    Returns:
        A list of ``cnt`` arrays.
    """
    return [
        np.random.normal(0, radius, embedding.shape) + embedding
        for _ in range(cnt)
    ]

# Sample 10 embeddings around `next_emb` (noise std 0.2) and decode each one.
jittered_embs = add_jitter(next_emb, 0.2, 10, None)

# `mols` and `gsmiles` are kept index-aligned and de-duplicated in first-seen
# order, so `mols[i]` is always the parsed molecule of `gsmiles[i]`.
mols = []
gsmiles = []
for jittered_emb in jittered_embs:
    m_gsmiles = embedding_to_smiles(jittered_emb, dim, mask)
    try:
        candidate = m_gsmiles.generatedSmiles[0]
        mol = Chem.MolFromSmiles(candidate)
    except Exception as ex:
        # Best effort: report the failure instead of hiding it, then move on.
        print(f'Skipping jittered embedding: {ex!r}')
        continue

    if mol is None:
        print(f'Skipping unparsable SMILES: {candidate}')
        continue

    print(m_gsmiles.generatedSmiles)
    if candidate not in gsmiles:
        mols.append(mol)
        gsmiles.append(candidate)

len(gsmiles)

In [None]:
gsmiles = list(gsmiles)
idx = 0

# Build the depiction from the printed SMILES itself rather than from
# `mols[idx]`, so the picture always matches the text no matter how
# `gsmiles` was ordered or de-duplicated.
if idx < len(gsmiles):
    print(gsmiles[idx])
    selected_mol = Chem.MolFromSmiles(gsmiles[idx])
else:
    print('No valid SMILES were generated')
    selected_mol = None
selected_mol

In [None]:
generated_smiles = gsmiles
smiles_id = 0
# Use half of the available cores for docking. `os.cpu_count()` may return
# None, and halving a single core gives 0, so fall back to at least one.
cpu_cnt = max(1, (os.cpu_count() or 1) // 2)

# Working directory for conformer files, prepared ligands and docking output.
artifact_dir = '/tmp/artifact_dir'
os.makedirs(artifact_dir, exist_ok=True)

# Conformer-generation settings (ETKDGv2).
num_conformers = 10
params = rdDistGeom.ETKDGv2()
params.pruneRmsThresh = 0.01   # RMS threshold used to prune near-duplicate conformers
params.randomSeed = 42         # fixed seed for the embedding
params.numThreads = 0          # 0 = let RDKit use the maximum number of threads
params.clearConfs = True
params.maxIterations = 1000

In [None]:
# External tools and inputs used by the docking step.
MGLTOOLS_PYTHON = '/home/rilango/.conda/envs/mgltools/bin/python2'
PREPARE_LIGAND = '/home/rilango/.conda/envs/mgltools/bin/prepare_ligand4.py'
VINA_INPUT_DIR = '/raid/drugdiscovery/cheminformatics/vs_screening/5871e7e8-8853-11ec-aa14-c7359b90650f/inputs'
VINA_RECEPTOR = f'{VINA_INPUT_DIR}/6y2g_clean.pdbqt'
VINA_CONFIG = f'{VINA_INPUT_DIR}/config'


def _best_vina_result(pdbqt_text):
    """Return ``(score, model_number)`` of the lowest-scoring Vina model.

    Scores are taken from the fourth whitespace-separated token of every
    ``REMARK VINA RESULT`` line; models are numbered from 1 in file order and
    the first model wins ties. Returns ``(None, None)`` when the text
    contains no result line.
    """
    best_score, best_model = None, None
    result_lines = re.findall(r'REMARK VINA RESULT.*', pdbqt_text)
    for model_no, line in enumerate(result_lines, start=1):
        score = float(line.split()[3])
        if best_score is None or score < best_score:
            best_score, best_model = score, model_no
    return best_score, best_model


# Ids are derived from the position in `generated_smiles` (starting at 1), so
# re-running this cell produces the same file names every time.
for smiles_id, smiles in enumerate(generated_smiles, start=1):
    try:
        mol = Chem.MolFromSmiles(smiles)
        mol = Chem.AddHs(mol, addCoords=True)
    except Exception as e:
        # Best effort: an unparsable SMILES is reported and skipped.
        print(f'Skipping {smiles_id} ({smiles}): {e!r}')
        continue

    conformers = rdDistGeom.EmbedMultipleConfs(mol, numConfs=num_conformers, params=params)
    for cid in conformers:
        conformer_file = f'{artifact_dir}/{smiles_id}_{cid}.pdb'
        ligand_file = f'{conformer_file}qt'
        rdmolfiles.MolToPDBFile(mol, conformer_file, confId=cid)

        # Convert the conformer to PDBQT; run from the artifact directory.
        prep = run([MGLTOOLS_PYTHON, PREPARE_LIGAND,
                    '-l', conformer_file,
                    '-o', ligand_file,
                    '-A', 'bonds_hydrogens'],
                   cwd=artifact_dir, capture_output=True, text=True)
        if prep.returncode != 0:
            print(f'prepare_ligand4 failed for {smiles_id}_{cid}: {prep.stderr.strip()}')
            continue

        out_file = f'{artifact_dir}/{smiles_id}_{cid}_vina.pdbqt'
        log_file = f'{artifact_dir}/{smiles_id}_{cid}.log'
        docking = run(['vina',
                       '--receptor', VINA_RECEPTOR,
                       '--ligand', ligand_file,
                       '--out', out_file,
                       '--log', log_file,
                       '--cpu', str(cpu_cnt),
                       '--config', VINA_CONFIG],
                      capture_output=True, text=True)
        if docking.returncode != 0:
            # Do not fall through: `out_file` may be missing or left over
            # from an earlier run.
            print(f'vina failed for {smiles_id}_{cid}: {docking.stderr.strip()}')
            continue

        with open(out_file, 'r') as fh:
            min_score, score_model = _best_vina_result(fh.read())

        if min_score is None:
            print(f'No VINA RESULT found for {smiles_id}_{cid}')
            continue

        print(f'Score for {smiles_id}_{cid} is {min_score} from model {score_model}')