In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
import tqdm

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [3]:
# Put the parent of the working directory at the front of the import path
# (unless it is already listed) so the local MTMR package can be imported.
MTMR_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if MTMR_PATH not in sys.path:
    sys.path = [MTMR_PATH, *sys.path]

In [4]:
from MTMR.properties import gsk3, jnk3, drd2, qed, penalized_logp, similarity, get_kekuleSmiles

In [5]:
# Name of the property being processed. It is reused below as the output
# directory name and as the name of the score column.
PROPERTY_NAME = "jnk3"
# Scorer object; later cells call it with a kekulized SMILES string to get a score.
SCORING_FT = jnk3()

In [15]:
# Create the output directory for this property.
# `exist_ok=True` makes the cell safe to re-run and removes the check-then-create
# race of `exists()` followed by `mkdir()`. If the path exists but is a regular
# file, this now raises instead of silently continuing.
os.makedirs(PROPERTY_NAME, exist_ok=True)

In [6]:
# Load the raw ZINC table; the path is relative to the current working directory.
# Only the "smiles" column is used by the cells below.
filepath_zinc250k = "250k_rndm_zinc_drugs_clean_3.csv"
df_zinc_raw = pd.read_csv(filepath_zinc250k)

In [7]:
# Preview the first rows of the raw table.
df_zinc_raw.head()

Unnamed: 0,smiles,logP,qed,SAS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1\n,5.0506,0.702012,2.084095
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1\n,3.1137,0.928975,3.432004
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682,2.470633
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944,2.822753
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027,4.035182


# Property calculation

In [8]:
# Score every molecule and keep only those with a strictly positive score.
# Each SMILES is first converted with `get_kekuleSmiles`; the converted string
# is what gets scored and stored.
records = []
# Iterate the column directly rather than doing a label lookup per row.
for smi in tqdm.tqdm(df_zinc_raw["smiles"]):
    smi_kek = get_kekuleSmiles(smi)
    prop = SCORING_FT(smi_kek)
    if prop > 0:
        records.append((smi_kek, prop))

# Naming the columns at construction keeps the schema intact even when no
# molecule passes the filter (renaming afterwards leaves an empty frame with
# no columns, which breaks the column lookups in later cells).
df_zinc = pd.DataFrame.from_records(records, columns=["smiles", PROPERTY_NAME])
df_zinc.head()

100%|██████████| 249455/249455 [7:11:14<00:00,  9.64it/s]  


Unnamed: 0,smiles,jnk3
0,CC(C)(C)C1=CC=C2OC=C(CC(=O)NC3=CC=CC=C3F)C2=C1,0.01
1,CC1CC(C)CC(NC2=CN=CC(C3=NN=CN3C)=C2)C1,0.01
2,N#CC1=CC=C(C2=CC=C(OC(C(=O)N3CCCC3)C3=CC=CC=C3...,0.03
3,N#CC1=C(SCC(=O)NC2=CC=CC(Cl)=C2)N=C([O-])C(C#N...,0.02
4,CC[NH+](CC)C(C)(CC)C(O)C1=CSC=C1Br,0.04


In [16]:
# Persist the scored molecules (with header, without the row index) in the property directory.
df_zinc.to_csv(os.path.join(PROPERTY_NAME, "raw_data.csv"), index=False)

# Source and Target

In [14]:
# Source molecules: rows whose score lies in the half-open interval (0, 0.05].
idx_src = df_zinc[PROPERTY_NAME].gt(0) & df_zinc[PROPERTY_NAME].le(0.05)
# Exact duplicate rows are removed.
df_src = df_zinc.loc[idx_src].drop_duplicates()
df_src

Unnamed: 0,smiles,jnk3
0,CC(C)(C)C1=CC=C2OC=C(CC(=O)NC3=CC=CC=C3F)C2=C1,0.01
1,CC1CC(C)CC(NC2=CN=CC(C3=NN=CN3C)=C2)C1,0.01
2,N#CC1=CC=C(C2=CC=C(OC(C(=O)N3CCCC3)C3=CC=CC=C3...,0.03
3,N#CC1=C(SCC(=O)NC2=CC=CC(Cl)=C2)N=C([O-])C(C#N...,0.02
4,CC[NH+](CC)C(C)(CC)C(O)C1=CSC=C1Br,0.04
...,...,...
145236,CCC(NC(=O)C(=O)NC1=CC=C(C(N)=O)C(C)=C1)C1=C(C)...,0.04
145237,COC1=CC(C[NH2+]CC(C)CC(C)O)=CC(Br)=C1O,0.02
145238,CC1=CC=C(NC(=O)C(=O)N(C)CC2=CC=CC=C2)C(C)=C1,0.05
145239,CC1=CC(C(=O)NC2=CC=C(OCC(N)=O)C=C2)=C(C)N1C1CC1,0.01


In [13]:
# Target molecules: rows whose score lies in the closed interval [0.5, 1].
idx_tar = df_zinc[PROPERTY_NAME].ge(0.5) & df_zinc[PROPERTY_NAME].le(1.)
# Exact duplicate rows are removed.
df_tar = df_zinc.loc[idx_tar].drop_duplicates()
df_tar

Unnamed: 0,smiles,jnk3
3431,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,0.66
6752,C[NH+]1CCC2=C(C1)SC(NC(=O)C1=CC=CC=C1F)=C2C#N,0.5
27181,COC1=CC=C(C2=CC=NC(NC3=CC=CC=C3)=N2)C=C1,0.68
31296,O=C(NC1=CC=CC=C1)NC1COC2C(NC3=NC=CC(C4=CC=C(C5...,0.5
36483,OC1=CC=C(NC2=NC(C3=CC=C(Cl)C=C3)=CS2)C=C1,0.67
51241,O=C1NN(C2=CC=CC(Cl)=C2)C(=O)C1=CC1=CC=C(O)C([N...,0.62
101602,N#CC1=C(NC(=O)C2=CC=CC=C2OC(F)F)SC2=C1CCC2,0.55
107349,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,0.51


# Test and validation

In [17]:
# Hold out the tail of the source set: rows [-2000, -1000) become the test split
# and the final 1000 rows become the validation split.
df_test = df_src.iloc[slice(-2000, -1000)]
df_valid = df_src.iloc[slice(-1000, None)]

In [18]:
# Write the held-out SMILES, one per line, with neither index nor header.
df_test["smiles"].to_csv(
    os.path.join(PROPERTY_NAME, "rdkit_test.txt"),
    header=None,
    index=False,
)
df_valid["smiles"].to_csv(
    os.path.join(PROPERTY_NAME, "rdkit_valid.txt"),
    header=None,
    index=False,
)

# Paired

In [22]:
# Build (source, target) training pairs: each target molecule is matched with
# every training-pool source molecule whose similarity to it is at least 0.4.
# The last 2000 source rows are left out because they are the test/validation splits.
df_train_pool = df_src.iloc[:-2000, :]  # loop-invariant, so sliced once

records = []
for smi_tar, prop_tar in df_tar.values:
    _records = []
    # The shuffle only affects the order in which pairs are recorded; a fixed
    # seed makes that order reproducible across runs.
    pbar = tqdm.tqdm(df_train_pool.sample(frac=1, random_state=0).values)
    for smi_src, prop_src in pbar:
        sim = similarity(smi_src, smi_tar)
        if sim >= 0.4:
            _records.append((smi_src, smi_tar, prop_src, prop_tar, sim))
    print(len(_records))  # number of pairs found for this target
    records.extend(_records)  # append in place instead of rebuilding the list

# Naming the columns at construction keeps the schema even if no pair was found.
df_train_pairs = pd.DataFrame.from_records(
    records, columns=["src", "tar", "prop_src", "prop_tar", "similarity"]
)
df_train_pairs

100%|██████████| 126728/126728 [00:30<00:00, 4133.88it/s]


14


100%|██████████| 126728/126728 [00:23<00:00, 5483.44it/s]


3


100%|██████████| 126728/126728 [00:23<00:00, 5403.75it/s]


2


100%|██████████| 126728/126728 [00:34<00:00, 3672.76it/s]


1


100%|██████████| 126728/126728 [00:22<00:00, 5595.20it/s]


16


100%|██████████| 126728/126728 [00:24<00:00, 5152.40it/s]


0


100%|██████████| 126728/126728 [00:23<00:00, 5358.08it/s]


35


100%|██████████| 126728/126728 [00:23<00:00, 5421.10it/s]

45





Unnamed: 0,src,tar,prop_src,prop_tar,similarity
0,CCOCC1=CC=CC=C1CN1C(=O)NC2=CC=CC=C2C1=O,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,0.01,0.66,0.403226
1,O=C(CN1C(=O)NN(C2=CC=CC=C2)C1=O)N1CCN(C2=CC=CC...,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,0.01,0.66,0.403226
2,CC1CN(C2=CC=C(CN3C(=O)NC4=CC=CC=C4C3=O)C=[NH+]...,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,0.03,0.66,0.454545
3,CC(=O)C1=CC=C2NC(C)=C(CN3CCN(C4=CC=CC=C4)CC3)C...,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,0.03,0.66,0.405797
4,O=C(C1=NN(C2=CC=CC=C2)C(=O)C2=CC=CC=C12)N1CCN(...,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,0.01,0.66,0.433333
...,...,...,...,...,...
111,CC1=CC=C(S(=O)(=O)N2CCOCC2)C=C1NC(=O)CC1=CC=CC...,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,0.04,0.51,0.412698
112,CC(=O)NC1=NC(CC(=O)NC2=CC(C)=CC=C2C)=CS1,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,0.01,0.51,0.481481
113,CC1=CC=C(C)C(OCC2=NC(CC(=O)NC3=CC=CC=C3C)=CS2)=C1,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,0.02,0.51,0.419355
114,CC1=CC=C(C)C(NC(=O)CN2C(=O)NC(C)(C3=CC=CC=C3Br...,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,0.04,0.51,0.419355


In [23]:
# Write the first two columns (source and target SMILES) as space-separated text
# without a header. `index=False` keeps the DataFrame row index out of the file,
# in line with the test/valid/triplet exports; without it every line would
# start with the row number.
df_train_pairs.iloc[:, :2].to_csv(
    os.path.join(PROPERTY_NAME, "rdkit_train_pairs.txt"),
    sep=" ",
    header=None,
    index=False,
)

In [24]:
# Source side of the training pairs (first column), written one SMILES per line.
# `index=False` keeps the row index out of the file, in line with the
# test/valid/triplet exports; without it every line would start with the row number.
df_train_src = df_train_pairs.iloc[:, 0]
df_train_src.to_csv(
    os.path.join(PROPERTY_NAME, "rdkit_train_src.txt"),
    sep=" ",
    header=None,
    index=False,
)

In [25]:
# Target side of the training pairs (second column), written one SMILES per line.
# `index=False` keeps the row index out of the file, in line with the
# test/valid/triplet exports; without it every line would start with the row number.
df_train_tar = df_train_pairs.iloc[:, 1]
df_train_tar.to_csv(
    os.path.join(PROPERTY_NAME, "rdkit_train_tar.txt"),
    sep=" ",
    header=None,
    index=False,
)

In [26]:
# Plain Python lists of the source and target SMILES; both come from the same
# pair table, so position k in one list corresponds to position k in the other.
list_smi_src = df_train_src.tolist()
list_smi_tar = df_train_tar.tolist()

In [27]:
# Distinct SMILES that occur on either side of the training pairs.
list_chembl_unique = list({*list_smi_src, *list_smi_tar})
print(len(list_chembl_unique))

122


In [29]:
# For every (source, target) training pair, collect up to K third molecules from
# the pool of unique training SMILES whose Tanimoto similarity to BOTH the source
# and the target is below 0.3. Each hit is stored as (source, target, third).
list_triplet = []
K = 20  # maximum number of triplets collected per (source, target) pair

# Fingerprints depend only on the SMILES string, so each one is computed once
# and reused instead of being re-parsed for every pair.
_fp_cache = {}


def _morgan_fp(smi):
    """Return the 2048-bit, radius-2 Morgan bit vector (chirality ignored) for `smi`, cached."""
    fp = _fp_cache.get(smi)
    if fp is None:
        mol = Chem.MolFromSmiles(smi)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048, useChirality=False)
        _fp_cache[smi] = fp
    return fp


for i, (smi_src, smi_tar) in tqdm.tqdm(enumerate(zip(list_smi_src, list_smi_tar)), total=len(list_smi_src)):
    batch_list_triplet = []
    ## fingerprint
    fp_src = _morgan_fp(smi_src)
    fp_tar = _morgan_fp(smi_tar)

    ## Shuffle: visit the candidate pool in a fresh random order for every pair
    random.shuffle(list_chembl_unique)

    for smi_chembl in list_chembl_unique:
        fp_chembl = _morgan_fp(smi_chembl)
        ## Tanimoto
        sim_src = DataStructs.TanimotoSimilarity(fp_src, fp_chembl)
        sim_tar = DataStructs.TanimotoSimilarity(fp_tar, fp_chembl)
        ## check: the candidate must be dissimilar to both members of the pair
        if sim_src < 0.3 and sim_tar < 0.3:
            batch_list_triplet.append((smi_src, smi_tar, smi_chembl))
        ## stop as soon as K candidates have been collected
        if len(batch_list_triplet) == K:
            break

    # The pool may not hold K sufficiently dissimilar molecules for this pair.
    if len(batch_list_triplet) < K:
        print(f"[WARNING] {i} has insufficient data ({len(batch_list_triplet)} < {K})")

    list_triplet.extend(batch_list_triplet)

100%|██████████| 116/116 [00:00<00:00, 319.86it/s]


In [30]:
# Collect the triplet tuples into a table and save them space-separated,
# with no header row and no index column.
df_triplet = pd.DataFrame(list_triplet)

filepath_triplet = os.path.join(PROPERTY_NAME, "rdkit_train_triplet.txt")
df_triplet.to_csv(filepath_triplet, sep=" ", header=None, index=False)

In [31]:
# Display the triplet table.
df_triplet

Unnamed: 0,0,1,2
0,CCOCC1=CC=CC=C1CN1C(=O)NC2=CC=CC=C2C1=O,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,CC1=NC(C2=CC=C(NC(=O)CC3=CC=C(Cl)C=C3)C=C2)=CS1
1,CCOCC1=CC=CC=C1CN1C(=O)NC2=CC=CC=C2C1=O,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,N#CC1=C(NC(=O)C2CCCCCC2[NH3+])SC2=C1CCC2
2,CCOCC1=CC=CC=C1CN1C(=O)NC2=CC=CC=C2C1=O,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,CC1CC2=CC=CC=C2N1CC(=O)NC1=C(C#N)C2=C(CCCC2)S1
3,CCOCC1=CC=CC=C1CN1C(=O)NC2=CC=CC=C2C1=O,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,COC(=O)N1CCC2=C(C1)SC(NC(=O)C1=NC3=CC=CC=C3S1)...
4,CCOCC1=CC=CC=C1CN1C(=O)NC2=CC=CC=C2C1=O,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...,CC1=CC=C(NC(=O)CC2=COC3=CC(C)=CC=C23)C(Cl)=C1
...,...,...,...
2315,O=C(CCCCO)NCC1=CC=CC2=CC=CC=C12,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,O=C(C1=CC=C(CN2C(=O)NC3=CC=CC=C3C2=O)C=C1)N1CC...
2316,O=C(CCCCO)NCC1=CC=CC2=CC=CC=C12,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,CC(=O)NC1=CC=C(NC2=NC(C3=CC=CS3)=CS2)C=C1
2317,O=C(CCCCO)NCC1=CC=CC2=CC=CC=C12,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,CCCC(=O)NC1=C(C#N)C2=C(CCCCC2)S1
2318,O=C(CCCCO)NCC1=CC=CC2=CC=CC=C12,CC1=CC=C(C)C(NC(=O)CC2=CC=CC3=CC=CC=C23)=C1,[NH3+]CC1=CC=C(C2=NC(C3=CC=C(Cl)C=C3)=CS2)C=C1
