In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
import tqdm

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [3]:
# Resolve the parent of the current working directory and make sure it is
# on the import path (presumably so the MTMR package imported in a later
# cell can be found there).
_parent_dir = os.path.join(os.getcwd(), os.pardir)
MTMR_PATH = os.path.abspath(_parent_dir)
if MTMR_PATH not in sys.path:
    # Prepend rather than append so this directory is searched first.
    sys.path = [MTMR_PATH] + sys.path

In [4]:
from MTMR.properties import gsk3, jnk3, drd2, qed, penalized_logp, similarity, get_kekuleSmiles

In [5]:
# Name of the property being processed. It is reused below as the output
# directory name and as the name of the score column.
PROPERTY_NAME = "gsk3"
# Scorer object; the cells below call it as SCORING_FT(smiles) and compare
# the returned value numerically.
SCORING_FT = gsk3()

In [16]:
# Create the output directory for this property. exist_ok=True makes the cell
# safe to re-run and, unlike a separate exists() check, raises immediately
# if the path already exists as a regular file instead of failing later
# when the first file is written into it.
os.makedirs(PROPERTY_NAME, exist_ok=True)

In [6]:
# Input molecule table; the path is relative to the notebook's working
# directory. The "smiles" column of this table is read in the scoring cell.
filepath_zinc250k = "250k_rndm_zinc_drugs_clean_3.csv"
df_zinc_raw = pd.read_csv(filepath_zinc250k)

In [7]:
# Preview the first rows of the raw table.
df_zinc_raw.head()

Unnamed: 0,smiles,logP,qed,SAS
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1\n,5.0506,0.702012,2.084095
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1\n,3.1137,0.928975,3.432004
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,4.96778,0.599682,2.470633
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,4.00022,0.690944,2.822753
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,3.60956,0.789027,4.035182


# Property calculation

In [10]:
# Score every molecule of the raw table and keep those with a strictly
# positive score, stored as (kekulized SMILES, score) records.
records = []
# Iterate over the column values directly: this does not depend on the frame
# having a default 0..n-1 index, which the label lookup .loc[i, "smiles"] did.
for smi in tqdm.tqdm(df_zinc_raw["smiles"], total=len(df_zinc_raw)):
    smi_kek = get_kekuleSmiles(smi)
    prop = SCORING_FT(smi_kek)
    if prop > 0:
        records.append((smi_kek, prop))

# Pass the column names up front so that an empty `records` list still yields
# a frame with the expected columns (renaming an empty frame leaves it with
# no columns at all, and the filtering cells below would raise KeyError).
df_zinc = pd.DataFrame.from_records(records, columns=["smiles", PROPERTY_NAME])
df_zinc.head()

100%|██████████| 249455/249455 [7:11:15<00:00,  9.64it/s]  


Unnamed: 0,smiles,gsk3
0,CC(C)(C)C1=CC=C2OC=C(CC(=O)NC3=CC=CC=C3F)C2=C1,0.03
1,CC1CC(C)CC(NC2=CN=CC(C3=NN=CN3C)=C2)C1,0.16
2,N#CC1=CC=C(C2=CC=C(OC(C(=O)N3CCCC3)C3=CC=CC=C3...,0.03
3,CC[NH+](CC)C(C)(CC)C(O)C1=CSC=C1Br,0.02
4,O=C(NC1=NNC=N1)C1=CC=CN=C1NC1=CC=CC(F)=C1,0.27


In [17]:
# Persist the scored table (without the row index) in the property directory.
_raw_data_path = os.path.join(PROPERTY_NAME, "raw_data.csv")
df_zinc.to_csv(_raw_data_path, index=False)

# Source and Target

In [18]:
# Source set: molecules whose score lies in (0, 0.05], with exact duplicate
# rows removed and the index renumbered from zero.
_src_scores = df_zinc[PROPERTY_NAME]
idx_src = _src_scores.le(0.05) & _src_scores.gt(0)
df_src = df_zinc.loc[idx_src].drop_duplicates(ignore_index=True)
df_src

Unnamed: 0,smiles,gsk3
0,CC(C)(C)C1=CC=C2OC=C(CC(=O)NC3=CC=CC=C3F)C2=C1,0.03
1,N#CC1=CC=C(C2=CC=C(OC(C(=O)N3CCCC3)C3=CC=CC=C3...,0.03
2,CC[NH+](CC)C(C)(CC)C(O)C1=CSC=C1Br,0.02
3,CC1=CC=C2N=C(SC(C)C(=O)NC3CCC(C)CC3)N(C)C(=O)C...,0.01
4,O=C(N1CCC2=C(F)C=CC(F)=C2C1)C1(O)CC2=CC=CC=C2C1,0.01
...,...,...
138475,CCC(NC(=O)C(=O)NC1=CC=C(C(N)=O)C(C)=C1)C1=C(C)...,0.03
138476,COC1=CC(C[NH2+]CC(C)CC(C)O)=CC(Br)=C1O,0.05
138477,CC1(C)CCC(CNC(=O)CN2N=CC3=CC=CC=C3C2=O)C2=CC=C...,0.04
138478,CC1=CC(C(=O)NC2=CC=C(OCC(N)=O)C=C2)=C(C)N1C1CC1,0.01


In [19]:
# Target set: molecules whose score lies in [0.5, 1], with exact duplicate
# rows removed and the index renumbered from zero.
_tar_scores = df_zinc[PROPERTY_NAME]
idx_tar = _tar_scores.le(1.) & _tar_scores.ge(0.5)
df_tar = df_zinc.loc[idx_tar].drop_duplicates(ignore_index=True)
df_tar

Unnamed: 0,smiles,gsk3
0,BrC1=CC(CSC2=NN=C(C3=CC=CC=C3)O2)=CS1,0.570
1,O=S(=O)([N-]C1=CC=CC(C2=CNN=C2)=C1)C1=CNC2=NC=...,0.570
2,COC1=CC=CC(C2=C(C(=O)NCC3=CC=CC(Br)=C3)N=CO2)=C1,0.550
3,CC1=NN(C2=CC=C(F)C=C2)C=C1C1=CC=NC(NCC2=CC=CC=...,0.590
4,CC(C)CNC1=NC(N)=C([N+](=O)[O-])C(NCC2=CC=CO2)=N1,0.690
...,...,...
119,CC1=CC=CC(NC2=NC(NCC[NH+](C)C)=NC(N)=C2[N+](=O...,0.840
120,C1=CC=C(C2=C(C3CCCN(C4=NC=NC5=C4C=CN5)C3)NN=C2...,0.500
121,CC1=CC=C(C2=C(C3=CC=CC=N3)NC(CNC3=NC(N)=C(C)C=...,0.500
122,O=C1NCCCC2=C1N=C(C1=CC=NC3=CC=CC=C13)N2,0.510


# Test and Validation

In [24]:
# Hold out the last 2000 source rows: the first 1000 of them form the test
# split and the final 1000 form the validation split.
df_test = df_src.iloc[-2000:-1000]
df_valid = df_src.tail(1000)

In [27]:
# Write each held-out split as a bare list of SMILES, one per line
# (no header row, no index column).
for _split_frame, _split_file in ((df_test, "rdkit_test.txt"), (df_valid, "rdkit_valid.txt")):
    _split_path = os.path.join(PROPERTY_NAME, _split_file)
    _split_frame["smiles"].to_csv(_split_path, index=False, header=None)

# Paired

In [None]:
# Build (source, target) training pairs: for every target molecule, collect
# all source molecules whose similarity to it is at least 0.4.
records = []
# Training pool: every source row except the last 2000, which are the rows
# written out as the test/validation splits. Sliced once, outside the loop.
df_train_pool = df_src.iloc[:-2000,:]
for smi_tar, prop_tar in df_tar.values:
    _records = []
    # The pool is re-shuffled for each target; this only influences the order
    # in which matching pairs are collected, not which pairs are found.
    pbar = tqdm.tqdm(df_train_pool.sample(frac=1).values)
    for smi_src, prop_src in pbar:
        sim = similarity(smi_src, smi_tar)
        if sim >= 0.4:
            _records.append((smi_src, smi_tar, prop_src, prop_tar, sim))
    print(len(_records))
    # Extend in place instead of rebuilding the whole list for every target.
    records.extend(_records)

# Column names are given explicitly so the frame keeps its schema even when
# no pair passed the similarity threshold.
df_train_pairs = pd.DataFrame.from_records(records, columns=["src", "tar", "prop_src", "prop_tar", "similarity"])
df_train_pairs

100%|██████████| 136480/136480 [00:23<00:00, 5768.14it/s]


3


100%|██████████| 136480/136480 [00:26<00:00, 5219.14it/s]


0


100%|██████████| 136480/136480 [00:24<00:00, 5465.40it/s]


14


100%|██████████| 136480/136480 [00:28<00:00, 4857.46it/s]


2


100%|██████████| 136480/136480 [00:23<00:00, 5711.84it/s]


2


100%|██████████| 136480/136480 [00:24<00:00, 5472.37it/s]


9


100%|██████████| 136480/136480 [00:23<00:00, 5838.44it/s]


0


100%|██████████| 136480/136480 [00:28<00:00, 4833.31it/s]


3


100%|██████████| 136480/136480 [00:28<00:00, 4802.03it/s]


0


100%|██████████| 136480/136480 [00:25<00:00, 5350.45it/s]


4


100%|██████████| 136480/136480 [00:24<00:00, 5576.64it/s]


16


100%|██████████| 136480/136480 [00:26<00:00, 5139.94it/s]


3


100%|██████████| 136480/136480 [00:23<00:00, 5694.03it/s]


3


100%|██████████| 136480/136480 [00:25<00:00, 5397.29it/s]


0


100%|██████████| 136480/136480 [00:27<00:00, 5042.68it/s]


0


100%|██████████| 136480/136480 [00:26<00:00, 5150.23it/s]


0


100%|██████████| 136480/136480 [00:23<00:00, 5704.23it/s]


0


100%|██████████| 136480/136480 [00:24<00:00, 5498.10it/s]


1


100%|██████████| 136480/136480 [00:23<00:00, 5773.85it/s]


0


100%|██████████| 136480/136480 [00:27<00:00, 5001.48it/s]


4


100%|██████████| 136480/136480 [00:25<00:00, 5418.10it/s]


11


100%|██████████| 136480/136480 [00:29<00:00, 4664.63it/s]


0


100%|██████████| 136480/136480 [00:24<00:00, 5650.03it/s]


0


100%|██████████| 136480/136480 [00:25<00:00, 5324.46it/s]


0


100%|██████████| 136480/136480 [00:27<00:00, 4983.85it/s]


0


 44%|████▍     | 59788/136480 [00:12<00:15, 5106.43it/s]

In [36]:
# Rebuild the pair table from whatever `records` currently holds. Column names
# are given explicitly so that an empty `records` list still yields the five
# expected columns; renaming an empty frame would leave it column-less and the
# positional column selections in the following cells would fail.
df_train_pairs = pd.DataFrame.from_records(records, columns=["src", "tar", "prop_src", "prop_tar", "similarity"])
df_train_pairs

Unnamed: 0,src,tar,prop_src,prop_tar,similarity
0,O=C(CSC1=NN=C(C2=CC=C(Br)C=C2)O1)C1=CC=CS1,BrC1=CC(CSC2=NN=C(C3=CC=CC=C3)O2)=CS1,0.04,0.57,0.421053
1,O=C(CSC1=NN=C(C2=CC=CC(F)=C2)O1)NC1=NC(C2=CC=C...,BrC1=CC(CSC2=NN=C(C3=CC=CC=C3)O2)=CS1,0.02,0.57,0.415385
2,O=C(CSC1=NN=C(C2=CC=CC=C2)O1)NC1=CC=CC(O)=C1,BrC1=CC(CSC2=NN=C(C3=CC=CC=C3)O2)=CS1,0.01,0.57,0.413793
3,COC1=CC=CC(CNC(=O)C2=C(C)N(CC[NH+](C)C)C(C)=C2...,COC1=CC=CC(C2=C(C(=O)NCC3=CC=CC(Br)=C3)N=CO2)=C1,0.01,0.55,0.400000
4,COC1=CC=CC(CNC(=O)C2=C(C)N=CS2)=C1,COC1=CC=CC(C2=C(C(=O)NCC3=CC=CC(Br)=C3)N=CO2)=C1,0.02,0.55,0.467742
...,...,...,...,...,...
368,CC1(CNC(=O)NC2CCOC2C2=CC=C(Cl)C(F)=C2)COC1,FC1=CC(C2OCCC2NC2=NC=NC3=C2N=CN3)=CC=C1Cl,0.01,0.50,0.422535
369,CC(=O)C1=CC=CC=C1C(=O)NC1CCOC1C1=CC=C(Cl)C(F)=C1,FC1=CC(C2OCCC2NC2=NC=NC3=C2N=CN3)=CC=C1Cl,0.02,0.50,0.447761
370,CCN(CC)S(=O)(=O)NC1CCOC1C1=CC=C(Cl)C(F)=C1,FC1=CC(C2OCCC2NC2=NC=NC3=C2N=CN3)=CC=C1Cl,0.03,0.50,0.454545
371,CC(C)N(CC(C)(C)O)C(=O)NC1CCOC1C1=CC=C(Cl)C(F)=C1,FC1=CC(C2OCCC2NC2=NC=NC3=C2N=CN3)=CC=C1Cl,0.04,0.50,0.405405


In [37]:
# Write the first two columns (source and target SMILES) as space-separated
# lines. index=False keeps the row index out of the file so each line holds
# exactly the two SMILES, matching how the test/validation/triplet files in
# this notebook are written.
df_train_pairs.iloc[:,:2].to_csv(os.path.join(PROPERTY_NAME, "rdkit_train_pairs.txt"), sep=" ", header=None, index=False)

In [38]:
# Source side of the training pairs (first column of the pair table).
df_train_src = df_train_pairs.iloc[:,0]
# index=False: one SMILES per line, without a leading row-index column,
# consistent with the test/validation/triplet files written by this notebook.
df_train_src.to_csv(os.path.join(PROPERTY_NAME, "rdkit_train_src.txt"), sep=" ", header=None, index=False)

In [39]:
# Target side of the training pairs (second column of the pair table).
df_train_tar = df_train_pairs.iloc[:,1]
# index=False: one SMILES per line, without a leading row-index column,
# consistent with the test/validation/triplet files written by this notebook.
df_train_tar.to_csv(os.path.join(PROPERTY_NAME, "rdkit_train_tar.txt"), sep=" ", header=None, index=False)

In [40]:
# Plain Python lists of the paired source / target SMILES, in pair order.
list_smi_src = df_train_src.tolist()
list_smi_tar = df_train_tar.tolist()

In [41]:
# Distinct SMILES appearing on either side of the training pairs; this is the
# candidate pool from which the triplet cell draws its third molecule.
list_chembl_unique = list({*list_smi_src, *list_smi_tar})
print(len(list_chembl_unique))

431


In [42]:
# Build (source, target, negative) triplets: for every training pair, draw up
# to K molecules from the candidate pool whose Tanimoto similarity to BOTH the
# source and the target is below 0.3.
list_triplet = []
K = 20  # maximum number of negatives collected per (source, target) pair
TRIPLET_SEED = 0  # fixed seed so the sampled negatives are reproducible

_fp_cache = {}


def _morgan_fp(smi):
    """Return the radius-2, 2048-bit Morgan fingerprint of `smi` (chirality ignored).

    Results are memoised per SMILES string so that each molecule is parsed and
    fingerprinted once instead of once per training pair.
    """
    if smi not in _fp_cache:
        mol = Chem.MolFromSmiles(smi)
        _fp_cache[smi] = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048, useChirality=False)
    return _fp_cache[smi]


# Shuffle a sorted private copy with a dedicated, seeded generator: the draw
# then depends neither on set iteration order nor on the global `random`
# state, and the shared `list_chembl_unique` list is left untouched.
_rng = random.Random(TRIPLET_SEED)
_candidates = sorted(list_chembl_unique)

for i, (smi_src, smi_tar) in tqdm.tqdm(enumerate(zip(list_smi_src, list_smi_tar)), total=len(list_smi_src)):
    batch_list_triplet = []
    ## fingerprint
    fp_src = _morgan_fp(smi_src)
    fp_tar = _morgan_fp(smi_tar)

    ## Shuffle (a fresh candidate order for every pair)
    _rng.shuffle(_candidates)

    for smi_chembl in _candidates:
        fp_chembl = _morgan_fp(smi_chembl)
        ## Tanimoto
        sim_src = DataStructs.TanimotoSimilarity(fp_src, fp_chembl)
        sim_tar = DataStructs.TanimotoSimilarity(fp_tar, fp_chembl)
        ## check: keep only candidates dissimilar to both ends of the pair
        if sim_src < 0.3 and sim_tar < 0.3:
            batch_list_triplet.append((smi_src, smi_tar, smi_chembl))
        ## stop
        if len(batch_list_triplet) == K:
            break

    if len(batch_list_triplet) < K:
        print(f"[WARNING] {i} has insufficient data ({len(batch_list_triplet)} < {K})")

    list_triplet.extend(batch_list_triplet)

100%|██████████| 373/373 [00:00<00:00, 398.43it/s]


In [43]:
# Save the triplets as space-separated lines (no header row, no index column).
_triplet_path = os.path.join(PROPERTY_NAME, "rdkit_train_triplet.txt")
df_triplet = pd.DataFrame(list_triplet)
df_triplet.to_csv(_triplet_path, sep=" ", header=None, index=False)