In [1]:
import os
import sys
import textdistance
import pandas as pd
import selfies as sf
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import rdFMCS

In [5]:
DATA_PATH = os.getcwd()
PRO_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_TYPE = "qed" # or "drd2"

In [9]:
TOKENIZER_PATH = os.path.join(PRO_PATH, 'fairseq_mo', 'utils')
if TOKENIZER_PATH not in sys.path:
    sys.path = [TOKENIZER_PATH] + sys.path
from tokenizer import branch_based_standardization, selfies_tokenizer

# functions

In [5]:
def mol2smi(mol, rootedAtAtom=-1):
    return Chem.MolToSmiles(mol, isomericSmiles=False, kekuleSmiles=True, rootedAtAtom=rootedAtAtom, canonical=True, doRandom=False)

In [6]:
def reorder_atoms_by_mcs(smi_1, smi_2):
    mol_1 = Chem.MolFromSmiles(smi_1)
    mol_2 = Chem.MolFromSmiles(smi_2)
    
    res = rdFMCS.FindMCS([mol_1, mol_2], completeRingsOnly=True)
    mol_mcs = Chem.MolFromSmarts(res.smartsString)
    
    match_1 = mol_1.GetSubstructMatch(mol_mcs)
    match_2 = mol_2.GetSubstructMatch(mol_mcs)
    
    new_1 = [mol2smi(mol_1, i) for i in match_1]
    new_2 = [mol2smi(mol_2, i) for i in match_2]
    
    return new_1, new_2

In [14]:
def get_list_selfies_tokens(smiles_list):
    sel_tokens_list = list()
    for smi in smiles_list:
        try:
            sel = sf.encoder(smi)
            # use fragment based tokenization
            sel_tokens = selfies_tokenization(sel)
            # use char based tokenization
            #sel_tokens = atomwise_tokenizer(sel)
            sel_tokens_list.append(sel_tokens)
        except (sf.exceptions.SMILESParserError, sf.exceptions.EncoderError) as e:
            sel_tokens_list.append(None)
    return sel_tokens_list

In [19]:
def get_lev_js_from_string(source_frags, target_frags):
    dis = textdistance.levenshtein.distance(source_frags, target_frags)
    sim = textdistance.levenshtein.similarity(source_frags, target_frags)
    sim_norm = textdistance.levenshtein.normalized_similarity(source_frags, target_frags)
    js = textdistance.jaccard.similarity(source_frags, target_frags)
    return dis, sim, sim_norm, js

# data augmentation

In [15]:
low_df = pd.read_csv(os.path.join(DATA_PATH, DATA_TYPE, "ori_data", "rdkit_train_src.txt"), names=["low"])
high_df = pd.read_csv(os.path.join(DATA_PATH, DATA_TYPE, "ori_data", "rdkit_train_tar.txt"), names=["high"])
low_smiles = low_df["low"].tolist()
high_smiles = high_df["high"].tolist()

In [35]:
def get_candidates():
    low_df = pd.read_csv(os.path.join(DATA_PATH, DATA_TYPE, "ori_data", "rdkit_train_src.txt"), names=["low"])
    high_df = pd.read_csv(os.path.join(DATA_PATH, DATA_TYPE, "ori_data", "rdkit_train_tar.txt"), names=["high"])
    low_smiles = low_df["low"].tolist()
    high_smiles = high_df["high"].tolist()
    candidates = list()
    for pair in tqdm(zip(low_smiles, high_smiles), total=len(low_smiles)):
        mcs_based_low, mcs_based_high = reorder_atoms_by_mcs(pair[0], pair[1])
        for low in mcs_based_low:
            low = branch_based_standardization(low) # doing standardization
            low_sel_tokens = get_list_selfies_tokens([low])[0]
            if low_sel_tokens is None:
                continue
            for high in mcs_based_high:
                high = branch_based_standardization(high) # doing standardization
                high_sel_tokens = get_list_selfies_tokens([high])[0]
                if high_sel_tokens is None:
                    continue
                dis, sim, sim_norm, js = get_lev_js_from_string(low_sel_tokens, high_sel_tokens)
                candidates.append((low, high, dis, sim, sim_norm, js))
        break
    candidates_df = pd.DataFrame(candidates, columns=["low", "high", "lev", "sim", "sim_norm", "js"])
    candidates_df.to_pickle(os.path.join(DATA_PATH, DATA_TYPE, "aug_data", "candidates.pkl"))
    return candidates_df

In [36]:
candidates = get_candidates()

  0%|                                                                                                                                                                                 | 0/88306 [00:00<?, ?it/s]


In [39]:
filtered_df = candidates[(candidates["sim_norm"]>0.5) & (candidates["lev"]>1) & (candidates["lev"]<=10)]

In [40]:
filtered_df

Unnamed: 0,low,high,lev,sim,sim_norm,js
54,C1CC(C(C)(C)C)CC2=C1C(C(N)=O)=C(S2)NC(=O)COC1=...,C1CC(C)CC2=C1C(C(N)=O)=C(S2)NC(=O)C1=CC=CO1,3,6,0.666667,0.545455
56,C1CC(C(C)(C)C)CC2=C1C(C(N)=O)=C(S2)NC(=O)COC1=...,C12=C(CCC(C)C1)C(C(N)=O)=C(S2)NC(=O)C1=CC=CO1,4,5,0.555556,0.416667
57,C1CC(C(C)(C)C)CC2=C1C(C(N)=O)=C(S2)NC(=O)COC1=...,C1C2=C(CCC1C)C(C(N)=O)=C(S2)NC(=O)C1=CC=CO1,4,5,0.555556,0.416667
105,C1C2=C(CCC1C(C)(C)C)C(C(N)=O)=C(S2)NC(=O)COC1=...,C1CC(C)CC2=C1C(C(N)=O)=C(S2)NC(=O)C1=CC=CO1,4,5,0.555556,0.416667
107,C1C2=C(CCC1C(C)(C)C)C(C(N)=O)=C(S2)NC(=O)COC1=...,C12=C(CCC(C)C1)C(C(N)=O)=C(S2)NC(=O)C1=CC=CO1,4,5,0.555556,0.416667
108,C1C2=C(CCC1C(C)(C)C)C(C(N)=O)=C(S2)NC(=O)COC1=...,C1C2=C(CCC1C)C(C(N)=O)=C(S2)NC(=O)C1=CC=CO1,4,5,0.555556,0.416667
180,C(N)(=O)C1=C(SC2=C1CCC(C2)C(C)(C)C)NC(=O)COC1=...,C(N)(=O)C1=C(SC2=C1CCC(C)C2)NC(=O)C1=CC=CO1,4,5,0.555556,0.454545


In [42]:
remake_low = list()
remake_high = list()
for _, item in tqdm(filtered_df.iterrows(), total=len(filtered_df)):
    low_mol = Chem.MolFromSmiles(item["low"])
    high_mol = Chem.MolFromSmiles(item["high"])
    if low_mol is None or high_mol is None:
        continue
    else:
        remake_low.append(item["low"])
        remake_high.append(item["high"])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 2789.30it/s]
