In [41]:
from SmilesPE.pretokenizer import atomwise_tokenizer
import os
import pandas as pd
from rdkit.Chem import MolFromSmiles, MolToSmiles
from transformers import AutoTokenizer, AutoConfig, AutoModel
import json

In [34]:
# Identifier handed to AutoTokenizer.from_pretrained in make_tokenizer().
MODEL_NAME = "seyonec/PubChem10M_SMILES_BPE_180k"
# Directory that make_tokenizer() saves the loaded tokenizer into.
MODEL_PATH = "./model/"

In [13]:
# Sanity check: split a single SMILES string with the atom-wise tokenizer
# and print the resulting token list.
smi = "CC[N+](C)(C)Cc1ccccc1Br"
toks = atomwise_tokenizer(smi)
print(toks)

['C', 'C', '[N+]', '(', 'C', ')', '(', 'C', ')', 'C', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Br']


In [14]:
# Load the compound table (columns include cid, name, smiles) from the local CSV.
smiles_df = pd.read_csv("data/sider_smiles.csv")

In [15]:
# Preview the first five rows of the loaded table.
smiles_df.head(n=5)

Unnamed: 0,cid,name,smiles
0,85,carnitine,C[N+](C)(C)CC(CC(=O)O)O
1,119,gamma-aminobutyric,C(CC(=O)O)CN
2,137,5-aminolevulinic,C(CC(=O)O)C(=O)CN
3,143,leucovorin,C1C(N(C2=C(N1)NC(=NC2=O)N)C=O)CNC3=CC=C(C=C3)C...
4,146,5-methyltetrahydrofolate,CN1C(CNC2=C1C(=O)N=C(N2)N)CNC3=CC=C(C=C3)C(=O)...


In [35]:
def make_tokenizer():
    """Load the pretrained tokenizer, save a local copy, and return it.

    The tokenizer named by ``MODEL_NAME`` is loaded with
    ``AutoTokenizer.from_pretrained``, written to ``MODEL_PATH`` with
    ``save_pretrained``, and then applied to one sample SMILES string; both
    the result of calling the tokenizer and the result of ``tokenize()`` are
    printed so the two can be compared.

    Returns:
        The tokenizer object returned by ``AutoTokenizer.from_pretrained``.
    """
    # `cache_dir` must be spelled exactly: a misspelled keyword is not
    # recognised as the cache location, so './cache' would never be used.
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        normalization=True,
        cache_dir='./cache',
    )
    tokenizer.save_pretrained(MODEL_PATH)
    print(f'tokenizer object load & save......[{MODEL_NAME}]')

    example = "Cn1c(=O)c2c(ncn2C)n(C)c1=O"
    tokens = tokenizer(example)  # result of calling the tokenizer directly
    tokens_ = tokenizer.tokenize(example)  # result of tokenize() only
    print(tokens)
    print(tokens_)

    return tokenizer

In [39]:
# Quick check on the first row: parse it and emit one randomized SMILES.
mol = MolFromSmiles(smiles_df['smiles'][0])
smiles = MolToSmiles(mol,doRandom=True)
# Number of randomized SMILES strings generated for each molecule.
MAKE_PER_MOL = 10
# NOTE(review): no random seed is set in the visible cells, so the generated
# strings presumably differ between runs — confirm whether that is acceptable.

# One record per compound: its cid, its name and MAKE_PER_MOL randomized SMILES.
smiles_list = []
for cid, name, smi in zip(smiles_df['cid'], smiles_df['name'], smiles_df['smiles']):
    mol = MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None for a SMILES it cannot parse; handing
        # None to MolToSmiles would raise and abort the whole loop, so the
        # row is reported and skipped instead.
        print("Error: ", cid, name)
        continue
    new_smiles = [MolToSmiles(mol, doRandom=True) for _ in range(MAKE_PER_MOL)]
    smiles_list.append({
        'cid': cid,
        'name': name,
        'smiles': new_smiles
    })



In [42]:
# Write the randomized-SMILES records to disk as JSON.
with open('data/sider_smiles.json', mode='w') as json_out:
    json.dump(smiles_list, fp=json_out)

In [7]:
# Remove every "CID<digit>" match from the cid strings and cast the remainder
# to int. This is done as a single assignment: an in-place replace on the
# selected column (`df[col].replace(..., inplace=True)`) is chained mutation
# and does not update the frame under pandas Copy-on-Write. The pattern is a
# raw string so that `\d` is not treated as an invalid string escape.
sider_names['cid'] = (
    sider_names['cid']
    .replace(r'CID\d', '', regex=True)
    .astype(int)
)

In [8]:
# Look up each compound with pcp.Compound.from_cid and collect its cid, name
# and isomeric SMILES as one record per row.
# NOTE(review): `tqdm` and `pcp` are not imported in the cells visible here —
# confirm they are imported before this cell runs.
sider_smiles = []

for i, row in tqdm(sider_names.iterrows(), total=len(sider_names)):
    try:
        compound = pcp.Compound.from_cid(row["cid"])
        sider_smiles.append(
            {"cid": row["cid"], "name": row["name"], "smiles": compound.isomeric_smiles},
        )
    except Exception as err:
        # Best effort: report the failed compound and continue with the next
        # one. Catching Exception rather than using a bare `except:` keeps
        # KeyboardInterrupt able to stop this long-running loop, and printing
        # the error shows why the lookup failed.
        print("Error: ", row["cid"], row["name"], repr(err))


100%|██████████| 1430/1430 [24:48<00:00,  1.04s/it]


In [9]:
# Build the table from the collected records and save it without the index.
# The notebook imports pandas under the alias `pd`, so that name is used here;
# the bare name `pandas` is not bound by the import cell.
sider_smiles_df = pd.DataFrame(sider_smiles)
sider_smiles_df.to_csv("./data/sider_smiles.csv", index=False)