# Convert IUPAC to SMILES

In [None]:
import pandas as pd

from urllib.request import urlopen
from urllib.parse import quote

from tqdm import tqdm
tqdm.pandas()

In [None]:
def get_structure(IUPAC, # The IUPAC name of the compound
                  format="smi", # "smi" or "inchi"
                  timeout=30, # Seconds to wait for the server before giving up
                 ):
    """
    Convert an IUPAC name to a SMILES or InChI string using the OPSIN web service.

    Parameters
    ----------
    IUPAC : str
        The IUPAC name of the compound. All spaces are removed before the
        lookup.
    format : str
        File extension appended to the request URL, "smi" or "inchi".
    timeout : float
        Number of seconds to wait for the server before the request is
        abandoned.

    Returns
    -------
    str
        The UTF-8 decoded response body on success, or the sentinel string
        'Did not work' if the input is not a non-blank string or the request
        fails for any reason.
    """
    # Missing values (None, or NaN coming from a pandas column) and blank
    # names cannot be looked up; report them with the same sentinel as a
    # failed request instead of raising.
    if not isinstance(IUPAC, str) or not IUPAC.strip():
        return 'Did not work'

    # Remove the spaces of the string
    IUPAC = IUPAC.replace(" ", "")

    # Prepare the URL; percent-encode the name so that brackets, commas and
    # non-ASCII characters form a valid URL path.
    url = f'https://opsin.ch.cam.ac.uk/opsin/{quote(IUPAC)}.{format}'

    # Fetch the structure from the URL
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(url, timeout=timeout) as response:
            return response.read().decode('utf8')
    except Exception:
        # Best-effort lookup: any network, HTTP or decoding error maps to the
        # sentinel. Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate so a long batch can still be interrupted.
        return 'Did not work'

In [None]:
# #from https://stackoverflow.com/questions/54930121/converting-molecule-name-to-smiles
# def get_structures(IUPAC):
#     "Convert IUPAC string to smiles or InChI"
    
#     try:
#         # url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
#         ids = ids.replace(" ","")
#         url = f'https://opsin.ch.cam.ac.uk/opsin/{ids}.smi'
#         #https://opsin.ch.cam.ac.uk/opsin/
#         ans = urlopen(url).read().decode('utf8')
#         return ans
#     except:
#         return 'Did not work'

In [None]:
# Load the table of IUPAC names to convert.
df = pd.read_csv(filepath_or_buffer='data/iupac_quina.csv')

In [None]:
# Display the loaded frame to check its columns and contents.
df

Unnamed: 0,example,IUPAC
0,1,"(1R,5R,6R)-3-(2-((1-((dimethylamino)methyl)cyclopropyl)methoxy)-7-(8-ethyl-7-fluoro-3- hydroxynaphthalen-1-yl)-8-fluoroquinazolin-4-yl)-3-azabicyclo[3.2.1]octan-6-ol"
1,2,"(R)-1-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro- 1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-3-methylpiperidin-3-ol"
2,3,"5-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro- 1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)tetrahydropyrrolo[3,4-c]pyrrole- 1,3(2H,3aH)-dione"
3,4,"6-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro-1H- pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-6-azaspiro[3.5]nonan-2-ol"
4,5,"7-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro-1H- pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-1,3,7-triazaspiro[4.5]decan-2-one"
...,...,...
171,167,"2-amino-4-(6-chloro-4-(3-chloro-7,8-dihydro-4H-[1,2,3]triazolo[1,5-a][1,4]diazepin-5(6H)-yl)- 8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro-1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-7-yl)-7- fluorobenzo[b]thiophene-3-carbonitrile"
172,168,"5-(7-(2-amino-3-cyano-7-fluorobenzo[b]thiophen-4-yl)-6-chloro-8-fluoro-2-(((2R,7aS)-2- fluorotetrahydro-1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-N,N-dimethyl-5,6,7,8- tetrahydro-[1,2,3]triazolo[4,5-c]azepine-2(4H)-carboxamide"
173,169,"8-(7-(2-amino-3-cyano-7-fluorobenzo[b]thiophen-4-yl)-6-chloro-8-fluoro-2-(((2R,7aS)-2- fluorotetrahydro-1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-N,N-dimethyl-6,7,8,9- tetrahydro-5H-[1,2,4]triazolo[1,5-a][1,4]diazepine-2-carboxamide"
174,170,"2-amino-4-(6-chloro-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro-1H-pyrrolizin-7a(5H)- yl)methoxy)-4-(2-hydroxy-6-azaspiro[3.5]nonan-6-yl)quinazolin-7-yl)-7- fluorobenzo[b]thiophene-3-carbonitrile"


In [None]:
# df['SMILES'] = df.IUPAC.progress_apply(get_structure) # slow this way

In [None]:
# SMILES_list = [] # faster this way

# for i in tqdm(df.IUPAC):
#     SMILES = get_structure(i)
#     print(i,SMILES)
#     SMILES_list.append(SMILES)
#     break

In [None]:
%%time

# Look up every IUPAC name in row order, with a progress bar over the column.
SMILES_list = [get_structure(name) for name in tqdm(df.IUPAC)]

# Store the results (including any 'Did not work' sentinels) as a new column.
df['SMILES'] = SMILES_list

100%|██████████| 176/176 [02:09<00:00,  1.36it/s]

CPU times: user 794 ms, sys: 454 ms, total: 1.25 s
Wall time: 2min 9s





In [None]:
# Display the frame again, now including the SMILES column.
df

Unnamed: 0,example,IUPAC,SMILES
0,1,"(1R,5R,6R)-3-(2-((1-((dimethylamino)methyl)cyclopropyl)methoxy)-7-(8-ethyl-7-fluoro-3- hydroxynaphthalen-1-yl)-8-fluoroquinazolin-4-yl)-3-azabicyclo[3.2.1]octan-6-ol",CN(C)CC1(CC1)COC1=NC2=C(C(=CC=C2C(=N1)N1C[C@H]2C[C@H]([C@@H](C1)C2)O)C2=CC(=CC1=CC=C(C(=C21)CC)F)O)F
1,2,"(R)-1-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro- 1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-3-methylpiperidin-3-ol",C(C)C=1C(=CC=C2C=C(C=C(C12)C1=CC=C2C(=NC(=NC2=C1F)OC[C@]12CCCN2C[C@@H](C1)F)N1C[C@@](CCC1)(O)C)O)F
2,3,"5-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro- 1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)tetrahydropyrrolo[3,4-c]pyrrole- 1,3(2H,3aH)-dione",C(C)C=1C(=CC=C2C=C(C=C(C12)C1=CC=C2C(=NC(=NC2=C1F)OC[C@]12CCCN2C[C@@H](C1)F)N1CC2C(C1)C(NC2=O)=O)O)F
3,4,"6-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro-1H- pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-6-azaspiro[3.5]nonan-2-ol",C(C)C=1C(=CC=C2C=C(C=C(C12)C1=CC=C2C(=NC(=NC2=C1F)OC[C@]12CCCN2C[C@@H](C1)F)N1CC2(CC(C2)O)CCC1)O)F
4,5,"7-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro-1H- pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-1,3,7-triazaspiro[4.5]decan-2-one",C(C)C=1C(=CC=C2C=C(C=C(C12)C1=CC=C2C(=NC(=NC2=C1F)OC[C@]12CCCN2C[C@@H](C1)F)N1CC2(CNC(N2)=O)CCC1)O)F
...,...,...,...
171,167,"2-amino-4-(6-chloro-4-(3-chloro-7,8-dihydro-4H-[1,2,3]triazolo[1,5-a][1,4]diazepin-5(6H)-yl)- 8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro-1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-7-yl)-7- fluorobenzo[b]thiophene-3-carbonitrile",NC1=C(C2=C(S1)C(=CC=C2C2=C(C=C1C(=NC(=NC1=C2F)OC[C@]21CCCN1C[C@@H](C2)F)N2CC=1N(CCC2)N=NC1Cl)Cl)F)C#N
172,168,"5-(7-(2-amino-3-cyano-7-fluorobenzo[b]thiophen-4-yl)-6-chloro-8-fluoro-2-(((2R,7aS)-2- fluorotetrahydro-1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-N,N-dimethyl-5,6,7,8- tetrahydro-[1,2,3]triazolo[4,5-c]azepine-2(4H)-carboxamide",NC1=C(C2=C(S1)C(=CC=C2C2=C(C=C1C(=NC(=NC1=C2F)OC[C@]21CCCN1C[C@@H](C2)F)N2CC=1C(CCC2)=NN(N1)C(=O)N(C)C)Cl)F)C#N
173,169,"8-(7-(2-amino-3-cyano-7-fluorobenzo[b]thiophen-4-yl)-6-chloro-8-fluoro-2-(((2R,7aS)-2- fluorotetrahydro-1H-pyrrolizin-7a(5H)-yl)methoxy)quinazolin-4-yl)-N,N-dimethyl-6,7,8,9- tetrahydro-5H-[1,2,4]triazolo[1,5-a][1,4]diazepine-2-carboxamide",NC1=C(C2=C(S1)C(=CC=C2C2=C(C=C1C(=NC(=NC1=C2F)OC[C@]21CCCN1C[C@@H](C2)F)N2CC=1N(CCC2)N=C(N1)C(=O)N(C)C)Cl)F)C#N
174,170,"2-amino-4-(6-chloro-8-fluoro-2-(((2R,7aS)-2-fluorotetrahydro-1H-pyrrolizin-7a(5H)- yl)methoxy)-4-(2-hydroxy-6-azaspiro[3.5]nonan-6-yl)quinazolin-7-yl)-7- fluorobenzo[b]thiophene-3-carbonitrile",NC1=C(C2=C(S1)C(=CC=C2C2=C(C=C1C(=NC(=NC1=C2F)OC[C@]21CCCN1C[C@@H](C2)F)N2CC1(CC(C1)O)CCC2)Cl)F)C#N


In [None]:
# Show the rows whose lookup returned the failure sentinel from get_structure.
df.loc[df['SMILES'] == 'Did not work']

Unnamed: 0,example,IUPAC,SMILES
10,11,"(R)-1-(2-((1-((dimethylamino)methyl)cyclopropyl)methoxy)-7-((R)-8-ethyl-7-fluoro-3- hydroxynaphthalen-1-yl)-6,8-difluoroquinazolin-4-yl)-3-methylpiperidin-3-ol",Did not work
11,12,"(R)-1-(2-((1-((dimethylamino)methyl)cyclopropyl)methoxy)-7-((S)-8-ethyl-7-fluoro-3- hydroxynaphthalen-1-yl)-6,8-difluoroquinazolin-4-yl)-3-methylpiperidin-3-ol",Did not work
17,18,"(2S,4s)-6-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-6,8-difluoro-2-(((2R,7aS)-2- fluorohexahydro-1H-pyrrolizin-7a-yl)methoxy)quinazolin-4-yl)-6-azaspiro[3.5]nonan-2-ol",Did not work
18,19,"(2R,4r)-6-(7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-6,8-difluoro-2-(((2R,7aS)-2- fluorohexahydro-1H-pyrrolizin-7a-yl)methoxy)quinazolin-4-yl)-6-azaspiro[3.5]nonan-2-ol",Did not work
62,63,"5-((R)-7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-6,8-difluoro-2-(((2R,7aS)-2- fluorohexahydro-1H-pyrrolizin-7a-yl)methoxy)quinazolin-4-yl)-N,N-dimethyl-5,6,7,8- tetrahydro-4H-pyrazolo[1,5-a][1,4]diazepine-2-carboxamide",Did not work
63,64,"5-((S)-7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-6,8-difluoro-2-(((2R,7aS)-2- fluorohexahydro-1H-pyrrolizin-7a-yl)methoxy)quinazolin-4-yl)-N,N-dimethyl-5,6,7,8- tetrahydro-4H-pyrazolo[1,5-a][1,4]diazepine-2-carboxamide",Did not work
64,65,"(S)-4-((S)-7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-6,8-difluoro-2-(((2R,7aS)-2- fluorohexahydro-1H-pyrrolizin-7a-yl)methoxy)quinazolin-4-yl)-6-methyl-1,4-oxazepan-6-ol",Did not work
65,66,"(S)-4-((R)-7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-6,8-difluoro-2-(((2R,7aS)-2- fluorohexahydro-1H-pyrrolizin-7a-yl)methoxy)quinazolin-4-yl)-6-methyl-1,4-oxazepan-6-ol",Did not work
66,67,"(1R,5R,6R)-3-((R)-7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-6,8-difluoro-2-(((2R,7aS)-2- fluorohexahydro-1H-pyrrolizin-7a-yl)methoxy)quinazolin-4-yl)-3-azabicyclo[3.2.1]octan-6-ol",Did not work
67,68,"(1R,5R,6R)-3-((S)-7-(8-ethyl-7-fluoro-3-hydroxynaphthalen-1-yl)-6,8-difluoro-2-(((2R,7aS)-2- fluorohexahydro-1H-pyrrolizin-7a-yl)methoxy)quinazolin-4-yl)-3-azabicyclo[3.2.1]octan-6-ol",Did not work


In [None]:
# #from https://stackoverflow.com/questions/54930121/converting-molecule-name-to-smiles
# def CIRconvert(IUPAC):
#     try:
#         # url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/smiles'
#         IUPAC = IUPAC.replace(" ", "")
#         url = f'https://cactus.nci.nih.gov/chemical/structure/{IUPAC}/smiles'
#         ans = urlopen(url).read().decode('utf8')
#         return ans
#     except:
#         return 'Did not work'

In [None]:
# Write the frame with the SMILES column to disk, omitting the row index.
df.to_csv(path_or_buf='data/smi_quina.csv', index=False)