In [16]:
import numpy as np
import pandas as pd
import pubchempy as pcp
import matplotlib as plt
# python debugger
import pdb
# for regular expression (regex)
import re

In [3]:
!pip install tqdm
# tqdm (from Arabic "taqaddum", meaning "progress"): progress bars for Python loops
from tqdm import tqdm



In [4]:
def compound_finder(compounds, indentity):
    '''
    Downloads the defined compound properties from Pubchem
    
    compounds: iterable of compound identifiers (e.g. names or CIDs)
    
    indentity: (sic - the misspelled parameter name is kept so existing callers
        keep working) 'name', 'cid', ... see PubchemPy documentation for more
    
    returns: DataFrame holding whatever rows PubChem returned for the requested
        properties, in query order. Every row is labelled with the identifier
        that was queried, so an identifier that resolves to several records
        appears several times in the index and one that resolves to nothing
        does not appear at all. An empty DataFrame is returned when there is
        nothing to concatenate.
    '''
    
    # Materialise once: a generator would otherwise be exhausted by the loop.
    compounds = list(compounds)
    properties = ['IUPACName',
                  'CanonicalSMILES',
                  'MolecularFormula',
                  'MolecularWeight']
    
    # Collect the per-compound frames and concatenate once at the end;
    # concatenating inside the loop copies the growing frame on every pass.
    frames = []
    for compound in tqdm(compounds):
        temp = pcp.get_properties(properties, compound, indentity, as_dataframe=True)
        # Label rows per frame instead of assigning `compounds` to the final
        # index: the latter raises a length mismatch as soon as one lookup
        # returns zero or more than one row.
        temp.index = [compound] * len(temp)
        frames.append(temp)
    
    if not frames:
        # pd.concat refuses an empty list.
        return pd.DataFrame()
    return pd.concat(frames)

In [5]:
def compound_finder_name(compounds):
    '''
    Looks up the SMILES string and the synonym list of each compound by CID.

    compounds: list of CIDs (integers), e.g. list(range(start, stop)).

    returns: DataFrame with one row per CID, in input order, and the columns
        'synonym' (value of the compound's `synonyms` attribute) and
        'smiles' (value of its `canonical_smiles` attribute).
    '''
    records = []
    for cid in tqdm(compounds):
        entry = pcp.Compound.from_cid(cid)
        # Read the SMILES before the synonyms, one compound at a time.
        smiles_value = entry.canonical_smiles
        records.append((entry.synonyms, smiles_value))

    return pd.DataFrame({
        'synonym': [names for names, _ in records],
        'smiles': [smiles_value for _, smiles_value in records],
    })

In [6]:
# Students should code the function themselves, but for the 'real' exercise we must
# provide a bigger dataset, as downloading the data from PubChem takes a long time.
comps = compound_finder_name([1, 2,3]) # smoke test of the function on three CIDs only
print(comps)

100%|██████████| 3/3 [00:03<00:00,  1.24s/it]

                                             synonym  \
0  [Acetyl-DL-carnitine, DL-O-Acetylcarnitine, ac...   
1  [(2-acetyloxy-4-hydroxy-4-oxobutyl)-trimethyla...   
2  [5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic...   

                             smiles  
0  CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C  
1     CC(=O)OC(CC(=O)O)C[N+](C)(C)C  
2          C1=CC(C(C(=C1)C(=O)O)O)O  





In [7]:
# Fetch the properties for CIDs 1..99; compound_finder queries PubChem once per CID.
compounds = compound_finder(list(range(1,100)), 'cid')

100%|██████████| 99/99 [01:00<00:00,  1.80it/s]


In [8]:
# first 5 molecules (DataFrame.head() returns 5 rows by default)
print(compounds.head())

                    CanonicalSMILES  \
1  CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C   
2     CC(=O)OC(CC(=O)O)C[N+](C)(C)C   
3          C1=CC(C(C(=C1)C(=O)O)O)O   
4                           CC(CN)O   
5              C(C(=O)COP(=O)(O)O)N   

                                           IUPACName MolecularFormula  \
1        3-acetyloxy-4-(trimethylazaniumyl)butanoate         C9H17NO4   
2     (2-acetyloxy-3-carboxypropyl)-trimethylazanium        C9H18NO4+   
3  5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic ...           C7H8O4   
4                                 1-aminopropan-2-ol           C3H9NO   
5         (3-amino-2-oxopropyl) dihydrogen phosphate         C3H8NO5P   

   MolecularWeight  
1          203.238  
2          204.246  
3          156.137  
4           75.111  
5          169.073  


In [9]:
# column labels of the downloaded table, as a NumPy array
compounds.columns.values

array(['CanonicalSMILES', 'IUPACName', 'MolecularFormula',
       'MolecularWeight'], dtype=object)

In [10]:
head = compounds.head()
# NOTE: only the last expression of a cell is displayed, so this line shows nothing here
type(head)
# 3 ways of accessing column 'MolecularFormula'
print(head['MolecularFormula'])  # by label, [] shorthand
print(head.loc[:, 'MolecularFormula'])  # by label: all rows, one column
print(head.iloc[:, 2])  # by position: relies on 'MolecularFormula' being the third column

1     C9H17NO4
2    C9H18NO4+
3       C7H8O4
4       C3H9NO
5     C3H8NO5P
Name: MolecularFormula, dtype: object
1     C9H17NO4
2    C9H18NO4+
3       C7H8O4
4       C3H9NO
5     C3H8NO5P
Name: MolecularFormula, dtype: object
1     C9H17NO4
2    C9H18NO4+
3       C7H8O4
4       C3H9NO
5     C3H8NO5P
Name: MolecularFormula, dtype: object


In [11]:
# Boolean-mask filtering: split the rows of `head` on an exact MolecularFormula match.
# NOTE(review): despite the variable names, nothing here tests for ring structures;
# the rows are only compared against the formula string 'C9H17NO4'.
ring = head[head['MolecularFormula'] == 'C9H17NO4']
non_ring = head[head['MolecularFormula'] != 'C9H17NO4']
print(f"## ring: ##\n\n{ring}")
print(f"## non_ring: ##\n\n{non_ring}")

## ring: ##

                    CanonicalSMILES  \
1  CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C   

                                     IUPACName MolecularFormula  \
1  3-acetyloxy-4-(trimethylazaniumyl)butanoate         C9H17NO4   

   MolecularWeight  
1          203.238  
## non_ring: ##

                 CanonicalSMILES  \
2  CC(=O)OC(CC(=O)O)C[N+](C)(C)C   
3       C1=CC(C(C(=C1)C(=O)O)O)O   
4                        CC(CN)O   
5           C(C(=O)COP(=O)(O)O)N   

                                           IUPACName MolecularFormula  \
2     (2-acetyloxy-3-carboxypropyl)-trimethylazanium        C9H18NO4+   
3  5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic ...           C7H8O4   
4                                 1-aminopropan-2-ol           C3H9NO   
5         (3-amino-2-oxopropyl) dihydrogen phosphate         C3H8NO5P   

   MolecularWeight  
2          204.246  
3          156.137  
4           75.111  
5          169.073  


In [20]:
def contains_ringstruct(smiles):
    '''
    Tells whether a SMILES string describes at least one ring.

    In SMILES, a digit outside a bracket atom is a ring-closure label
    (two-digit labels are written %nn and therefore also contain digits),
    whereas digits inside [...] are isotopes, hydrogen counts or charges.
    The pattern matches a digit whose next bracket character, if any, is not
    a closing ']', i.e. a digit that is not inside a bracket atom. Unlike a
    plain "C followed by a digit" test this also finds aromatic rings
    (c1ccccc1) and closures written on heteroatoms or bracket atoms.

    smiles: SMILES string

    returns: the re.Match of the first ring-closure digit (truthy), or None
        when the string has no ring closure
    '''
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return re.search(r'\d(?![^\[\]]*\])', smiles)

# One boolean per row of `compounds`: does its SMILES contain a ring closure?
# A plain NumPy mask is used so the selection is purely positional.
has_ring = np.array(
    [bool(contains_ringstruct(smiles)) for smiles in tqdm(compounds['CanonicalSMILES'])],
    dtype=bool,
)
# Select with the mask instead of appending row by row: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
# reset_index(drop=True) starts counting the index anew instead of copying the
# original index (what ignore_index=True did before).
ring = compounds[has_ring].reset_index(drop=True)
non_ring = compounds[~has_ring].reset_index(drop=True)
print(f"## ring: ##\n\n{ring}")
print()
print(f"## non_ring: ##\n\n{non_ring}")

99it [00:00, 419.30it/s]

## ring: ##

                                      CanonicalSMILES  \
0                            C1=CC(C(C(=C1)C(=O)O)O)O   
1           C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl   
2                                CCN1C=NC2=C1N=CN=C2N   
3                   C1(C(C(C(C(C1O)O)OP(=O)(O)O)O)O)O   
4   C1C(NC2=C(N1)NC(=NC2=O)N)CN(C=O)C3=CC=C(C=C3)C...   
5                               C1=C(C=C(C(=C1O)O)O)O   
6                               C1=CC(=C(C=C1Cl)Cl)Cl   
7             CCCCCC(=O)C=CC1C(CC(=O)C1CCCCCCC(=O)O)O   
8                  CC12CCC(=O)CC1CCC3C2CCC4(C3CCC4O)C   
9                            C1CCC(=O)NCCCCCC(=O)NCC1   
10                           C1C=CC(=NC1C(=O)O)C(=O)O   
11                          C1=CC(=C(C(=C1)O)O)C(=O)O   
12                        C1=CC(=C(C(=C1)O)O)CCC(=O)O   
13     C(C(C(C1(C(N(C2C(N1O)C(=O)NC(=N2)N)O)O)O)O)O)O   
14        C(C(C(C1C(N(C2=C(N1)C(=O)N=C(N2)N)O)O)O)O)O   
15             CC(=O)C(=O)C1=NC2C(N=C(NC2=O)N)N(C1O)O   
16  C(C(C(C1=NC2=C


