In [1]:
import numpy as np
import pandas as pd
import pubchempy as pcp
import matplotlib as plt 
import pdb

In [2]:
!pip install tqdm
# tqdm adds a progress bar to a for loop when you wrap the loop's iterable in tqdm(...)
from tqdm import tqdm



In [7]:
def compound_finder(compounds, indentity):
    '''
    Downloads a fixed set of compound properties from PubChem.

    Each identifier is looked up with ``pcp.get_properties`` (IUPAC name,
    canonical SMILES, molecular formula, molecular weight) and the
    per-identifier results are stacked into one DataFrame.

    compounds: iterable of compound identifiers (e.g. names or CIDs).
        It is materialised into a list first, so one-shot iterators work
        and the progress bar knows the total.

    indentity: namespace of the identifiers: 'name', 'cid', ... see the
        PubChemPy documentation for more. (The parameter keeps its original
        spelling so that existing keyword callers do not break.)

    returns: DataFrame whose rows are labelled with the identifier that was
        queried. If one lookup yields several rows, each of them carries
        that identifier as its label. An empty input gives an empty
        DataFrame.
    '''
    properties = ['IUPACName',
                  'CanonicalSMILES',
                  'MolecularFormula',
                  'MolecularWeight']

    compounds = list(compounds)
    frames = []
    for compound in tqdm(compounds):
        temp = pcp.get_properties(properties, compound, indentity,
                                  as_dataframe=True)
        # Label every returned row with the identifier that produced it.
        # Assigning the whole input list as the index afterwards only works
        # when each lookup returns exactly one row.
        temp.index = [compound] * len(temp)
        frames.append(temp)

    # pd.concat raises on an empty list, so handle "nothing requested" here.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames)

In [4]:
def compound_finder_name(compounds):
    '''
    Looks up compounds on PubChem by CID and collects their synonyms and
    canonical SMILES.

    compounds: list of CIDs (integers).
    Use compounds = list(range(from, to))

    returns: DataFrame with the columns 'synonym' and 'smiles', one row per
    CID in input order.
    '''
    fetched = []
    for cid in tqdm(compounds):
        record = pcp.Compound.from_cid(cid)
        # The tuple is evaluated left to right: SMILES first, then synonyms.
        fetched.append((record.canonical_smiles, record.synonyms))

    return pd.DataFrame({
        'synonym': [names for _, names in fetched],
        'smiles': [smi for smi, _ in fetched],
    })

In [5]:
# Quick test of compound_finder_name on three CIDs only. Students should code
# the function themselves, but for the 'real' exercise we must provide a bigger
# dataset, because downloading the data from PubChem takes a very long time.
comps = compound_finder_name([1, 2,3])
print(comps)

100%|██████████| 3/3 [00:03<00:00,  1.34s/it]

                                             synonym  \
0  [Acetyl-DL-carnitine, DL-O-Acetylcarnitine, ac...   
1  [(2-acetyloxy-4-hydroxy-4-oxobutyl)-trimethyla...   
2  [5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic...   

                             smiles  
0  CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C  
1     CC(=O)OC(CC(=O)O)C[N+](C)(C)C  
2          C1=CC(C(C(=C1)C(=O)O)O)O  





In [8]:
# Download the properties for CIDs 1 to 99 (range(1, 100) excludes 100),
# treating each identifier as a 'cid'.
compounds = compound_finder(list(range(1,100)), 'cid')


100%|██████████| 99/99 [01:02<00:00,  1.60it/s]


In [None]:
# Preview the first rows of the table returned by compound_finder.
print(compounds.head())