In [1]:
import numpy as np
import pandas as pd
import pubchempy as pcp
import matplotlib as plt 

In [4]:
def compound_finder(compounds, indentity):
    '''
    Downloads a fixed set of compound properties from PubChem.

    One `pcp.get_properties` request is made per entry in `compounds`, asking
    for IUPACName, CanonicalSMILES, MolecularFormula and MolecularWeight.

    compounds: iterable of compound identifiers (e.g. names or CIDs)

    indentity: namespace of the identifiers: 'name', 'cid', ... see the
        PubChemPy documentation for more. (The misspelled parameter name is
        kept on purpose so existing keyword callers keep working.)

    Returns a DataFrame with one row per record returned by PubChem, indexed
    by the identifier that was queried. If a single identifier yields several
    records, each of those rows carries that identifier as its index label.
    An empty `compounds` yields an empty DataFrame.
    '''
    properties = ['IUPACName',
                  'CanonicalSMILES',
                  'MolecularFormula',
                  'MolecularWeight']

    frames = []
    for compound in compounds:
        temp = pcp.get_properties(properties, compound, indentity, as_dataframe=True)
        # Label rows with the queried identifier per result frame instead of
        # assigning `compounds` to the final index: the latter raises a length
        # mismatch as soon as one lookup returns more (or fewer) than one row.
        temp.index = [compound] * len(temp)
        frames.append(temp)

    if not frames:
        # pd.concat raises on an empty list; keep the "empty in, empty out" contract.
        return pd.DataFrame()

    # Concatenate once at the end rather than growing a DataFrame inside the loop.
    return pd.concat(frames)

In [15]:
def compound_finder_name(compounds):
    '''
    Downloads the synonyms and canonical SMILES of compounds given by CID.

    One `pcp.Compound.from_cid` lookup is made per CID.

    compounds: iterable of CIDs (integers),
        e.g. compounds = list(range(start, stop))

    Returns a DataFrame with one row per CID, in input order, with the columns
    'synonym' (the compound's `synonyms` attribute) and 'smiles' (its
    `canonical_smiles` attribute). An empty `compounds` yields an empty
    DataFrame that still has both columns.
    '''
    records = []
    for cid in compounds:
        c = pcp.Compound.from_cid(cid)
        smile = c.canonical_smiles
        synonym = c.synonyms
        records.append({'synonym': synonym, 'smiles': smile})

    # Explicit columns keep the column order fixed and keep both columns
    # present even when no CID was given.
    return pd.DataFrame(records, columns=['synonym', 'smiles'])

In [18]:
# Quick smoke test of compound_finder_name on three CIDs.
# Teaching note: let the students code the function themselves, but for the
# 'real' exercise we must provide a bigger, pre-downloaded dataset, because
# downloading the data from PubChem takes a very long time.
comps = compound_finder_name([1, 2,3])
print(comps)

                Acetyl-DL-carnitine
0  CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
  (2-acetyloxy-4-hydroxy-4-oxobutyl)-trimethylazanium
0                      CC(=O)OC(CC(=O)O)C[N+](C)(C)C 


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


  5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic acid
0                           C1=CC(C(C(=C1)C(=O)O)O)O
  (2-acetyloxy-4-hydroxy-4-oxobutyl)-trimethylazanium  \
0                                                NaN    
0                      CC(=O)OC(CC(=O)O)C[N+](C)(C)C    
0                                                NaN    

  5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic acid  \
0                                                NaN   
0                                                NaN   
0                           C1=CC(C(C(=C1)C(=O)O)O)O   

                Acetyl-DL-carnitine  
0  CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C  
0                               NaN  
0                               NaN  


In [5]:
# Fetch the property table for CIDs 1-99 (range's upper bound is exclusive).
# compound_finder issues one PubChem request per CID, so this cell is slow.
compounds = compound_finder(list(range(1,100)), 'cid')


In [6]:
# Preview the first rows of the downloaded property table.
print(compounds.head())

                    CanonicalSMILES  \
1  CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C   
2     CC(=O)OC(CC(=O)O)C[N+](C)(C)C   
3          C1=CC(C(C(=C1)C(=O)O)O)O   
4                           CC(CN)O   
5              C(C(=O)COP(=O)(O)O)N   

                                           IUPACName MolecularFormula  \
1        3-acetyloxy-4-(trimethylazaniumyl)butanoate         C9H17NO4   
2     (2-acetyloxy-3-carboxypropyl)-trimethylazanium        C9H18NO4+   
3  5,6-dihydroxycyclohexa-1,3-diene-1-carboxylic ...           C7H8O4   
4                                 1-aminopropan-2-ol           C3H9NO   
5         (3-amino-2-oxopropyl) dihydrogen phosphate         C3H8NO5P   

   MolecularWeight  
1          203.238  
2          204.246  
3          156.137  
4           75.111  
5          169.073  
