In [1]:
from rdkit.Chem import MolFromSmiles, MolToSmiles, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from Levenshtein import distance as lev # pip install python-Levenshtein
from langchain.agents import Tool
import pubchempy

In [2]:
def levenshtein(items: list, query: str) -> str:
    '''
    Given a list of strings
    and a query,
    return the list item with the smallest Levenshtein distance

    parameters:
        items: list of strings
        query: target string

    returns:
        closest string from the given list
    '''
    items_post = [s.replace(" ", "") for s in items] # ignore spaces
    lev_list = [lev(item,query, weights=(1,1,1)) for item in items_post] # get distances
    
    return items[lev_list.index(min(lev_list))] # return closest

x = ['iupac_name', 'cid', 'cactvs_fingerprint', 'canonical_smiles', 'inchi']
query = "canon SMILES"
print(query,'is the closest to',levenshtein(x,query))

query = "iupac"
print(query,'is the closest to',levenshtein(x,query))

x = [x[0] for x in Descriptors.descList[:124]]
query = "Count Heavy Atoms"
print(query,'is the closest to',levenshtein(x,query))

canon SMILES is the closest to cid
iupac is the closest to inchi
Count Heavy Atoms is the closest to NumHeteroatoms


In [3]:
def calculate_descriptors(compound: str, descriptors: list) -> list:
    '''
    Given a SMILES string and chosen descriptor(s),
    return the value for each descriptor

    parameters:
        compound: a chemical compound in SMILES format
        descriptors: a list of RDKit-compatible descriptors

    returns:
        list of computed properties
    '''
    all_compatible = [x[0] for x in Descriptors.descList[:124]]
    descriptors   = [levenshtein(all_compatible, x) for x in descriptors]
    print(f"Calculating {descriptors}")
    
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
    mol  = MolFromSmiles(compound)
    
    return list(calc.CalcDescriptors(mol)) # convert to list form

Calculate_Descriptors = Tool(
    name="calculate_descriptors",
    func=calculate_descriptors,
    description="""calculates chosen descriptors for a smiles. input should be json in the following format: `{{"compound":'<compound_smiles>', "descriptors":'[<descriptor_list>]'}}`"""
)

x = calculate_descriptors('CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F)cc2)n1CC[C@@H](O)C[C@@H](O)CC(=O)O',
                         ['Molar Weight', 'Heavy Atom Count', 'Number of Hydrogen Acceptors'])
print(f"Atorvastatin has {round(x[0],2)} ExactMolWt, {round(x[1],2)} HeavyAtomCount, and {round(x[2],2)} Hydrogen Acceptors")

Calculating ['MolWt', 'HeavyAtomCount', 'NumHAcceptors']
Atorvastatin has 558.65 ExactMolWt, 41 HeavyAtomCount, and 5 Hydrogen Acceptors


In [4]:
def convert_names(input_name: str, input_type: str, output_type: str) -> str:
    '''
    Given any compatible name
    and its type (anything from pubchem lowercase and using _ for space),
    return a different, chosen name.

    parameters:
        input_name: Any compatible name for a chemical compound
        input_type: The type of the inputted name
        output_type: The type of name for the output

    returns:
        a string with the desired chemical name
    '''
    all_compatible = ['name', 'canonical_smiles', 'inchi', 'inchikey', 'isomeric_smiles', 
                      'iupac_name', 'cid', 'cactvs_fingerprint', 'molecular_formula']
    input_type  = levenshtein(all_compatible, input_type)
    output_type = levenshtein(all_compatible, output_type)
    print(f"Converting {input_type} to {output_type}")
    
    if input_type[-7:] in ['_smiles','formula']: return # smiles search isn't working, and formula is not unique
    elif input_type == 'CID': compound = pubchempy.Compound.from_cid(int(input_name)) # directly instantiate
    else: compound = pubchempy.get_compounds(input_name,input_type)[0] # get first entry

    if output_type == 'name': return getattr(compound, 'synonyms')[0] # get first, common synonym
    return getattr(compound, output_type) # equivalent to compound.canonical_smiles or other types

Convert_Names = Tool(
    name="convert_names",
    func=convert_names,
    description="""converts one name for a compound to another. input should be json in the following format: `{{"input_name":'<compound_name>', "input_type":'<name_type>', "output_type":'<new_name_type>'}}`"""
)

x = convert_names(input_name='Atorvastatin',input_type='Name',output_type='Canonical SMILES')
print(f"Atorvastatin in SMILES format is {x}")

Converting name to canonical_smiles
Atorvastatin in SMILES format is CC(C)C1=C(C(=C(N1CCC(CC(CC(=O)O)O)O)C2=CC=C(C=C2)F)C3=CC=CC=C3)C(=O)NC4=CC=CC=C4
