Preliminary imports:

In [2]:
import getpass
import os
from typing import Annotated, List, Tuple, Union

from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.chains import create_history_aware_retriever
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.messages import BaseMessage, FunctionMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langgraph.graph import END, MessageGraph

Setting up API keys:

In [4]:
# Secrets are read from the environment first and only prompted for when
# missing, so real keys never have to be written into the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY") or getpass.getpass("LangChain API key: ")

# Tracing configuration read by LangChain from the process environment.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY

# Shared parser that reduces a chat model response to its text content.
output_parser = StrOutputParser()

States:

Class structure used to format code generation and pass error messages

In [23]:
from typing import Any, Dict, TypedDict, List

class GraphState(TypedDict):
    """
    Represents the state of the graph.

    Attributes:
        error : Control-flow flag, "yes" when the last import/execution check failed, otherwise "no"
        messages : Conversation entries: user question, generated solutions, error feedback
        generation : Structured code solution from the generator (read via .prefix, .imports, .code)
        iterations : Number of generation attempts made so far
        code_result: Captured stdout of the code solution once it executed without error
    """

    error : str
    messages : List
    # The generator returns a structured object, not a plain string, so the
    # annotation is left open rather than claiming `str`.
    generation : Any
    iterations : int
    code_result: str

Parser Agent:

In [172]:
# Chat model shared by the parser and interpreter chains.
llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY, model="gpt-4-0125-preview")

# Prompt that turns a free-text request into structured BRENDA query fields.
# Literal braces in the JSON examples are doubled so the template engine does
# not treat them as variables. The examples are valid JSON (key, colon, value)
# so the model is not taught a malformed output format.
prompt = ChatPromptTemplate.from_messages([
    ("system", 
     '''You are an expert at parsing information from requests regarding the BRENDA enzyme database. Based on \
     the BRENDA SOAP API documentation, analyze the following request and try to extrapolate EC numbers, organism names, and other relevant \
     information for querying the BRENDA SOAP API based on the question being asked. \
     Return the result in a JSON format.  \
     If the user mentions an enzyme, use background knowledge of the BRENDA database to assign that an EC number.\
     Organism names should be rewritten as binomial nomenclature. \
     Make sure you specify the actual question being requested by the user in a field called "user_question". \
     Add any other relevant information needed for the BRENDA API call by creating new fields as needed. \
     Replace a field with None if nothing can be extrapolated for that field. \
     
     For example: if the user asks "What are the ligands associated with inhibition in DNA-directed DNA polymerase in humans?", 
     return:
     {{
        "user_question": "What are the ligands associated with inhibition in DNA-directed DNA polymerase in humans?",
        "EC_number": "2.7.7.7",
        "organism": "Homo sapiens",
        "ligand_properties": "inhibition"
     }}
     In this example, ligand_properties is an example of a new field created for this specific question to fill in relevant information for answering the question. \
     In this case, it is describing the type of ligands to search for, and will not appear in every question.

     Another example: if the user asks "What is the optimum pH for aldose reductase in mice?", 
     return:
     {{
        "user_question": "What is the optimum pH for aldose reductase in mice?",
        "EC_number": "1.1.1.21",
        "organism": "Mus musculus",
        "additional_data": "optimum pH values"
     }}
    '''),
    ("user", "{input}")
])


output_parser = StrOutputParser()

# output_parser reduces the model response to just its string content
chain = prompt | llm | output_parser
test1 = chain.invoke({"input":"What are the Ki values using DNA-directed DNA polymerase in humans?"})

print(test1)

{
  "user_question": "What are the Ki values using DNA-directed DNA polymerase in humans?",
  "EC_number": "2.7.7.7",
  "organism": "Homo sapiens",
  "additional_data": "Ki values"
}


Coder Agent

In [206]:
# using a document loader
# Load, chunk and index the contents of the BRENDA SOAP API documentation page.
import bs4
from langchain import hub
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import WebBaseLoader, UnstructuredHTMLLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.pydantic_v1 import BaseModel, Field

# Loads the BRENDA SOAP API documentation page from the web
loader = WebBaseLoader("https://www.brenda-enzymes.org/soap.php")
docs = loader.load()

# Split the page into 500-character chunks that overlap by 200 characters,
# then embed the chunks with OpenAI embeddings into a Chroma vector store.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(openai_api_key = OPENAI_API_KEY))

# Retriever over the indexed documentation chunks; it supplies the context
# for the code generator.
retriever = vectorstore.as_retriever()

# PROMPT FOR THE CODE GENERATOR
# {context} receives the retrieved documentation text and {messages} the
# running conversation (question, earlier attempts, error feedback).
# Every few-shot example is a complete script: each one imports both hashlib
# and zeep's Client, and the "in humans" example fills in the organism, so the
# examples agree with the instructions given in the prompt itself.
# NOTE(review): the account e-mail and the "PASSWORD" placeholder are embedded
# in the prompt text; consider injecting them from configuration instead.
code_generator_prompt = ChatPromptTemplate.from_messages(
    [("system",'''You are a expert coding assistant who is proficient with the BRENDA SOAP API and Python 3. 
                                        
    Write code to print data that can then answer the user's question.
    Use only the following pieces of retrieved context from the BRENDA SOAP API documentation to answer the question. Do not retrieve data from any external sources to answer the question.
    Context: {context}
      
    Feel free to make as many or as few API calls as needed to the BRENDA SOAP API to answer the question.
    Make sure that any API calls are *ONLY* using the function names, syntax, and parameters *ACTUALLY DOCUMENTED* in the context BRENDA SOAP API documentation *VERBATIM*.
    If the question requires more than one API call to answer it, write multiple API calls. Ensure any code you provide can be executed with all required imports and variables defined.
    Structure your answer with a description of the code solution. Then list the imports. And finally list the functioning code block. 
    
    Make sure to include all of the code in the needed to run any API calls necessary, including:
        wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
        password = hashlib.sha256("PASSWORD".encode("utf-8")).hexdigest()
        client = Client(wsdl)

    The email for any BRENDA SOAP API calls made are "siavash.raissi@tufts.edu".
    Do not print anything other than the code solution.
    Make sure a print statement is included to print the answer to the user question in the terminal.
    
    Fill in *ALL* of the other parameters as needed according only to the BRENDA API documentation and examples.
    The priority is that that *EVERY* parameter is included as specified by the BRENDA SOAP API documentation, and the 'ecNumber*' and 'organism*' fields are filled out.
      
    If an error occurs with "Missing element", go through the function syntax in the context BRENDA API documentation, and for each parameter, sure that **EVERY** parameter is included in the corresponding code.
    Often, the parameter missing is NOT the one specified by the error.
                       
    For example, for the question: "What is the Ki values for enzyme 1.1.1.1 in humans?", the following code should be printed as according to the BRENDA SOAP API documentation:
        from zeep import Client
        import hashlib
        wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
        password = hashlib.sha256("PASSWORD".encode("utf-8")).hexdigest()
        client = Client(wsdl)
        parameters = ("siavash.raissi@tufts.edu",password,"ecNumber*1.1.1.1", "kiValue*", "kiValueMaximum*", "inhibitor*", "commentary*", "organism*Homo sapiens", "ligandStructureId*", "literature*")
        resultString = client.service.getKiValue(*parameters)
        print(resultString)

    Another example: for the user question: "What has a higher range of IC50 values: enzyme 1.1.1.1 or enzyme 1.1.1.2?", the following code should be printed as according to the BRENDA SOAP API documentation:
        import hashlib
        from zeep import Client
        wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
        password = hashlib.sha256("PASSWORD".encode("utf-8")).hexdigest()
        client = Client(wsdl)
        # For enzyme 1.1.1.1
        parameters_1 = ("siavash.raissi@tufts.edu", password, "ecNumber*1.1.1.1", "ic50Value*", "ic50ValueMaximum*", "inhibitor*", "commentary*", "organism*", "ligandStructureId*", "literature*")
        resultString_1 = client.service.getIc50Value(*parameters_1)
        # For enzyme 1.1.1.2
        parameters_2 = ("siavash.raissi@tufts.edu", password, "ecNumber*1.1.1.2", "ic50Value*", "ic50ValueMaximum*", "inhibitor*", "commentary*", "organism*", "ligandStructureId*", "literature*")
        resultString_2 = client.service.getIc50Value(*parameters_2)
        print(resultString_1, resultString_2)
      
    Another example: if the user asks "What inhibitors are known for the enzyme acetaldehyde dehydrogenase?", the following code should be printed as according to the BRENDA SOAP API documentation:
        import hashlib 
        from zeep import Client
        wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
        password = hashlib.sha256("PASSWORD".encode("utf-8")).hexdigest()
        client = Client(wsdl)
        parameters = ("siavash.raissi@tufts.edu",password,"ecNumber*1.2.1.10", "organism*", "inhibitor*", "commentary*", "ligandStructureId*", "literature*")
        resultString = client.service.getInhibitors(*parameters)
        print(resultString)
      
    Another example: if the user asks: "What are the subunits in the enzyme pyruvate kinase in humans?", the following code should be printed as according to the BRENDA SOAP API documentation:
        import hashlib 
        from zeep import Client
        wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
        password = hashlib.sha256("PASSWORD".encode("utf-8")).hexdigest()
        client = Client(wsdl)
        parameters = ("siavash.raissi@tufts.edu",password,"ecNumber*2.7.1.40", "subunits*", "organism*Homo sapiens", "commentary*", "literature*")
        resultString = client.service.getSubunits(*parameters)
        print(resultString)

    Here is the user question:"'''),
    ("placeholder", "{messages}")]
)

# pydantic template for how to structure code - helps pinpoint errors
# This class is handed to `with_structured_output`, so the generator replies
# with separate prefix / imports / code parts that the executor can test
# individually.
# NOTE(review): the class docstring and the Field descriptions are presumably
# sent to the model as schema text, so they are left untouched here.
class code(BaseModel):
    """Code output"""

    prefix: str = Field(description="Description of the problem and approach")
    imports: str = Field(description="Code block import statements")
    code: str = Field(description="Code block not including import statements")
    # NOTE(review): un-annotated class attribute; pydantic may treat it as an
    # additional field with a default rather than as schema metadata - confirm.
    description = "Schema for code solutions to questions using BRENDA."

# Collapses retrieved documentation chunks into a single context string
def format_docs(docs):
    """Return the ``page_content`` of every document, separated by blank lines."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# Dedicated chat model for code generation (temperature 0).
code_generator_llm = ChatOpenAI(model="gpt-4-0125-preview", temperature=0, openai_api_key = OPENAI_API_KEY)
# Prompt piped into the model, which is bound to the `code` schema so each
# reply comes back as a structured object instead of free text.
code_generator_chain = (
    code_generator_prompt
    | code_generator_llm.with_structured_output(code)
)

# test solution (disabled)
# NOTE(review): `retriever | format_docs` below is a runnable pipeline, not retrieved
# text; the prompt's {context} variable presumably needs the formatted string - confirm.
# solution = code_generator_chain.invoke({"context": retriever | format_docs, "messages":[("user","What is the molecular weight of EC 1.1.1.2 in humans?")]})
# print(solution)

In [None]:
def agent_node(state, agent, name):
    """
    Run an agent on the current state and wrap its output as a state update.

    Args:
        state: The current graph state, passed unchanged to ``agent.invoke``
        agent: Any object exposing ``invoke(state)``
        name (str): Node name recorded on the message and as the sender

    Returns:
        dict: ``{"messages": [message], "sender": name}``
    """
    result = agent.invoke(state)
    # We convert the agent output into a format that is suitable to append to the global state
    if isinstance(result, FunctionMessage):
        # Function messages are forwarded unchanged
        pass
    elif isinstance(result, str):
        # Chains that end in a string output parser return plain text, which
        # has no .dict() to unpack
        result = HumanMessage(content=result, name=name)
    else:
        result = HumanMessage(**result.dict(exclude={"type", "name"}), name=name)
    return {
        "messages": [result],
        # Since we have a strict workflow, we can
        # track the sender so we know who to pass to next.
        "sender": name,
    }

Interpreter Agent

In [37]:

# TEST BRENDA API CALL
# -------------------------------------------------------------
# test, manually written BRENDA API call:
# querying the molecular weight of aldose reductase
#
# from zeep import Client
# import hashlib
# wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
# password = hashlib.sha256("PASSWORD".encode("utf-8")).hexdigest()
# client = Client(wsdl)
# parameters = ("siavash.raissi@tufts.edu",password,"ecNumber*1.1.1.21", "organism*Homo sapiens", 
#               "molecularWeight*", "molecularWeightMaximum*", "commentary*", "literature*")
# resultString = client.service.getMolecularWeight(*parameters)
#
# user_question = "What is the molecular weight of aldose reductase?"
# -------------------------------------------------------------

import functools

# Prompt for the interpreter: the system message carries the captured API
# output ({code_result}) and the placeholder slot receives the user's
# question as a list of messages.
interpreterprompt = ChatPromptTemplate.from_messages([
    ("system", '''You are an expert at understanding and interpreting results from the BRENDA API. Use the data generated by BRENDA to answer the user's question \
     concisely.
     
     Here is the result of the BRENDA API call: {code_result}
     
     Here is the user question:'''),
     ("placeholder", "{question}")
])

output_parser = StrOutputParser()

# NOTE: output_parser is created above but is not piped into this chain, so
# the chain yields the chat model's raw output rather than a plain string.
# Both names below are assigned again by a later cell in this notebook.
interpreterchain = interpreterprompt | llm
interpreternode = functools.partial(agent_node, agent=interpreterchain, name="interpret")


# test2 = chain.invoke({"user_question":user_question, "resultString":resultString})
# test1 = chain.invoke({"input":"What are the tissues using DNA-directed DNA polymerase in humans?"})
# print(resultString) 
# print(test2)
# accurately reports a range of 32,000-39,000 da 
# should it be more precise, give a consensus?

Merging Executor Agent with Coder Agent

In [198]:
from operator import itemgetter
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough
import io
from contextlib import redirect_stdout

# Max tries: decide_to_interpret stops retrying once the iteration counter
# reaches this value
max_iterations = 5

# Reflect: set to 'reflect' to route failed attempts through the reflect node
# before regenerating; any other value regenerates directly
# flag = 'reflect'
flag = 'do not reflect'
 
### Nodes

# return to this function if the code has an error
def generate(state: GraphState):
    """
    Generate a code solution.

    The documentation context is retrieved with the first message (the user
    question) and passed to the generator as formatted text.

    Args:
        state (dict): The current graph state; "error" may be absent on the
            first pass and is then treated as "no"

    Returns:
        dict: Updated "generation", "messages" and "iterations" entries
    """

    print("GENERATING CODE SOLUTION")

    # State (copied so the caller's message list is never mutated in place)
    messages = list(state["messages"])
    iterations = state["iterations"]
    error = state.get("error", "no")

    # generates again based off error BEFORE trying to generate the solution
    if error == "yes":
        messages.append(("user","Now, try again, specifically targeting the error that was generated. Invoke the code tool to structure the output with a prefix, imports, and code block:"))

    # The first entry is the user's question; it may be a (role, text) pair
    # or a message object exposing .content
    question_text = ""
    if messages:
        first_message = messages[0]
        if isinstance(first_message, (tuple, list)):
            question_text = first_message[1]
        else:
            question_text = getattr(first_message, "content", str(first_message))

    # Run the retriever so the prompt receives documentation text; passing the
    # un-invoked `retriever | format_docs` pipeline would only interpolate the
    # pipeline object into the prompt
    context = format_docs(retriever.invoke(question_text)) if question_text else ""

    # Solution
    code_solution = code_generator_chain.invoke({"context": context, "messages": messages})
    messages = messages + [("assistant",f"{code_solution.prefix} \n Imports: {code_solution.imports} \n Code: {code_solution.code}")]

    # Increment
    return {"generation": code_solution, "messages": messages, "iterations": iterations + 1}

# EXECUTOR AGENT
def code_check(state: GraphState):
    """
    Check the generated code by executing it.

    The imports are executed first, then the code body. Each part runs exactly
    once, and the body's standard output is captured during that single run,
    so API calls made by the generated code are not repeated.

    Args:
        state (dict): The current graph state

    Returns:
        dict: "generation", "messages", "iterations" and "error" ("yes"/"no");
            on success also "code_result" with the captured standard output
    """

    print("CHECKING CODE")

    # State (copied so the caller's message list is never mutated in place)
    messages = list(state["messages"])
    code_solution = state["generation"]
    iterations = state["iterations"]

    # Get solution components
    imports = code_solution.imports
    code = code_solution.code

    # SECURITY NOTE: exec() runs model-generated code with full interpreter
    # privileges. A dedicated namespace keeps it from reading or clobbering
    # this function's variables, but it is not a sandbox.
    namespace = {"__name__": "__main__"}

    # Check if the import specifically fails
    try:
        exec(imports, namespace)
    except Exception as e:
        print("CODE IMPORT CHECK: FAILED")
        messages.append(("user", f'''Your solution failed the import test: {e}'''))
        return {"generation": code_solution, "messages": messages, "iterations": iterations, "error": "yes"}

    # Run the non-import code once, in the namespace holding the imports,
    # capturing everything it prints
    stdout = io.StringIO()
    try:
        with redirect_stdout(stdout):
            exec(code, namespace)
    except Exception as e:
        print("CODE BLOCK CHECK: FAILED")
        messages.append(("user", f"Your solution failed the code execution test: {e}"))
        return {"generation": code_solution, "messages": messages, "iterations": iterations, "error": "yes"}

    # returns if no errors present
    print("NO CODE TEST FAILURES")

    # saves the errorless output of the code to code_result and returns
    print("SAVING RESULT")
    code_result = stdout.getvalue()

    return {"generation": code_solution, "messages": messages, "iterations": iterations, "error": "no", "code_result": code_result}

# optional node used to reflect on error
def reflect(state: GraphState):
    """
    Reflect on errors.

    Appends a reflection request to the conversation, asks the generator chain
    to respond to it, and records that response as an assistant message.

    Args:
        state (dict): The current graph state

    Returns:
        dict: "generation" (unchanged), updated "messages", and "iterations"
    """

    print("REFLECTING ON ERROR")

    # State (copied so the caller's message list is never mutated in place)
    messages = list(state["messages"])
    iterations = state["iterations"]
    code_solution = state["generation"]

    # Prompt reflection
    reflection_message = [("user", """You tried to solve this problem and failed a unit test. Reflect on this failure
                                    given the provided documentation. Write a few key suggestions based on the 
                                    documentation to avoid making this mistake again.""")]
    # The request has to be part of the conversation that is sent; otherwise
    # the model is never actually asked to reflect
    prompt_messages = messages + reflection_message

    # The first entry is the user's question; it may be a (role, text) pair
    # or a message object exposing .content
    question_text = ""
    if messages:
        first_message = messages[0]
        if isinstance(first_message, (tuple, list)):
            question_text = first_message[1]
        else:
            question_text = getattr(first_message, "content", str(first_message))

    # Run the retriever so the prompt receives documentation text rather than
    # an un-invoked pipeline object
    context = format_docs(retriever.invoke(question_text)) if question_text else ""

    # Add reflection
    reflections = code_generator_chain.invoke({"context": context, "messages": prompt_messages})
    messages = prompt_messages + [("assistant" , f"Here are reflections on the error: {reflections}")]
    return {"generation": code_solution, "messages": messages, "iterations": iterations}


# conditional edge for deciding when to finish
def decide_to_interpret(state: GraphState):
    """
    Determines whether to finish.

    Finishes when the last check passed or the attempt budget is used up.
    Otherwise retries, going through the reflect node when the module-level
    `flag` is 'reflect'.

    Args:
        state (dict): The current graph state

    Returns:
        str: Next node to call ("interpret", "reflect" or "generate")
    """
    error = state["error"]
    iterations = state["iterations"]

    # >= rather than ==: a counter that starts at or jumps past the limit
    # must still terminate instead of retrying forever
    if error == "no" or iterations >= max_iterations:
        print("FINISH")
        return "interpret"

    print("RETRY SOLUTION")
    if flag == 'reflect':
        return "reflect"
    return "generate"
        
def interpret(state: GraphState):
    """
    Interpret the code solution.

    Prints an error when the final attempt still failed; otherwise asks the
    interpreter chain to answer the original question from the captured
    output and prints that answer.

    Args:
        state (dict): The current graph state

    Returns:
        None: the interpretation is printed and the state is not modified
    """

    print("INTERPRETING CODE SOLUTION")

    # A run in which every attempt failed has no usable output to interpret,
    # and "code_result" may never have been written to the state
    if state.get("error") == "yes":
        print(f"ERROR: solution could not be generated with max iterations: {max_iterations}")
        return

    # State
    messages = state["messages"]
    code_result = state.get("code_result", "")

    # The first message is the user's original question
    interpretation = interpreterchain.invoke({"code_result": code_result, "question": [messages[0]]})
    print(interpretation)

In [70]:
def agent_node(state, agent, name):
    """
    Run an agent on the current state and wrap its output as a state update.

    Args:
        state: The current graph state, passed unchanged to ``agent.invoke``
        agent: Any object exposing ``invoke(state)``
        name (str): Node name recorded on the message and as the sender

    Returns:
        dict: ``{"messages": [message], "sender": name}``
    """
    result = agent.invoke(state)
    # We convert the agent output into a format that is suitable to append to the global state
    if isinstance(result, FunctionMessage):
        # Function messages are forwarded unchanged
        pass
    elif isinstance(result, str):
        # Chains that end in a string output parser return plain text, which
        # has no .dict() to unpack
        result = HumanMessage(content=result, name=name)
    else:
        result = HumanMessage(**result.dict(exclude={"type", "name"}), name=name)
    return {
        "messages": [result],
        # Since we have a strict workflow, we can
        # track the sender so we know who to pass to next.
        "sender": name,
    }

Interpreter Agent

In [71]:

# TEST BRENDA API CALL
# -------------------------------------------------------------
# test, manually written BRENDA API call:
# querying the molecular weight of aldose reductase
#
# from zeep import Client
# import hashlib
# wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"
# password = hashlib.sha256("PASSWORD".encode("utf-8")).hexdigest()
# client = Client(wsdl)
# parameters = ("siavash.raissi@tufts.edu",password,"ecNumber*1.1.1.21", "organism*Homo sapiens", 
#               "molecularWeight*", "molecularWeightMaximum*", "commentary*", "literature*")
# resultString = client.service.getMolecularWeight(*parameters)
#
# user_question = "What is the molecular weight of aldose reductase?"
# -------------------------------------------------------------

import functools

# Prompt for the interpreter: the system message carries the captured API
# output ({code_result}); the placeholder slot receives the user's question.
interpreterprompt = ChatPromptTemplate.from_messages([
    ("system", '''You are an expert at understanding and interpreting results from the BRENDA API. Use the data generated by BRENDA to answer the user's question \
     concisely. Provide only the information being requested by the user.
     
     Here is the result of the BRENDA API call: {code_result}
     
     Here is the user question:
     '''),
    # interpret() supplies the question under the "question" key; without this
    # slot the prompt ends at "Here is the user question:" and the model never
    # sees what was asked
    ("placeholder", "{question}")
])

output_parser = StrOutputParser()

# output_parser reduces the model response to just its string content
interpreterchain = interpreterprompt | llm | output_parser
interpreternode = functools.partial(agent_node, agent=interpreterchain, name="interpreter")


# test2 = chain.invoke({"user_question":user_question, "resultString":resultString})
# test1 = chain.invoke({"input":"What are the tissues using DNA-directed DNA polymerase in humans?"})
# print(resultString) 
# print(test2)
# accurately reports a range of 32,000-39,000 da 
# should it be more precise, give a consensus?

Connecting Graph Edges

In [None]:
from langgraph.graph import END, StateGraph

# stategraphs let nodes write to a shared state 
workflow = StateGraph(GraphState)

# Define the nodes
workflow.add_node("generate", generate)  # generation solution
workflow.add_node("check_code", code_check)  # executor agent
workflow.add_node("reflect", reflect)  # reflect
workflow.add_node("interpret", interpret)  # interpret result

# Build graph
workflow.set_entry_point("generate")
workflow.add_edge("generate", "check_code")
# after each check, decide_to_interpret picks the next node: interpret (finish), reflect, or generate (retry)
workflow.add_conditional_edges(
    "check_code",
    decide_to_interpret,
    {
        "interpret": "interpret",
        "reflect": "reflect",
        "generate": "generate",
    },
)

workflow.add_edge("reflect", "generate")
workflow.add_edge("interpret", END)


app = workflow.compile()
question = "What is the pH stability of 5’-3’ DNA Helicase, EC number 5.6.2.3?"

# Seed every key the nodes read: generate() reads "error" before any check has
# run, and interpret() reads "code_result" even when no attempt succeeded.
initial_state = {
    "messages": [("user", question)],
    "iterations": 0,
    "error": "no",
    "code_result": "",
}
app.invoke(initial_state)