# LLM agents for literature review
Get information about scientific papers and their citations, and check if they are relevant to the PSC disease

## Import libraries and prepare the API credential

In [1]:
pip install Biopython

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pyautogen

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install neo4j

Note: you may need to restart the kernel to use updated packages.


In [4]:

import requests
from Bio import Entrez
from typing import List

from autogen import ConversableAgent, register_function, GroupChatManager, GroupChat
from autogen.coding import LocalCommandLineCodeExecutor


from time import sleep

# Read the OpenAI API key from a local file instead of hard-coding it.
# The key is stripped because text files commonly end with a newline, and
# whitespace inside the key can make the API reject the credential.
with open('openai.credential', 'r') as file:
    key = file.read().strip()
# Model name shared by every LLM-backed agent created below.
MODEL = 'gpt-4o'

flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.


## A nested tool agent to retrieve paper info

### Python functions 

A set of functions to get the citations and abstract of a paper, search for a paper by its title, and convert between DOIs and PubMed IDs.


In [5]:
def get_citations(doi:str) -> str:
    """
    Returns the DOIs of the works cited by a paper, using the Crossref REST API.

    Parameters:
    doi (str): The DOI of the citing paper.

    Returns:
    str: The cited DOIs joined with ", ". A reference entry without a DOI is
        reported as 'Unknown DOI'. An empty string is returned when the request
        fails or Crossref answers with a status other than 200.
    """
    url = f"https://api.crossref.org/works/{doi}"
    try:
        # A timeout keeps a stalled connection from blocking the calling agent.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Error: {e}")
        return ""

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return ""

    # `or` also covers keys that are present but hold a JSON null.
    message = response.json().get("message") or {}
    references = message.get("reference") or []
    return ", ".join(ref.get('DOI', 'Unknown DOI') for ref in references)
    

In [6]:
def get_title_abstract(pmid: str) -> str:
    """
    Fetches a PubMed record and returns its title and abstract as text.

    Parameters:
    pmid (str): The PubMed ID of the paper.

    Returns:
    str: "Title: <title> \\nAbstract: <abstract>". The abstract sections are
        joined with single spaces; it is empty when the record has no abstract.

    Raises:
    IndexError or KeyError when the fetched record contains no 'PubmedArticle'
    entry, plus anything raised by Entrez itself. get_title_abstract_from_doi
    relies on these exceptions to trigger its retry.
    """
    Entrez.email = 'hui.song@sintef.no'
    print("found pmid---->", pmid)
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()
    print("find item")

    article = records['PubmedArticle'][0]['MedlineCitation']['Article']
    title = article.get('ArticleTitle', 'No title available.')

    # 'AbstractText' is handled both as a list of sections and as a single value.
    abstract_paragraphs = article.get('Abstract', {}).get('AbstractText', [])
    if isinstance(abstract_paragraphs, list):
        abstract = ' '.join(str(para) for para in abstract_paragraphs)
    else:
        abstract = str(abstract_paragraphs)

    return "Title: {} \nAbstract: {}".format(title, abstract)
    

In [7]:

def get_pmid_from_doi(doi: str)->str:
    """
    Searches PubMed for a paper by DOI and returns its PMID.

    Parameters:
    doi (str): The DOI of the paper.

    Returns:
    str: The first PMID matching the DOI, as a plain string (the same form
        get_pmid_from_title returns), or the message
        'No PMID found for DOI: <doi>' when the search has no result.
    """
    Entrez.email = 'hui.song@sintef.no'

    handle = Entrez.esearch(db='pubmed', term='"{}" [DOI]'.format(doi))
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()

    pmid_list = record.get('IdList', [])

    if pmid_list:
        # str() rather than repr(): repr() would wrap the id in quote characters,
        # which then end up in the id passed on to efetch.
        return str(pmid_list[0])
    else:
        return 'No PMID found for DOI: {}'.format(doi)
    

def get_pmid_from_title(title:str)->str:
    """
    Looks a paper up in PubMed using its title as the search term.

    Parameters:
    title (str): The title of the paper.

    Returns:
    str: The first PMID in the search result, or an empty string (after
        printing a notice) when the search returns no ids.
    """
    # NCBI asks every Entrez client to identify itself with an email address.
    Entrez.email = "hui.song@sintef.no"  # Replace with your email address

    handle = Entrez.esearch(db="pubmed", term=title, retmode="xml")
    parsed = Entrez.read(handle)
    handle.close()

    pmids = parsed.get("IdList", [])
    if pmids:
        # The first hit is taken as the match for the title.
        return pmids[0]

    print("No articles found with the given title.")
    return ""

def get_doi_from_pmid(pmid:str)->str:
    """
    Looks up the DOI recorded in PubMed for a given PMID.

    Parameters:
    pmid (str): The PubMed ID of the paper.

    Returns:
    str: The DOI when the record lists one; otherwise an empty string, after
        printing why no DOI could be returned.
    """
    # NCBI asks every Entrez client to identify itself with an email address.
    Entrez.email = "hui.song@sintef.no"  # Replace with your email address

    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    parsed = Entrez.read(handle)
    handle.close()

    try:
        id_entries = parsed['PubmedArticle'][0]['PubmedData']['ArticleIdList']
        # Each entry carries its kind in the 'IdType' attribute; keep the first DOI.
        doi = next(
            (str(entry) for entry in id_entries
             if entry.attributes.get('IdType') == 'doi'),
            None,
        )
    except (IndexError, KeyError) as e:
        print(f"Error retrieving DOI: {e}")
        return ""

    if doi is None:
        print("DOI not found in the article data.")
        return ""
    return doi
    
### This function has some problems and is currently not used ###
def get_citations_pubmed(pmid:str)->str:
    """
    Lists the references of a paper through PubMed's 'pubmed_pubmed_refs' links.

    The PMID of every linked paper is printed together with its title.

    Parameters:
    pmid (str): The PubMed ID of the citing paper.

    Returns:
    str: The PMIDs of the referenced papers joined with ", ", or an empty
        string when the link result contains no reference set.
    """
    Entrez.email = "hui.song@sintef.no"
    links = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed_refs")
    try:
        record = Entrez.read(links)
    finally:
        links.close()

    # Without this guard an empty 'LinkSetDb' raises IndexError; presumably that
    # is what PubMed returns for papers without a reference list (to be confirmed).
    link_sets = record[0][u'LinkSetDb'] if record else []
    if not link_sets:
        print("No references found in PubMed for PMID: {}".format(pmid))
        return ""

    link_list = [link[u'Id'] for link in link_sets[0][u'Link']]
    print(link_list)

    for index, ref_pmid in enumerate(link_list):
        print(ref_pmid)
        handle = Entrez.efetch(db='pubmed', id=ref_pmid, retmode='xml')
        try:
            ref_records = Entrez.read(handle)
        finally:
            handle.close()

        article = ref_records['PubmedArticle'][0]['MedlineCitation']['Article']
        title = article.get('ArticleTitle', 'No title available.')
        print(index, title)

    # Same result format as get_citations: ids separated by ", ".
    return ", ".join(str(ref_pmid) for ref_pmid in link_list)

### The following two functions are not strictly necessary (the agents can figure out by themselves how to combine the functions above), but they help save some tokens.
def get_title_abstract_from_doi(doi:str)->str:
    """
    Returns the title and abstract of a paper identified by its DOI.

    Combines get_pmid_from_doi and get_title_abstract, retrying when either
    of them raises.

    Parameters:
    doi (str): The DOI of the paper.

    Returns:
    str: "DOI: <doi>\\n <title and abstract>" on success, or
        "could not find the paper from DOI: <doi>" once every attempt failed.
    """
    # range(4) gives four attempts; looping over the tuple literal (0, 4)
    # would give only two.
    for _ in range(4):
        try:
            pmid = get_pmid_from_doi(doi)
            print("pmid------>", pmid)
            # Pause between the two consecutive Entrez requests.
            sleep(2)
            return "DOI: {}\n {}".format(doi, get_title_abstract(pmid))
        except Exception:
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # still stop the notebook. Wait a little before the next attempt.
            sleep(3)
            continue

    return "could not find the paper from DOI: {}".format(doi)

def get_doi_from_title(title:str)->str:
    """
    Searches PubMed for a paper by title and returns its DOI.

    Parameters:
    title (str): The title of the paper.

    Returns:
    str: The DOI of the first matching paper, or an empty string when no
        paper or no DOI was found.
    """
    pmid = get_pmid_from_title(title)
    if not pmid:
        # get_pmid_from_title returns "" when the search has no result; do not
        # send an empty id on to the fetch request.
        return ""
    return get_doi_from_pmid(pmid)

### Tool executor (just to wrap up the function)

In [8]:

# Agent that executes the paper-lookup tools. It has no LLM of its own; the
# functions are registered on it further below with `register_function`.
lib_tool_executor = ConversableAgent(
    "lib_tool_executor",
    llm_config=False,  # Turn off LLM for this agent.
    code_execution_config=False,
    # A message ends the chat when it has non-empty content that contains
    # "terminate" in any letter case.
    is_termination_msg=lambda msg: (msg["content"]) and ("terminate" in msg["content"].lower())
)


### Tool driver (caller of the function)

In [9]:
# LLM-backed agent that decides which paper-lookup tool to call and with what
# arguments. The system message is built from adjacent string literals, so each
# fragment must end with its own separator before the next one starts.
lib_tool_driver = ConversableAgent(
    "lib_tool_driver",
    system_message = "You are a helpful assistant to retrieve information of scientific papers, using some external tools search papers from their titles, get their citations, titles, abstracts, etc. The papers are in most cases identified by their DOIs. "
    "Once you received the result you needed from the tools, or you believe the result from the tools satisfy the task you were assigned, say 'TERMINATE' immediately. Do not try to repeat, summarize or analyze the results. Also, try to do one task each time. Once a task or subtask assigned to you was done, say 'TERMINATE', rather than jumpting into the next task immediately",
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    code_execution_config=False
)

### Register the tool to both caller and executor


In [10]:
# Register each paper-lookup function twice over: `lib_tool_driver` is the
# caller (it receives the description and proposes the call) and
# `lib_tool_executor` is the executor (it runs the Python function).
register_function(
    get_title_abstract,
    caller = lib_tool_driver,
    executor = lib_tool_executor,
    description = "Get the title and abstract of a paper. The input is the PMID (pubmed id) of this paper"
)

# The two DOI <-> PMID converters below are left unregistered.
# NOTE(review): presumably because get_doi_from_title and
# get_title_abstract_from_doi already combine them - confirm.
# register_function(
#     get_pmid_from_doi,
#     caller = lib_tool_driver,
#     executor = lib_tool_executor,
#     description = "Get the PMID (pubmed id) of a paper from its DOI as input"
# )

# register_function(
#     get_doi_from_pmid,
#     caller = lib_tool_driver,
#     executor = lib_tool_executor,
#     description = "Get the DOI of a paper from its PMID (pubmed id) as input"
# )

register_function(
    get_pmid_from_title,
    caller = lib_tool_driver,
    executor = lib_tool_executor,
    description = "Search a paper from its title. The input is the title, and the output is the PMID of this paper"
)

register_function(
    get_citations,
    caller = lib_tool_driver,
    executor = lib_tool_executor,
    description = "Get the citations of a paper. The input is the DOI of the source paper, and the output is a list of DOIs of the papers that the source paper has cited"
)

# Title -> DOI in a single tool call (PMID lookup followed by DOI lookup).
register_function(
    get_doi_from_title,
    caller = lib_tool_driver,
    executor = lib_tool_executor,
    description = "Search a paper from its title. The input is the title and the putput is the DOI of the found paper"
)

# DOI -> title and abstract in a single tool call.
register_function(
    get_title_abstract_from_doi,
    caller = lib_tool_driver,
    executor = lib_tool_executor,
    description = "Get the title and abstract of a paper. The input is the DOI of this paper"
)


### Make the nested agent for tools. 
Now lib_tool_executor is the only one exposed to the group

In [11]:
def second_last_msg(sender: ConversableAgent, recipient: ConversableAgent, summary_args: dict):
    """
    Summary method for a nested chat: returns the second-to-last message.

    Parameters:
    sender (ConversableAgent): The agent whose message history is read.
    recipient (ConversableAgent): The chat partner; selects which history of
        `sender` is used.
    summary_args (dict): Unused; present because it is part of the signature
        this function is called with.

    Returns:
    str: The content of the second-to-last message exchanged with `recipient`.
        Falls back to the only message when the history holds a single entry,
        and to an empty string when there is no message or no content.
    """
    history = sender.chat_messages[recipient]
    if not history:
        return ""
    # Indexing [-2] on a one-message history would raise IndexError.
    message = history[-2] if len(history) >= 2 else history[-1]
    # The summary should be text even for a message without content.
    return message.get("content") or ""

# Configuration of the nested chat between the tool executor and its driver:
# at most 4 turns, summarized by `second_last_msg` (the second-to-last message
# of the exchange).
nested_chats = [
    {
        "recipient": lib_tool_driver,
        "max_turns": 4,
        "summary_method": second_last_msg
    }
]

# The nested chat is triggered by every sender except `lib_tool_driver`
# itself, so replies coming back from the driver do not start another one.
lib_tool_executor.register_nested_chats(
    nested_chats, 
    trigger = lambda sender: sender not in [lib_tool_driver]
)

## Agent to read the abstract and check if the paper is about PSC or not.

In [12]:
# Prompt for the reviewer agent: given an abstract, answer in three lines
# (a header naming the paper's DOI, YES/No, and a one-sentence justification).
chatbot_system_message = "You are a helpful assistant. You will receive the abstract of a scientific paper, and you judge if this paper is about a research for the PSC (Primary Sclerosing Cholangitis) disease. Please answer with three lines: 1) 'My review for paper <it's DOI>:'; 2) simply 'YES' or 'No'; 3) If it is, please describe briefly what the research is about (in on sentence), If not, please give a short explanation why you don't think it is."

# LLM-only agent: it has no tools and no code execution.
psc_checker = ConversableAgent(
    "psc_checker",
    system_message = chatbot_system_message,
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    code_execution_config=False,  # Turn off code execution for this agent.
)

## Note taking agents
Tools to save key information into a knowledge graph, and query the graph for saved information.

In [13]:
import os

from neo4j import GraphDatabase

# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The defaults
# are the values this notebook has used so far.
NEO4J_URI = os.environ.get("NEO4J_URI", 'neo4j://neo4j')
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "llm-agents")

gdb = GraphDatabase.driver(NEO4J_URI, auth = (NEO4J_USER, NEO4J_PASSWORD))
# Check the connection here rather than inside the first agent tool call.
gdb.verify_connectivity()

def query_neo4j(query:str)->str:
    """
    Runs a Cypher statement on the module-level Neo4j driver `gdb`.

    Parameters:
    query (str): The Cypher statement to execute.

    Returns:
    str: repr() of the returned records, or repr() of the exception when the
        execution fails. Errors are returned as text rather than raised.
    """
    try:
        outcome = gdb.execute_query(query)
        # The result unpacks into three parts; only the records are reported.
        rows, _summary, _keys = outcome
        text = repr(rows)
    except Exception as err:
        text = repr(err)
    return text
    

# Agent that executes the Neo4j tool. It has no LLM of its own; `query_neo4j`
# is registered on it below with `register_function`.
graph_tool_executor = ConversableAgent(
    "graph_tool_executor",
    llm_config=False,  # Turn off LLM for this agent.
    code_execution_config=False,
    # A message ends the chat when it has non-empty content that contains
    # "terminate" in any letter case.
    is_termination_msg=lambda msg: (msg["content"]) and ("terminate" in msg["content"].lower())
)

# LLM-backed note-taker that writes the Cypher queries for `query_neo4j`.
# The system message is built from adjacent string literals, which Python joins
# without any separator, so every fragment ends with its own punctuation and
# space.
graph_tool_driver = ConversableAgent(
    "graph_tool_driver",
    system_message = "You are taking notes about scientific paper mentiond by others. "
        "Each paper is identified by DOI, and may have title and abstract. "
        "You have a tool helping you saving paper information to a Neo4j graph, and to query informaiton you saved. "
        "When you receive a paper, with DOI, title or abstract, save them. "
        "Once it is succesfully save, say TERMINATE to avoid the tool executing unnecessary tasks. "
        "When you were asked about papers you saved, query the Neo4j graph via the tool. "
        "Once you received the satisfied results from the query, summarize them in concise text, " "followed by 'TERMINATE' to stop the tool from querying more.",
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    code_execution_config=False
)

# Expose `query_neo4j` as a tool: `graph_tool_driver` proposes the Cypher query
# and `graph_tool_executor` runs it. The description doubles as the schema
# documentation given to the LLM. It is built from adjacent string literals,
# which Python joins without any separator, so every fragment ends with its own
# punctuation and space.
register_function(
    query_neo4j,
    caller = graph_tool_driver,
    executor = graph_tool_executor,
    description = "Query or modify the neo4j graph database. The input is a cypher query, and the output is a list of records returned from the query. The graph has the following schema: There are only two labels for nodes: 'Paper' and 'Review'. "
        "A Paper has properties: doi (its id), title, and abstract. "
        "A Review has properties: isPsc (YES, NO), reviewdBy, and comment. "
        "Two labels for relations: 'Cite' (a paper cite another paper), and 'ReviewOf' (a Review is a review of a Paper). "
        "Each node or relation should have one and only one label. "
        "Note that The property existence syntax `... exists(variable.property)` is no longer supported by new Cypher. Please use `variable.property IS NOT NULL` instead. "
        "Remember that DOI is the id of Paper. When you search or match a paper, try to use DOI as long as it is available."
)

# Configuration of the nested chat between the graph tool executor and its
# driver: at most 4 turns, summarized by the last message of the exchange.
# Note that this rebinds the name `nested_chats` used earlier for the paper
# tools; that earlier list has already been registered at this point.
nested_chats = [
    {
        "recipient": graph_tool_driver,
        "max_turns": 4,
        "summary_method": "last_msg"
    }
]

# The nested chat is triggered by every sender except `graph_tool_driver`
# itself, so replies coming back from the driver do not start another one.
graph_tool_executor.register_nested_chats(
    nested_chats, 
    trigger = lambda sender: sender not in [graph_tool_driver]
)

## Create the group of agents to conduct the task

### A moderator agent to plan and guide the group chat

In [14]:
# Planning agent for the group chat. The system message is built from adjacent
# string literals, which Python joins without any separator, so every fragment
# ends with its own space.
moderator = ConversableAgent(
    "moderator",
    system_message = "You are an AI-based moderator that makes plans for the whole group. "
    "When you get a task, break down it into sub tasks, and each can be performed by one of your 'partner agents'. "
    "You will get an introduction about what each of your partner agents can do.",
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    code_execution_config=False,  # Turn off code execution for this agent.
    # The moderator asks for human input on every turn before it replies.
    human_input_mode = "ALWAYS"
)


### Create the group chat and assign it to a group manager

In [15]:
# First-person descriptions of the four group members. The GroupChat below is
# created with send_introductions = True.
# NOTE(review): presumably these texts are what gets sent as introductions and
# used for speaker selection - confirm against the AutoGen documentation.
lib_tool_executor.description = "I have a couple of tools to fetch paper informaiton from PubMed. I can search the DOI or PubMed Id from title, get the abstract of a paper, and find the citations of the paper"
psc_checker.description = "I can read a paragraph of English text (such as the abstract of a paper), and try to tell if the paper is about the research on the PSC disease"
moderator.description = "I am the moderator, and I break down a task into subtasks. I should always be the one to speak first, to make plan for sub-tasks"
graph_tool_executor.description = "I am the note-taker. I record the papers, citations and reviews mentioned by other speakers in a database, and help you query from what I noted."


# The group consists of the two tool executors (each hiding its own nested
# driver chat), the PSC reviewer and the moderator. The conversation starts
# with an empty history, is capped at 20 rounds, and the same agent may not
# speak twice in a row.
group_chat = GroupChat(
    agents = [lib_tool_executor, psc_checker, graph_tool_executor, moderator],
    messages = [],
    max_round = 20,
    send_introductions = True,
    allow_repeat_speaker = False
)

# LLM-backed manager of `group_chat`; its system message tells it to manage
# who speaks next. The extra instruction kept as a comment inside the call
# (always hand over to the note-taker) is currently disabled.
group_chat_manager = GroupChatManager(
    groupchat = group_chat,
    system_message = "You are an assistent to manage who speak next. ",
        # "Remember that once a paper is mentioned, or its abstracts or citations, or the review of a paper, is posted in the graph chat, you should ask the note-taker to speak next, so that it records the paper in the database",
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]}
)



## Initialize the knowledge graph with one sample paper

In [16]:
# Seed the knowledge graph: the tool executor opens the group chat with a task
# that asks for the title and abstract of one paper and for a human-provided
# review of that paper to be stored.
# NOTE(review): `max_round = 6` is passed to initiate_chat here, while
# `group_chat` itself was created with max_round = 20 - confirm which limit
# actually applies to this conversation.
chat_result = lib_tool_executor.initiate_chat(
    group_chat_manager,
    max_round = 6,
    message = "Let's start with paper 10.1126/scitranslmed.abb3107. Please get its title and abstract. After that, save the following review to the existing paper with DOI 10.1126/scitranslmed.abb3107: Xiaojun has reviewed the paper (DOI: 10.1126/scitranslmed.abb3107) and judges that is a paper about PSC, and she commented that it is one of the primary work in this field. For this paper, no need to ask an additional opinion on whether it is about PSC or not."
)

[33mlib_tool_executor[0m (to chat_manager):

Let's start with paper 10.1126/scitranslmed.abb3107. Please get its title and abstract. After that, save the following review to the existing paper with DOI 10.1126/scitranslmed.abb3107: Xiaojun has reviewed the paper (DOI: 10.1126/scitranslmed.abb3107) and judges that is a paper about PSC, and she commented that it is one of the primary work in this field. For this paper, no need to ask an additional opinion on whether it is about PSC or not.

--------------------------------------------------------------------------------
[32m
Next speaker: moderator
[0m
[31m
>>>>>>>> NO HUMAN INPUT RECEIVED.[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mmoderator[0m (to chat_manager):

Sure, let's break this down into smaller tasks:

1. **Fetch Paper Information**: Use the DOI 10.1126/scitranslmed.abb3107 to get the title and abstract of the paper.
   - Assigned to: `lib_tool_executor`

2. **Store Review Information**: Once we have the data from t

NameError: name 'cerebras_AuthenticationError' is not defined

## Assign the task to the group manager. 

In [34]:
# chat_result = lib_tool_executor.initiate_chat(
#     group_chat_manager,
#     max_round = 6,
#     message = "I have paper titled 'A biliary immune landscape map of primary sclerosing cholangitis reveals a dominant network of neutrophils and tissue-resident T cells'. Can you get all its citations, as a list DOIs. This ends the first sub-task, and then the 'moderator' should plan for the next sub-task: take the 10th citation (DOI), get its abstract and check if it is about PSC. Repeat this for the 11th and 23rd citation."
# )