# LLM agents for literature review
Get information about scientific papers and their citations, and check if they are relevant to the PSC disease

## Import libraries and prepare the API credential

In [141]:
# pip install Biopython

In [142]:
# pip install pyautogen

In [143]:
# pip install neo4j


> Feature 1 Snowballing

In [144]:

import requests
from Bio import Entrez
from typing import List

from autogen import ConversableAgent, register_function, GroupChatManager, GroupChat
from autogen.coding import LocalCommandLineCodeExecutor


from time import sleep

# Read the OpenAI API key from a local file. Surrounding whitespace is
# stripped because a trailing newline in the file would otherwise become part
# of the key handed to every LLM-backed agent below.
with open('openai.credential', 'r') as file:
    key = file.read().strip()


# Model name shared by all LLM-backed agents in this notebook.
MODEL = 'gpt-4o'

# Decide if there is human interaction or not
DEBUG_MODE = False


## A nested tool agent to retrieve paper info

> 1.1 retrieve paper info

### Python functions 

A set of functions to get the citations and abstract of a paper, search for a paper by its title, and convert between DOIs and PubMed IDs.


In [145]:
### Feature 1.1.1 retrieve citations
def get_citations(doi:str) -> str:
    """Return the DOIs of the papers cited by the paper identified by ``doi``.

    The reference list is read from the Crossref ``works`` endpoint.

    Parameters:
    doi (str): DOI of the source paper.

    Returns:
    str: A sentence listing the cited DOIs separated by ", " (references
    without a DOI appear as 'Unknown DOI'), or "" when the request fails.
    """
    url = f"https://api.crossref.org/works/{doi}"
    try:
        # Without a timeout a stalled connection would block the agent forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # Network failures follow the same "print and return empty" convention
        # as a non-200 answer.
        print(f"Error: {err}")
        return ""

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return ""

    references = response.json().get("message", {}).get("reference", [])
    reference_string = ", ".join(ref.get('DOI', 'Unknown DOI') for ref in references)
    return "Paper {} cites the following papers: {}".format(doi, reference_string)

# > 1.1.9 retrieve a subset of the citations
def get_subset_of_citiations(doi:str, start:int, end:int) -> str:
    """Return a slice of the DOIs cited by the paper identified by ``doi``.

    Same lookup as ``get_citations``, but only ``references[start:end]`` of
    the Crossref reference list is reported, which keeps the answer small for
    papers with very long reference lists.

    Parameters:
    doi (str): DOI of the source paper.
    start (int): Index of the first reference to include.
    end (int): Index one past the last reference to include.

    Returns:
    str: A sentence listing the selected DOIs separated by ", " (references
    without a DOI appear as 'Unknown DOI'), or "" when the request fails.
    """
    url = f"https://api.crossref.org/works/{doi}"
    try:
        # Without a timeout a stalled connection would block the agent forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # Network failures follow the same "print and return empty" convention
        # as a non-200 answer.
        print(f"Error: {err}")
        return ""

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return ""

    references = response.json().get("message", {}).get("reference", [])
    selected = references[start:end]
    reference_string = ", ".join(ref.get('DOI', 'Unknown DOI') for ref in selected)
    return "Paper {} cites the following papers: {}".format(doi, reference_string)

In [146]:
def remove_invalid_words(text:str)->str:
    """Sanitize paper text before it is posted into an agent conversation.

    Single quotes are replaced by spaces, and every occurrence of the word
    "terminate" (in any casing) is rewritten to "terminatte" with its casing
    preserved. The agents' termination checks lowercase the message before
    looking for "terminate", so a case-sensitive replacement would still let
    a title or abstract containing "Terminate" end the chat.

    Parameters:
    text (str): Raw title or abstract.

    Returns:
    str: The sanitized text.
    """
    import re  # local import: only this helper needs it

    without_quotes = text.replace("'", " ")
    # Double the second "t" while keeping the original casing of each letter.
    return re.sub(r"(termina)(t)(e)", r"\1\2\2\3", without_quotes, flags=re.IGNORECASE)

### Feature 1.1.2: get title and abstract from PubMed
def get_title_abstract(pmid: str) -> str:
    """Fetch a PubMed record and return its title and abstract as one string.

    Parameters:
    pmid (str): The PubMed ID of the paper.

    Returns:
    str: "Title: ... \\nAbstract: ..." with both parts passed through
    remove_invalid_words.
    """
    Entrez.email = 'hui.song@sintef.no' 
    print("found pmid---->", pmid)

    fetch_handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    parsed = Entrez.read(fetch_handle)
    fetch_handle.close()
    print("find item")

    # Only the first article of the response is used.
    article = parsed['PubmedArticle'][0]['MedlineCitation']['Article']
    title = article.get('ArticleTitle', 'No title available.')

    # The abstract may come back as a list of paragraphs or as a single value.
    abstract_text = article.get('Abstract', {}).get('AbstractText', [])
    if not isinstance(abstract_text, list):
        abstract = str(abstract_text)
    else:
        abstract = ' '.join(map(str, abstract_text))

    clean_title = remove_invalid_words(title)
    clean_abstract = remove_invalid_words(abstract)
    return "Title: {} \nAbstract: {}".format(clean_title, clean_abstract)
    

In [147]:
### 1.1.3 Get PMID from DOI
def get_pmid_from_doi(doi: str)->str:
    """Look up the PubMed ID of a paper from its DOI.

    Parameters:
    doi (str): The DOI of the paper.

    Returns:
    str: The first matching PMID as a plain string, or the message
    'No PMID found for DOI: <doi>' when PubMed returns no match.
    """
    Entrez.email = 'hui.song@sintef.no'  

    handle = Entrez.esearch(db='pubmed', term='"{}" [DOI]'.format(doi))
    record = Entrez.read(handle)
    handle.close()

    pmid_list = record.get('IdList', [])

    if pmid_list:
        # str(), not repr(): repr() wraps the id in quotes, and that quoted
        # text would then be sent to efetch as the id. This also matches
        # get_pmid_from_title, which returns the bare id.
        return str(pmid_list[0])
    else:
        return 'No PMID found for DOI: {}'.format(doi)
    
### 1.1.4 get PMID from title
def get_pmid_from_title(title:str)->str:
    """
    Search PubMed with the paper title as the query and return the top hit.

    Parameters:
    title (str): The title of the paper.

    Returns:
    str: The first PMID in the search result, or "" when nothing matched.
    """
    # NCBI requires a contact email on every request.
    Entrez.email = "hui.song@sintef.no"  # Replace with your email address

    handle = Entrez.esearch(db="pubmed", term=title, retmode="xml")
    results = Entrez.read(handle)
    handle.close()

    matches = results.get("IdList", [])
    if matches:
        # The first id is taken, assuming it is the most relevant one.
        return matches[0]

    print("No articles found with the given title.")
    return ""

### 1.1.5 get DOI from PMID
def get_doi_from_pmid(pmid:str)->str:
    """
    Fetch the DOI of a paper given its PMID.

    Parameters:
    pmid (str): The PubMed ID of the paper.

    Returns:
    str: The DOI of the paper, or "" when the record has no DOI entry or the
    expected fields are missing from the response.
    """
    # NCBI requires a contact email on every request.
    Entrez.email = "hui.song@sintef.no"  # Replace with your email address

    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    parsed = Entrez.read(handle)
    handle.close()

    try:
        id_entries = parsed['PubmedArticle'][0]['PubmedData']['ArticleIdList']
        # Pick the first article id whose IdType attribute marks it as a DOI.
        doi = next(
            (str(entry) for entry in id_entries
             if entry.attributes.get('IdType') == 'doi'),
            None,
        )
    except (IndexError, KeyError) as e:
        print(f"Error retrieving DOI: {e}")
        return ""

    if doi is None:
        print("DOI not found in the article data.")
        return ""
    return doi
    
### This function has some problem and are currently not used ###
def get_citations_pubmed(pmid:str)->str:
    """Print the titles of the papers PubMed links as references of ``pmid``.

    Marked as problematic and unused by the surrounding notes. It only prints
    (the id list, then each id and "<index> <title>") and returns nothing,
    despite the ``str`` annotation.
    """
    Entrez.email = "hui.song@sintef.no"
    link_handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pubmed_refs")
    link_record = Entrez.read(link_handle)

    # Ids of the linked papers, taken from the first link set of the answer.
    linked_ids = [entry['Id'] for entry in link_record[0]['LinkSetDb'][0]['Link']]
    print(linked_ids)

    for position, linked_pmid in enumerate(linked_ids):
        print(linked_pmid)
        fetch_handle = Entrez.efetch(db='pubmed', id=linked_pmid, retmode='xml')
        fetched = Entrez.read(fetch_handle)
        fetch_handle.close()

        article = fetched['PubmedArticle'][0]['MedlineCitation']['Article']
        print(position, article.get('ArticleTitle', 'No title available.'))

### The following two functions are not strictly necessary (the agent can figure out by itself how to combine the functions), but they help save some tokens.
### 1.1.6 get title and abstract from DOI
def get_title_abstract_from_doi(doi:str)->str:
    """Return the title and abstract of the paper identified by ``doi``.

    Resolves the DOI to a PMID and fetches the PubMed record, retrying when a
    lookup raises.

    Parameters:
    doi (str): The DOI of the paper.

    Returns:
    str: "DOI: <doi>\\n Title: ... \\nAbstract: ..." on success, otherwise
    "could not find the paper from DOI: <doi>".
    """
    # range(4), not the tuple (0, 4): the tuple has two elements and would
    # allow only two attempts.
    for _attempt in range(4):
        try:
            pmid = get_pmid_from_doi(doi)
            print("pmid------>", pmid)
            if pmid.startswith("No PMID found"):
                # get_pmid_from_doi reports a miss as a message, not an id;
                # fetching with it cannot succeed, so stop retrying.
                break
            sleep(2)
            return "DOI: {}\n {}".format(doi, get_title_abstract(pmid))
        except Exception as err:
            # Best effort: report the failure, pause, and try again.
            print(err)
            sleep(3)

    return "could not find the paper from DOI: {}".format(doi)

### 1.1.7 Get DOI from title
def get_doi_from_title(title:str)->str:
    """Search PubMed by title and return the DOI of the first hit.

    Parameters:
    title (str): The title of the paper.

    Returns:
    str: The DOI of the found paper, or "" when no paper or no DOI is found.
    """
    pmid = get_pmid_from_title(title)
    if not pmid:
        # get_pmid_from_title returns "" when nothing matched; do not send an
        # empty id on to the DOI lookup.
        return ""
    return get_doi_from_pmid(pmid)

### Tool executor (just to wrap up the function)

In [148]:
### 1.1.1 get paper info from external tools
# Executor side of the library tools: it has no LLM and no code execution, so
# it only runs the tool functions registered with it further down.
lib_tool_executor = ConversableAgent(
    "lib_tool_executor",
    # Ask a human on every turn only when debugging.
    human_input_mode= "ALWAYS" if DEBUG_MODE else "NEVER",
    code_execution_config=False,
    llm_config=False,  # Turn off LLM for this agent.
    # A non-empty message containing "terminate" (any casing) ends the chat.
    is_termination_msg=lambda msg: (msg["content"]) and ("terminate" in msg["content"].lower())
)


### Tool driver (caller of the function)

In [149]:
# 1.1 Get paper info
# LLM-backed caller of the library tools: it decides which tool to invoke and
# says 'TERMINATE' once the requested result has been obtained.
lib_tool_driver = ConversableAgent(
    "lib_tool_driver",
    # The two literals are concatenated implicitly, so the first one must end
    # with ". " or the prompt reads "...their DOIsOnce you received...".
    system_message = "You are a helpful assistant to retrieve information of scientific papers, using some external tools search papers from their titles, get their citations, titles, abstracts, etc. The papers are in most cases identified by their DOIs. "
    "Once you received the result you needed from the tools, or you believe the result from the tools satisfy the task you were assigned, say 'TERMINATE' immediately. Do not try to repeat, summarize or analyze the results. Also, try to do one task each time. Once a task or subtask assigned to you was done, say 'TERMINATE', rather than jumpting into the next task immediately",
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    human_input_mode= "ALWAYS" if DEBUG_MODE else "NEVER",
    code_execution_config=False
)

### Register the tool to both caller and executor


In [150]:

# Library tools offered to the LLM, as (function, description) pairs in
# registration order. Each is proposed by lib_tool_driver and executed by
# lib_tool_executor.
_LIB_TOOLS = (
    (
        get_title_abstract,
        "Get the title and abstract of a paper. The input is the PMID (pubmed id) of this paper",
    ),
    # Disabled registrations, kept for reference:
    # (
    #     get_pmid_from_doi,
    #     "Get the PMID (pubmed id) of a paper from its DOI as input",
    # ),
    # (
    #     get_doi_from_pmid,
    #     "Get the DOI of a paper from its PMID (pubmed id) as input",
    # ),
    (
        get_pmid_from_title,
        "Search a paper from its title. The input is the title, and the output is the PMID of this paper",
    ),
    (
        get_citations,
        "Get the citations of a paper. The input is the DOI of the source paper, and the output is a list of DOIs of the papers that the source paper has cited",
    ),
    (
        get_subset_of_citiations,
        "Get the citations of a paper, but only return a subset of the full citation list. The input is the DOI of the source paper, the starting index of the subset, and the ending index of the subset, and the output is a list of DOIs of the papers that the source paper has cited",
    ),
    (
        get_doi_from_title,
        "Search a paper from its title. The input is the title and the putput is the DOI of the found paper",
    ),
    (
        get_title_abstract_from_doi,
        "Get the title and abstract of a paper. The input is the DOI of this paper",
    ),
)

for _lib_tool, _lib_tool_description in _LIB_TOOLS:
    register_function(
        _lib_tool,
        caller = lib_tool_driver,
        executor = lib_tool_executor,
        description = _lib_tool_description
    )


### Make the nested agent for tools. 
Now lib_tool_executor is the only one exposed to the group

In [151]:
def second_last_msg(sender: ConversableAgent, recipient: ConversableAgent, summary_args: dict):
    """Summarize a chat by the content of its second-to-last message.

    Parameters:
    sender: Agent whose ``chat_messages`` history is read.
    recipient: Key selecting the conversation within that history.
    summary_args: Unused; present to match the summary-method signature.

    Returns:
    The "content" field of the penultimate message exchanged with ``recipient``.
    """
    history = sender.chat_messages[recipient]
    penultimate = history[-2]
    return penultimate["content"]

# Inner conversation run by lib_tool_executor: it talks to lib_tool_driver for
# at most 4 turns, and second_last_msg supplies the summary.
nested_chats = [
    {
        "recipient": lib_tool_driver,
        "max_turns": 4,
        "summary_method": second_last_msg
    }
]

### 1.1 Get paper info
# The nested chat is triggered by any sender other than lib_tool_driver
# itself, so messages coming back from the driver do not start a new one.
lib_tool_executor.register_nested_chats(
    nested_chats, 
    trigger = lambda sender: sender not in [lib_tool_driver]
)

## Agent to read the abstract and check if the paper is about PSC or not.

> 1.2 Review paper

In [152]:
# Prompt for the reviewer: it must answer in three lines (header with the DOI,
# YES/No, and a one-sentence description or explanation).
chatbot_system_message = "You are a helpful assistant. You will receive the abstract of a scientific paper, and you judge if this paper is about a research for the PSC (Primary Sclerosing Cholangitis) disease. Please answer with three lines: 1) 'My review for paper <it's DOI>:'; 2) simply 'YES' or 'No'; 3) If it is, please describe briefly what the research is about (in on sentence), If not, please give a short explanation why you don't think it is."

# LLM-only agent: it reads the text it is given and never executes code.
psc_checker = ConversableAgent(
    "psc_checker",
    code_execution_config=False,  # Turn off code execution for this agent.
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    system_message = chatbot_system_message,
)

## Note taking agents
Tools to save key information into a knowledge graph, and query the graph for saved information.

> 1.3 Store paper info

> 1.3.1 Save and get data from Neo4j

In [153]:
from neo4j import GraphDatabase

import os

# Connection settings can be overridden through the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables, so the password need not be stored in
# the notebook. The defaults reproduce the previously hard-coded values.
gdb = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", 'neo4j://neo4j'),
    auth = (
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "llm-agents"),
    ),
)
# Check the connection now rather than on the first query.
gdb.verify_connectivity()

# 1.3.1.1 Execute Neo4j queries
def query_neo4j(query:str)->str:
    """Run a Cypher query on the shared Neo4j driver and report the outcome.

    Parameters:
    query (str): The Cypher statement to execute.

    Returns:
    str: ``repr`` of the returned records, or ``repr`` of the exception when
    anything goes wrong, so the calling agent can read the error and retry.
    """
    try:
        rows, _summary, _keys = gdb.execute_query(query)
        outcome = repr(rows)
    except Exception as failure:
        outcome = repr(failure)
    return outcome
    

# Executor side of the graph tool: it has no LLM and no code execution, so it
# only runs the tool function registered with it further down.
graph_tool_executor = ConversableAgent(
    "graph_tool_executor",
    # Ask a human on every turn only when debugging.
    human_input_mode= "ALWAYS" if DEBUG_MODE else "NEVER",
    code_execution_config=False,
    llm_config=False,  # Turn off LLM for this agent.
    # A non-empty message containing "terminate" (any casing) ends the chat.
    is_termination_msg=lambda msg: (msg["content"]) and ("terminate" in msg["content"].lower())
)

# LLM-backed note taker: it writes Cypher for the query_neo4j tool and says
# TERMINATE once a save or query has succeeded.
# The prompt is built by implicit string concatenation, so every fragment ends
# with an explicit separator; without them sentences fuse together
# ("...you savedWhen you receive...").
graph_tool_driver = ConversableAgent(
    "graph_tool_driver",
    system_message = "You are taking notes about scientific paper mentiond by others. "
        "Each paper is identified by DOI, and may have title and abstract. "
        "You have a tool helping you saving paper information to a Neo4j graph, and to query informaiton you saved. "
        "When you receive a paper, with DOI, title or abstract, save them. "
        "Once it is succesfully save, say TERMINATE to avoid the tool executing unnecessary tasks. "
        "When you were asked about papers you saved, query the Neo4j graph via the tool. "
        "Once you received the satisfied results from the query, summarize them in concise text, "
        "followed by 'TERMINATE' to stop the tool from querying more. "
        "If you see you cannot proceed with the assigned task due to the lack of tools, also say 'TERMINATE' to stop the task",
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    code_execution_config=False,
    human_input_mode= "ALWAYS" if DEBUG_MODE else "NEVER"
)

# Expose query_neo4j to the note taker. The description is the only schema
# documentation the LLM sees. Its fragments are concatenated implicitly, so
# each one ends with an explicit separator; without them the schema lines
# fuse together ("...title, abstractA Review has properties...").
register_function(
    query_neo4j,
    caller = graph_tool_driver,
    executor = graph_tool_executor,
    description = "Query or modify the neo4j graph database. The input is a cypher query, and the output is a list of records returned from the query. The graph has the following schema: There are only two labels for nodes: 'Paper' and 'Review'. "
        "A Paper has properties: doi (its id), title, abstract. "
        "A Review has properties: isPsc (YES, NO), reviewdBy, and comment. "
        "Two labels for relations: 'Cites' (a paper cites another paper), and 'ReviewOf' (a Review is a review of a Paper). "
        "Each node or relation should have one and only one label. "
        "The property existence syntax `exists(variable.property)` is no longer supported by new Cypher. Please use `variable.property IS NOT NULL` . "
        "Remember that DOI is the id of Paper. When you search or match a paper, try to use DOI as long as it is available. "
        "When you are asked to create many nodes (e.g., to create many papers using their DOIs), try to use the batch and UNWIND keyword in Cypher to do creation/merging in one Cypher query, instead of doing them one by one"
)

# Inner conversation run by graph_tool_executor: it talks to graph_tool_driver
# for at most 4 turns, and the last message becomes the summary.
# This rebinds the module-level name nested_chats used earlier for the
# library tools.
nested_chats = [
    {
        "recipient": graph_tool_driver,
        "max_turns": 4,
        "summary_method": "last_msg"
    }
]

# The nested chat is triggered by any sender other than graph_tool_driver
# itself, so messages coming back from the driver do not start a new one.
graph_tool_executor.register_nested_chats(
    nested_chats, 
    trigger = lambda sender: sender not in [graph_tool_driver]
)

### Human interface agent

> Feature 3: a human chat interface

In [154]:
# Stand-in for the human user: no LLM, no code execution, and every turn is
# answered from human input regardless of DEBUG_MODE.
human_proxy = ConversableAgent(
    "human_proxy",
    human_input_mode= "ALWAYS",
    code_execution_config=False,
    llm_config=False,  # Turn off LLM for this agent.
)

## Create the group of agents to conduct the task

### A moderator agent to plan and guide the group chat

> 1.4 Planning for complex tasks

In [155]:
# LLM-backed planner: it splits a task into sub-tasks for the partner agents.
# The prompt fragments are concatenated implicitly, so each one ends with a
# space; without it the sentences run together ("...whole group.When you...").
moderator = ConversableAgent(
    "moderator",
    system_message = "You are an AI-based moderator that makes plans for the whole group. "
    "When you get a task, break down it into sub tasks, and each can be performed by one of your 'partner agents'. "
    "You will get an introduction about what each of your partner agents can do. "
    "If you speak in the middle of two tasks, remember to repeat the key information you get from the previous speaker, so that the next speaker has sufficent context",
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    code_execution_config=False,  # Turn off code execution for this agent.
    human_input_mode = "ALWAYS" if DEBUG_MODE else "NEVER"
)


### Create the group chat and assign it to a group manager

> 1.4 Handle complex tasks automatically

In [156]:
# First-person descriptions of what each agent can do, set on the agents'
# `description` attribute.
# NOTE(review): presumably these are what the group chat shows to the other
# agents and uses to pick the next speaker - confirm against the AutoGen docs.
lib_tool_executor.description = "I have a couple of tools to fetch paper informaiton from PubMed. I can search the DOI or PubMed Id from title, get the abstract of a paper, and find the citations of the paper"
psc_checker.description = "I can read a paragraph of English text (such as the abstract of a paper), and try to tell if the paper is about the research on the PSC disease"
moderator.description = "I am the moderator, and I break down a task into subtasks. I should always be the one to speak first, to make plan for sub-tasks"
graph_tool_executor.description = "I am the note-taker. I record the papers, citations and reviews mentioned by other speakers in a database, and help you query from what I noted."


# Group conversation between the two tool executors and the moderator, capped
# at 20 rounds, with no agent speaking twice in a row.
# NOTE(review): psc_checker and human_proxy are not in `agents` - confirm this
# is intended.
group_chat = GroupChat(
    agents = [lib_tool_executor, graph_tool_executor, moderator],
    messages = [],
    allow_repeat_speaker = False,
    send_introductions = True,
    max_round = 20
)

# LLM-backed manager that picks the next speaker of group_chat.
group_chat_manager = GroupChatManager(
    groupchat = group_chat,
    llm_config = {"config_list": [{"model": MODEL, "api_key": key}]},
    system_message = "You are an assistent to manage who speak next. "
        # "Remember that once a paper is mentioned, or its abstracts or citations, or the review of a paper, is posted in the graph chat, you should ask the note-taker to speak next, so that it records the paper in the database",
)



## Initialize the knowledge graph with one sample paper

> 1.3 Add seed paper

In [157]:
def initialize():
    """Seed the knowledge graph with one paper and a human review of it.

    Starts a group chat asking for the title and abstract of the seed paper
    and for a YES review by Xiaojun to be stored against it.
    """
    seed_request = "Let's start with paper 10.1126/scitranslmed.abb3107. Please get its title and abstract. After that, save the following review to the existing paper with DOI 10.1126/scitranslmed.abb3107: Xiaojun has reviewed the paper (DOI: 10.1126/scitranslmed.abb3107) and judges that is a paper about PSC, and she commented that it is one of the primary work in this field. For this paper, no need to ask an additional opinion on whether it is about PSC or not."

    # NOTE(review): `max_round` is a GroupChat setting; confirm that
    # initiate_chat honours it here (its turn limit is `max_turns`).
    lib_tool_executor.initiate_chat(
        group_chat_manager,
        message = seed_request,
        max_round = 6
    )

    

## Expand a YES paper with its references

> 1.5 expand paper

In [158]:
def expand_references():
    """Expand one YES-reviewed paper in the graph with its reference list.

    Starts a group chat that should (1) find a paper reviewed as YES that has
    no outgoing 'Cites' relation, (2) fetch its references through the
    library tool, and (3) store each reference as a Paper node linked by
    'Cites'.
    """
    # NOTE(review): `max_round` is a GroupChat setting; confirm that
    # initiate_chat honours it here (its turn limit is `max_turns`).
    chat_result = lib_tool_executor.initiate_chat(
        group_chat_manager,
        max_round = 1,
        # 1.5.1: query from note
        # 1.5.2: retrieve references
        # 1.5.3: save the references
        # The fragments are concatenated implicitly, so each one ends with a
        # space; without it the steps fuse together ("...one paperwhich has
        # been...", "...get its DOI.2. If you...").
        message = "Your task is as follows: "
            "1. Please query from the notes (the knowledge graph) to find one paper "
            "which has been Reviewed as YES for PSC research and does not cite any other paper, and get its DOI. "
            "2. If you find such as paper, retrieve all its references, as a list of DOIs. This should be done using the lib tool to access an online library, not from the graph. No need to get its title and abstract at this point. "
            "3. Pass the list of references to the graph tool. For each DOI in the citation list, the graph tool should create a Paper node in the knowledge graph, "
            "with only the property of DOI, and create a 'Cites' relation from the original paper (matched by its DOI) to the newly created paper. "
            "Since there will normally be many citations, "
            "please use a batch of data and the UNWIND keyword to create/merge these nodes in one cypher query. "
            "once the references are saved to the graph, or if you receive no result from the graph tool, stop the conversation"
            
    )

## Refine and review a paper

> 1.6 refine the paper 
> 1.7 review the paper

In [159]:
# def refine_paper_old():
#     chat_result = lib_tool_executor.initiate_chat(
#         group_chat_manager,
#         max_round = 6,
#         message = "Your task is as follows: "
#             "1. Query your note (the graph) to find one paper which does not include abstract. Stop here."
#             "2. Use the lib tool to retrieve the title and abstract of this paper from online libraries."
#             "3. Move back to the note-taking tool, and ask it to save the title and abstract into the existing Paper node identified by the DOI."
#     )


def refine_paper():
    """Fill in the title and abstract of one paper that lacks an abstract.

    The moderator runs three one-turn chats in sequence: find the paper in
    the graph, fetch its details from the library tool, and write them back.
    """
    # 1.6.1: find paper without abstract
    find_step = {
        "recipient": graph_tool_executor,
        "message": "find one paper which does not have an abstract. return its DOI",
        "summary_method": "last_msg",
        "max_turns": 1,
    }
    # 1.6.2: get the abstract
    fetch_step = {
        "recipient": lib_tool_executor,
        "message": "obtain the title and abstract of the paper with DOI",
        "summary_method": "last_msg",
        "max_turns": 1,
    }
    # 1.6.3: update the db
    save_step = {
        "recipient": graph_tool_executor,
        "message": "Update the node of this paper identified by DOI with the title and abstract. If you didn't find valid title or abstract, set the properties to 'no title' and 'no abstract', respectively. If you have problem modifying the graph, set the abstract to 'error - skip' ",
        "summary_method": "last_msg",
        "max_turns": 1,
    }
    moderator.initiate_chats([find_step, fetch_step, save_step])

def review_paper():
    """Review one not-yet-reviewed paper and store the verdict in the graph.

    The moderator runs three one-turn chats in sequence: pick a paper with a
    usable abstract and no review, ask psc_checker for a verdict, and save
    the review.
    """
    # 1.7.1: get paper without review
    pick_step = {
        "recipient": graph_tool_executor,
        "message": "find one paper which has an abstract, the size of its abstract is larger than 20, but it does not have any review. Repeat the DOI, Title and abstract of the found paper",
        "summary_method": "last_msg",
        "max_turns": 1,
    }
    # 1.7.8: review
    check_step = {
        "recipient": psc_checker,
        "message": "Check if this paper is about PSC",
        "summary_method": "last_msg",
        "max_turns": 1,
    }
    # 1.7.9: save result
    store_step = {
        "recipient": graph_tool_executor,
        "message": "Save the Review of this paper into the graph database. Remember to match the paper only using its DOI, not using title or abstract",
        "summary_method": "last_msg",
        "max_turns": 1,
    }
    moderator.initiate_chats([pick_step, check_step, store_step])

# > Feature 2: retrieve all the reviews    
def print_out_reviews():
    """Ask the note taker to list every reviewed paper with its verdict."""
    request = "Please find me all the pairs of Review->Paper, and for each pair, print out the title of the paper, the first 50 charactors of the abstract (if not null), if it is yes or no for PSC"
    # NOTE(review): confirm that initiate_chat honours `max_round` (its turn
    # limit is `max_turns`).
    moderator.initiate_chat(
        graph_tool_executor,
        message = request,
        max_round = 2
    )

# > Feature 1.8: remove too short abstracts.
def remove_invalid_abstract():
    """Ask the note taker to drop abstracts shorter than 20 characters."""
    request = "Please find the papers that has an abstract with less than 20 characters, and remove the their abstract properties. Remember that in CYPHER, size() can be used to count the number of characters of a string; and use 'remove' instead of 'delete' to remove properties."
    # NOTE(review): confirm that initiate_chat honours `max_round` (its turn
    # limit is `max_turns`).
    moderator.initiate_chat(
        graph_tool_executor,
        message = request,
        max_round = 2
    )

# >Feature 1.9: print references from DOI
def print_references(doi:str)->str:
    """Ask the library tool driver directly for the references of ``doi``.

    The chat bypasses the group manager. Nothing is returned despite the
    ``str`` annotation; the result is only visible in the chat output.
    """
    request = "please get the references of paper: {}".format(doi)
    # NOTE(review): confirm that initiate_chat honours `max_round` (its turn
    # limit is `max_turns`).
    lib_tool_executor.initiate_chat(
        lib_tool_driver,
        max_round = 2,
        message = request
    )

# >Feature 1.10: add reference manually
def add_references():
    """Store a reference list typed in by the human into the graph.

    The moderator runs two one-turn chats in sequence: ask the human for the
    list of references, then ask the note taker to save each DOI as a Paper
    with a 'Cites' link from the original paper.
    """
    chat_results = moderator.initiate_chats(
    [
        # 1.10.1: ask the human for the reference list
        {
            "recipient": human_proxy,
            "message": "please give me the list of references",
            "max_turns": 1,
            "summary_method": "last_msg"
        },
        # 1.10.2: save the references
        {
            "recipient": graph_tool_executor,
            # The two literals are concatenated implicitly, so the first one
            # ends with a space; without it they fuse ("...paper.Please").
            "message": "you receive a list of DOIs that were references of one paper. "
            "Please save each DOI in the reference list as a paper, and a 'Cites' link from the original paper to this reference paper",
            "max_turns": 1,
            "summary_method": "last_msg"
        }
    ])

## Feature 5: Chatbot for reading the paper graph

In [None]:
def start_chat():
    """Open an interactive session in which the human queries the paper graph.

    The note taker opens the conversation and the human answers through
    human_proxy.
    """
    greeting = "Tell me what you want to know about the data graph?"
    graph_tool_executor.initiate_chat(human_proxy, message=greeting)

## The "main" entry point of function calls.

> Feature 1: Snowballing

In [None]:

# Driver cell: uncomment the step(s) to run. As written it performs a batch
# of refinements followed by a batch of reviews.

# expand_references()

# Each call handles one paper, so this fills in up to 50 missing abstracts.
for i in range(0,50):
    refine_paper()

# remove_invalid_abstract();    
# Each call handles one paper, so this reviews up to 40 papers.
for i in range(0,40):
    review_paper()


#> Feature 3: get reviews
# print_out_reviews()  


#> Feature 4: add references manually for too large lists
# print_references("doi: 10.1016/j.jhep.2017.07.022")
# add_references()

# expand_references()
# refine_paper()
# review_paper()


[34m
********************************************************************************[0m
[34mStarting a new chat....[0m
[34m
********************************************************************************[0m
[33mmoderator[0m (to graph_tool_executor):

find one paper which does not have an abstract. return its DOI

--------------------------------------------------------------------------------
[34m
********************************************************************************[0m
[34mStarting a new chat....[0m
[34m
********************************************************************************[0m
[33mgraph_tool_executor[0m (to graph_tool_driver):

find one paper which does not have an abstract. return its DOI

--------------------------------------------------------------------------------
[33mgraph_tool_driver[0m (to graph_tool_executor):

[32m***** Suggested tool call (call_GP9iGw4fnDzXWZ1JIRRY6c4y): query_neo4j *****[0m
Arguments: 
{"query":"MATCH (p:Paper) WH