In [1]:
import os
from litellm import completion
from dotenv import load_dotenv
from tqdm import tqdm
import textwrap
from GrobidArticleExtractor import GrobidArticleExtractor
from langchain.prompts import PromptTemplate
from entity_extraction import ChatOpenRouter, parse_pdf, get_model_output, process_and_save_output, create_chain, run_chain_on_small_chunks, process_and_save_output_multiple

In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# Required settings: indexing os.environ raises KeyError when a variable is missing,
# so a misconfigured environment fails here rather than later in the notebook.
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"] 
GROBID_SERVER_URL = os.environ["GROBID_SERVER_URL"]

In [3]:
# Extractor bound to the configured GROBID server; passed to parse_pdf for each PDF.
grobid_client = GrobidArticleExtractor(GROBID_SERVER_URL)


In [4]:
# Parse the two sample papers through the GROBID client.
# NOTE(review): paths are relative to the notebook's working directory.
lin_text = parse_pdf('data/Lin_2023_12pg.pdf', grobid_client)
phillips_text = parse_pdf('data/Phillips_2023_11pg.pdf', grobid_client)

In [25]:
# Use the first content entry of the first section as a small test passage.
# Assumes parse_pdf returns a dict with a "sections" list whose items carry a
# "content" list — TODO confirm; a missing key would make .get return None and fail on [0].
test_input_text = phillips_text.get("sections")[0].get("content")[0]

In [100]:
# Display the selected passage that is fed to the tagging chain.
test_input_text

'Drug addiction is a chronic, complex neuropsychiatric disorder characterized by a loss of control of drug-taking behaviors. The current drug addiction and overdose epidemic in the United States has been worsened by the COVID-19 pandemic as people struggle with social isolation and economic distress (Cisneros and Cunningham, 2021). Much of the current addiction epidemic and surge in overdose deaths has been attributed to the use of opioids and, in particular, synthetic opioids such as fentanyl. However, drug overdose deaths associated with psychostimulants rose 50  between 2019 and 2020 (Hedegaard et al., 2021). Furthermore, drug overdose deaths associated with cocaine have increased 3-fold since 1999 (Hedegaard et al., 2021;Ciccarone, 2021). While pharmacological treatments exist for opioid use disorder, no such treatments are available for stimulant use disorder. Thus, continued research on the molecular adaptations that occur following exposure to cocaine, and the cell types that ma

In [80]:
# --- Disabled alternative version of the tagging prompt -----------------------
# Kept commented out; the active `in_place_tagging_prompt` is defined further down
# in this cell under the same name.
# in_place_tagging_prompt = PromptTemplate(
#     input_variables=["neuroscience_text"],
#     template=textwrap.dedent("""\
#     **Background**:
#     You are a highly skilled neuroscience expert with extensive experience in named entity recognition and research publication entity annotation.

#     **Task**:
#     Given the input text, identify **all** relevant entities and insert each directly into the text in the format:
#     (entity)[LABEL]
#     Where LABEL is an UPPERCASE, underscore‑style category.  
#     The list of entities extracted should be exhaustive and comprehensive. You should identify and label all entities, including duplicates.                                                             
#     Please ensure that all entities are accurately identified and classified according to their role within the neuroscience domain. 
#     Do **not** alter any characters in the original text other than adding these annotations.
    
#     **Output**:
#     Return only the full text with in‑place annotations.  
    
#     **Input Text**:
#     {neuroscience_text}
#     """
#     )
# )

# --- Active tagging prompt ----------------------------------------------------
# Asks the model to return the input text unchanged except for in-place
# annotations of the form (entity)[LABEL]. The only template variable is
# {neuroscience_text}. textwrap.dedent removes the common leading indentation
# of the literal so the rendered prompt starts at column 0.
in_place_tagging_prompt = PromptTemplate(
    input_variables=["neuroscience_text"],
    template=textwrap.dedent("""\
    **Background**
    You are a highly skilled neuroscience expert with extensive experience in named entity recognition and research publication entity annotation.
    
    **Task**
    Meticulously extract and classify all relevant entities into the most appropriate and specific category from the provided neuroscience text. 
    The list of entities extracted should be exhaustive and comprehensive. You should identify and label all entities, including duplicates.                                                             
    Please ensure that all entities are accurately identified and classified according to their role within the neuroscience domain. 
    IMPORTANT: DO NOT modify the input text in any way. Use the text as is, without any alterations or corrections. 
    
    **Output Format**
    Return only the full text with in‑place annotations.
    The format for the annotations is as follows:
    (entity)[LABEL]
    Where entity is the extracted entity (not altered or modified), and LABEL is an UPPERCASE, underscore‑style category.
    All extracted entities should be in parentheses, followed by the label in square brackets.
    
    **Example**
    Input Text: "Histamine is a conserved neuromodulator"
    Output: "(Histamine)[AMINE] is a conserved (neuromodulator)[HORMONE]"
                                 
    **Input Text**:
    {neuroscience_text}
    """
    )
)
                             

In [97]:
# Prompt for the second pass: given the raw passage and its in-place annotated
# version, ask the model for a JSON object listing each entity with its label
# and character offsets in the raw text.
#
# Template variables: {raw_text}, {annotated_text}. Literal braces in the JSON
# samples are doubled ({{ }}) so the template formatter does not treat them as
# placeholders.
#
# The schema sample must itself be well-formed apart from the <placeholders>:
# the output is later parsed with the strict json.loads, so a trailing comma or
# an unclosed placeholder that the model copies would make parsing fail.
#
# NOTE(review): the worked example uses INCLUSIVE end offsets
# ("Histamine" -> start 0, end 8). Confirm that downstream consumers expect
# inclusive rather than Python-slice (exclusive) ends.
# NOTE(review): the **Background** section reads like pasted model reasoning
# rather than an instruction; left untouched here, but worth revisiting.
location_extraction_prompt = PromptTemplate(
    input_variables=["raw_text", "annotated_text"],
    template=textwrap.dedent("""\
    **Background**:
    The user needs a JSON output with entities, labels, and positions from a text. To do this, I’ll search for occurrences of each entity, compute their start and end character positions in the text, and then output them as JSON objects with "entity," "label," "start," and "end." They want duplicates preserved. I'll work off the list of entities provided, but I’ll avoid any new labels unless they were explicitly mentioned. Let’s make sure the positions are accurate and get the right JSON format!
    
    To handle this, I need to extract the entity names and their start and end indices from the provided text. I'll use the exact text string and manually define a list of entities. Using regular expressions with word boundaries will help in accurately finding all occurrences, including duplicates. I’ll define the labels too, and then compute the positions with code, ensuring the correct start and end indices for each entity. Let’s implement this using Python for accuracy!
    **Goal**:
    For each extracted entity, find its exact character positions in the original text.
    **Task:**
    From the annotated text with in‑text annotations, produce a JSON object containing:
    1. "text": raw text  
    2. "entities": an array where each object has:
    • "entity": exact substring  
    • "label": the UPPERCASE category  
    • "start": start character index in the raw text  
    • "end": end character index in the raw text  

    You can use the raw text to help cross reference the start and end indices. Do not modify the raw text only compute start/end based on it.
    
                                                     
    **Output**:
    Return the result in the following JSON format:
    ```json
    {{
      "text": "<original_input_text>",
      "entities": [
        {{
          "entity": "<entity>",
          "start": <start_index>,
          "end": <end_index>,
          "label": "<ENTITY_TYPE>"
        }},
        ...
      ]
    }}
    ```
    **Example**
    Raw Text: "Histamine is a conserved neuromodulator"
    Annotated Text: "(Histamine)[AMINE] is a conserved (neuromodulator)[HORMONE]"
    Output: 
    ```json
    {{
      "text": "Histamine is a conserved neuromodulator",
      "entities": [
        {{
          "entity": "Histamine",
          "start": 0,
          "end": 8,
          "label": "AMINE"
        }},
        {{
          "entity": "neuromodulator",
          "start": 25,
          "end": 38,
          "label": "HORMONE"
        }}
      ]
    }}
    ```
    **Raw Text**:
    {raw_text}
    **Annotated Text**:
    {annotated_text}
    """
    )
)

In [55]:
# Chat model handle for 'openai/gpt-4o-mini' via the project's ChatOpenRouter wrapper.
# NOTE(review): presumably picks up OPENROUTER_API_KEY from the environment — verify in entity_extraction.
llm_gpt_4o_mini = ChatOpenRouter(model_name='openai/gpt-4o-mini')

In [98]:
# One chain per prompt, both backed by the same model:
#   1) tag entities in place, 2) turn the tagged text into JSON with offsets.
in_place_tagging_chain = create_chain(llm_gpt_4o_mini, in_place_tagging_prompt)
location_extraction_chain = create_chain(llm_gpt_4o_mini, location_extraction_prompt)


In [82]:
# Run the tagging pass on the test passage.
# NOTE(review): the passage is passed as a bare string here, while the location
# chain is invoked with a dict keyed by variable name — confirm create_chain
# accepts both forms for a single-variable prompt.
test_phillips_output = in_place_tagging_chain.invoke(test_input_text)

In [83]:
# The model's reply text (the passage with (entity)[LABEL] annotations) lives in .content.
annotated_text = test_phillips_output.content
# Display it for visual inspection.
annotated_text

'(Drug addiction)[DISORDER] is a chronic, complex (neuropsychiatric disorder)[DISORDER] characterized by a loss of control of (drug-taking behaviors)[BEHAVIOR]. The current (drug addiction)[DISORDER] and overdose epidemic in the (United States)[LOCATION] has been worsened by the (COVID-19 pandemic)[EVENT] as people struggle with (social isolation)[SOCIAL_CONDITION] and (economic distress)[ECONOMIC_CONDITION] (Cisneros and Cunningham, 2021). Much of the current (addiction epidemic)[EVENT] and surge in (overdose deaths)[EVENT] has been attributed to the use of (opioids)[DRUG] and, in particular, synthetic (opioids)[DRUG] such as (fentanyl)[DRUG]. However, (drug overdose deaths)[EVENT] associated with (psychostimulants)[DRUG] rose 50 between 2019 and 2020 (Hedegaard et al., 2021). Furthermore, (drug overdose deaths)[EVENT] associated with (cocaine)[DRUG] have increased 3-fold since 1999 (Hedegaard et al., 2021;Ciccarone, 2021). While (pharmacological treatments)[TREATMENT] exist for (opio

In [99]:
# Second pass: supply both template variables so the model can map each
# annotation back to character offsets in the raw passage.
json_output = location_extraction_chain.invoke({"raw_text": test_input_text, "annotated_text": annotated_text})

In [None]:
# Display the raw result of the location-extraction pass.
# NOTE(review): the saved output below is a plain string repr, whereas
# process_and_save_output reads `.content` from this object — the saved output
# may be stale from an earlier run; re-run top to bottom to confirm.
json_output

'```json\n{\n  "text": "Drug addiction is a chronic, complex neuropsychiatric disorder characterized by a loss of control of drug-taking behaviors. The current drug addiction and overdose epidemic in the United States has been worsened by the COVID-19 pandemic as people struggle with social isolation and economic distress (Cisneros and Cunningham, 2021). Much of the current addiction epidemic and surge in overdose deaths has been attributed to the use of opioids and, in particular, synthetic opioids such as fentanyl. However, drug overdose deaths associated with psychostimulants rose 50 between 2019 and 2020 (Hedegaard et al., 2021). Furthermore, drug overdose deaths associated with cocaine have increased 3-fold since 1999 (Hedegaard et al., 2021;Ciccarone, 2021). While pharmacological treatments exist for opioid use disorder, no such treatments are available for stimulant use disorder. Thus, continued research on the molecular adaptations that occur following exposure to cocaine, and 

In [94]:
import json
import re
def process_and_save_output(output, file_prefix="output", prompt=None):
    """
    Extract the fenced JSON payload from an LLM response and attach metadata.

    The first Markdown code fence (three backticks, optionally tagged ``json``)
    found in ``output.content`` is parsed as JSON. The model name and, if given,
    the prompt definition are added to the parsed object, which is printed and
    returned.

    Note: this definition shadows the ``process_and_save_output`` imported from
    ``entity_extraction`` in the notebook's import cell.

    Args:
        output: LLM output object exposing ``content`` (str) and
            ``response_metadata`` (mapping with a ``"model_name"`` key).
        file_prefix: Prefix for the output filename (default: "output").
            Currently unused because writing to disk is disabled below.
        prompt: Optional prompt object; when truthy, its ``template`` and
            ``input_variables`` are stored under the ``"prompt"`` key.

    Returns:
        dict: Parsed JSON object with the added metadata, or None when
        ``output.content`` contains no fenced block.

    Raises:
        json.JSONDecodeError: If the fenced block is not valid JSON.
    """
    match = re.search(r"```(?:json)?\n(.*?)```", output.content, re.DOTALL)
    if match is None:
        # No fenced block in the reply: nothing to parse.
        return None

    json_obj = json.loads(match.group(1))
    # Add model name to JSON object
    json_obj["model_name"] = output.response_metadata["model_name"]
    if prompt:
        # Record which prompt produced this result alongside the data.
        json_obj["prompt"] = {
            "template": prompt.template,
            "input_variables": prompt.input_variables,
        }
    print(json.dumps(json_obj, indent=2))
    # Saving to disk is intentionally disabled; re-enabling it also requires
    # `from datetime import datetime`.
    # timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    # model_name = output.response_metadata["model_name"].replace('/', '_')
    # filename = f'{file_prefix}_{model_name}_{timestamp}.json'
    # with open(filename, 'w', encoding='utf-8') as f:
    #     json.dump(json_obj, f, indent=2)
    return json_obj

In [95]:
# Parse the location-extraction reply, attach the model name and the prompt
# that produced it, and print the resulting JSON.
process_and_save_output(json_output, file_prefix="output", prompt=location_extraction_prompt)

{
  "text": "Drug addiction is a chronic, complex neuropsychiatric disorder characterized by a loss of control of drug-taking behaviors. The current drug addiction and overdose epidemic in the United States has been worsened by the COVID-19 pandemic as people struggle with social isolation and economic distress (Cisneros and Cunningham, 2021). Much of the current addiction epidemic and surge in overdose deaths has been attributed to the use of opioids and, in particular, synthetic opioids such as fentanyl. However, drug overdose deaths associated with psychostimulants rose 50 between 2019 and 2020 (Hedegaard et al., 2021). Furthermore, drug overdose deaths associated with cocaine have increased 3-fold since 1999 (Hedegaard et al., 2021;Ciccarone, 2021). While pharmacological treatments exist for opioid use disorder, no such treatments are available for stimulant use disorder. Thus, continued research on the molecular adaptations that occur following exposure to cocaine, and the cell ty