In [15]:
from tqdm import tqdm

In [1]:
from dotenv import load_dotenv
import os
from GrobidArticleExtractor import GrobidArticleExtractor
from langchain.prompts import PromptTemplate
from entity_extraction import ChatOpenRouter, parse_pdf, get_model_output, process_and_save_output, create_chain

In [2]:
load_dotenv()
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"] 
GROBID_SERVER_URL = os.environ["GROBID_SERVER_URL"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [3]:
grobid_client = GrobidArticleExtractor(GROBID_SERVER_URL)


In [4]:
phillips_text = parse_pdf('data/Phillips_2023_11pg.pdf', grobid_client)


In [5]:
llm_gpt_4o_mini = ChatOpenRouter('openai/gpt-4o-mini')

In [11]:
llm_deepseek_v3 = ChatOpenRouter(model_name='deepseek/deepseek-chat-v3-0324:free')

In [6]:
ner_prompt = PromptTemplate.from_template("Task: Perform Named Entity Recognition (NER) with a deep understanding of neuroscience-specific terminology. Use precise and domain-relevant entity labels.\
Return the output in a training-ready JSON object for spaCy NER training, provide the entity,start position, end position, and respective label.\
Here is an example output: \
```json 'text': 'Drug addiction is a chronic','entities': ['entity': 'Drug addiction','start': 0,'end': 13,'label': 'DISORDER']```\
The text is provided here: {neuroscience_text}")

In [12]:
ner_chain = create_chain(llm_deepseek_v3, ner_prompt)

In [8]:
phillips_text.get('sections')

[{'heading': 'Introduction',
  'content': 'Drug addiction is a chronic, complex neuropsychiatric disorder characterized by a loss of control of drug-taking behaviors. The current drug addiction and overdose epidemic in the United States has been worsened by the COVID-19 pandemic as people struggle with social isolation and economic distress (Cisneros and Cunningham, 2021). Much of the current addiction epidemic and surge in overdose deaths has been attributed to the use of opioids and, in particular, synthetic opioids such as fentanyl. However, drug overdose deaths associated with psychostimulants rose 50  between 2019 and 2020 (Hedegaard et al., 2021). Furthermore, drug overdose deaths associated with cocaine have increased 3-fold since 1999 (Hedegaard et al., 2021;Ciccarone, 2021). While pharmacological treatments exist for opioid use disorder, no such treatments are available for stimulant use disorder. Thus, continued research on the molecular adaptations that occur following expos

In [16]:
all_output = []
sections = phillips_text.get('sections', [])

for section in tqdm(sections, desc="Running NER on sections"):
    chunk = section.get('content')
    output = ner_chain.invoke({"neuroscience_text": chunk})
    all_output.append(output)

Running NER on sections: 100%|██████████| 12/12 [12:36<00:00, 63.00s/it]


In [17]:
all_output

[AIMessage(content='Here is the spaCy NER training-ready JSON output with neuroscience-specific entity labels:\n\n```json\n{\n  "text": "Drug addiction is a chronic, complex neuropsychiatric disorder characterized by a loss of control of drug-taking behaviors. The current drug addiction and overdose epidemic in the United States has been worsened by the COVID-19 pandemic as people struggle with social isolation and economic distress (Cisneros and Cunningham, 2021). Much of the current addiction epidemic and surge in overdose deaths has been attributed to the use of opioids and, in particular, synthetic opioids such as fentanyl. However, drug overdose deaths associated with psychostimulants rose 50% between 2019 and 2020 (Hedegaard et al., 2021). Furthermore, drug overdose deaths associated with cocaine have increased 3-fold since 1999 (Hedegaard et al., 2021; Ciccarone, 2021). While pharmacological treatments exist for opioid use disorder, no such treatments are available for stimulant

In [28]:
import json
import re
from datetime import datetime

def process_and_save_output_multiple(output, file_prefix="output", prompt=None):
    """
    Process LLM output(s) to extract JSON, add model metadata and prompt once,
    and save to a single JSON file.

    Args:
        output: A single LLM output or a list of LLM outputs (e.g., AIMessage) each with `content` and `response_metadata`.
        file_prefix: Prefix for the output filename.
        prompt: Optional PromptTemplate object with template and input_variables.

    Returns:
        dict: A dictionary containing metadata and all processed outputs.
    """
    if not isinstance(output, list):
        output = [output]

    processed_outputs = []

    for index, item in enumerate(output):
        match = re.search(r"```(?:json)?\n(.*?)```", item.content, re.DOTALL)
        if match:
            json_str = match.group(1)
            try:
                json_obj = json.loads(json_str)
                processed_outputs.append(json_obj)
            except json.JSONDecodeError:
                print("Warning: Failed to decode JSON.")
        else:
            print("Warning: No JSON block found in one of the messages.")

    if processed_outputs:
        model_name = output[0].response_metadata.get("model_name", "unknown")
        if prompt:
            prompt_metadata = {
                "template": prompt.template,
                "input_variables": prompt.input_variables
            }
        result = {
            "model_name": model_name,
            "prompt": prompt_metadata,
            "output": processed_outputs
        }


        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        safe_model_name = model_name.replace('/', '_')
        filename = f'{file_prefix}_{safe_model_name}_{timestamp}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)

        print(f"Saved {len(processed_outputs)} outputs to {filename}")
        return result
    else:
        print("No valid outputs to save.")
        return None



In [29]:
process_and_save_output_multiple(all_output, file_prefix = 'zero-shot-ner', prompt = ner_prompt)

Saved 8 outputs to zero-shot-ner_deepseek_deepseek-chat-v3-0324:free_20250502_170016.json


{'model_name': 'deepseek/deepseek-chat-v3-0324:free',
 'prompt': {'template': "Task: Perform Named Entity Recognition (NER) with a deep understanding of neuroscience-specific terminology. Use precise and domain-relevant entity labels.Return the output in a training-ready JSON object for spaCy NER training, provide the entity,start position, end position, and respective label.Here is an example output: ```json 'text': 'Drug addiction is a chronic','entities': ['entity': 'Drug addiction','start': 0,'end': 13,'label': 'DISORDER']```The text is provided here: {neuroscience_text}",
  'input_variables': ['neuroscience_text']},
 'output': [{'text': 'The integration of over 39,000 nuclei from rats that underwent different cocaine exposure paradigms identified two populations of Drd1-expressing MSNs (Drd1-MSN-1 and Drd1-MSN-2; Fig. 2a). To begin understanding the distinct gene expression signatures of these Drd1-MSN populations, we conducted a differential expression analysis using pseudobulked