In [None]:
# python-dotenv -> load environment variables from .env file
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai 
!pip3 install --upgrade --quiet python-dotenv
!pip3 install --upgrade --quiet grobidarticleextractor

In [1]:
from dotenv import load_dotenv
from tqdm import tqdm
import os
from GrobidArticleExtractor import GrobidArticleExtractor
from langchain.prompts import PromptTemplate
from entity_extraction import ChatOpenRouter, parse_pdf, get_model_output, process_and_save_output, create_chain

In [2]:
load_dotenv()
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"] 
GROBID_SERVER_URL = os.environ["GROBID_SERVER_URL"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [3]:
grobid_client = GrobidArticleExtractor(GROBID_SERVER_URL)


In [4]:
phillips_text = parse_pdf('data/Phillips_2023_11pg.pdf', grobid_client)


In [5]:
llm_gpt_4o_mini = ChatOpenRouter('openai/gpt-4o-mini')

In [6]:
llm_deepseek_v3 = ChatOpenRouter(model_name='deepseek/deepseek-chat-v3-0324:free')

In [7]:
ner_prompt = PromptTemplate.from_template("Task: Perform Named Entity Recognition (NER) with a deep understanding of neuroscience-specific terminology. Use precise and domain-relevant entity labels.\
Return the output in a training-ready JSON object for spaCy NER training, provide the entity,start position, end position, and respective label.\
Here is an example output: \
```json 'text': 'Drug addiction is a chronic','entities': ['entity': 'Drug addiction','start': 0,'end': 13,'label': 'DISORDER']```\
The text is provided here: {neuroscience_text}")

ner_chain = create_chain(llm_deepseek_v3, ner_prompt)

In [10]:
all_output = []
sections = phillips_text.get('sections', [])

for section in tqdm(sections, desc="Running NER on sections"):
    for chunk in tqdm(section.get('content', []), desc="Processing chunks in content"):
        output = ner_chain.invoke({"neuroscience_text": chunk})
        all_output.append(output)
    for chunk in section.get('subsections', []):
        output = ner_chain.invoke({"neuroscience_text": chunk})
        all_output.append(output)

Processing chunks in content: 100%|██████████| 4/4 [04:04<00:00, 61.08s/it]
Processing chunks in content: 100%|██████████| 1/1 [01:48<00:00, 108.20s/it]
Processing chunks in content: 100%|██████████| 3/3 [04:06<00:00, 82.31s/it]
Processing chunks in content: 100%|██████████| 1/1 [00:31<00:00, 31.73s/it]
Processing chunks in content: 100%|██████████| 3/3 [01:57<00:00, 39.14s/it]
Processing chunks in content: 100%|██████████| 2/2 [02:13<00:00, 66.76s/it]
Processing chunks in content: 100%|██████████| 8/8 [03:51<00:00, 28.99s/it]
Processing chunks in content: 100%|██████████| 1/1 [00:21<00:00, 21.31s/it]
Processing chunks in content: 100%|██████████| 1/1 [00:20<00:00, 20.14s/it]
Processing chunks in content: 100%|██████████| 1/1 [00:57<00:00, 57.80s/it]
Processing chunks in content: 100%|██████████| 1/1 [00:17<00:00, 17.51s/it]
Processing chunks in content: 100%|██████████| 1/1 [00:53<00:00, 53.88s/it]
Running NER on sections: 100%|██████████| 12/12 [21:24<00:00, 107.06s/it]


In [13]:
import json
import re
from datetime import datetime

def process_and_save_output_multiple(output, file_prefix="output", prompt=None):
    """
    Process LLM output(s) to extract JSON, add model metadata and prompt once,
    and save to a single JSON file.

    Args:
        output: A single LLM output or a list of LLM outputs (e.g., AIMessage) each with `content` and `response_metadata`.
        file_prefix: Prefix for the output filename.
        prompt: Optional PromptTemplate object with template and input_variables.

    Returns:
        dict: A dictionary containing metadata and all processed outputs.
    """
    if not isinstance(output, list):
        output = [output]

    processed_outputs = []

    for index, item in enumerate(output):
        match = re.search(r"```(?:json)?\n(.*?)```", item.content, re.DOTALL)
        if match:
            json_str = match.group(1)
            try:
                json_obj = json.loads(json_str)
                processed_outputs.append(json_obj)
            except json.JSONDecodeError:
                print("Warning: Failed to decode JSON.")
        else:
            print(f"Warning: No JSON block found in section {index}.")

    if processed_outputs:
        model_name = output[0].response_metadata.get("model_name", "unknown")
        if prompt:
            prompt_metadata = {
                "template": prompt.template,
                "input_variables": prompt.input_variables
            }
        result = {
            "model_name": model_name,
            "prompt": prompt_metadata,
            "output": processed_outputs
        }


        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        safe_model_name = model_name.replace('/', '_')
        filename = f'{file_prefix}_{safe_model_name}_{timestamp}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)

        print(f"Saved {len(processed_outputs)} outputs to {filename}")
        return result
    else:
        print("No valid outputs to save.")
        return None



In [16]:
process_and_save_output_multiple(all_output, file_prefix = 'zero-shot-ner_smaller-chunks', prompt = ner_prompt)

Saved 24 outputs to zero-shot-ner_smaller-chunks_deepseek_deepseek-chat-v3-0324:free_20250502_184735.json


{'model_name': 'deepseek/deepseek-chat-v3-0324:free',
 'prompt': {'template': "Task: Perform Named Entity Recognition (NER) with a deep understanding of neuroscience-specific terminology. Use precise and domain-relevant entity labels.Return the output in a training-ready JSON object for spaCy NER training, provide the entity,start position, end position, and respective label.Here is an example output: ```json 'text': 'Drug addiction is a chronic','entities': ['entity': 'Drug addiction','start': 0,'end': 13,'label': 'DISORDER']```The text is provided here: {neuroscience_text}",
  'input_variables': ['neuroscience_text']},
 'output': [{'text': 'Drug addiction is a chronic, complex neuropsychiatric disorder characterized by a loss of control of drug-taking behaviors. The current drug addiction and overdose epidemic in the United States has been worsened by the COVID-19 pandemic as people struggle with social isolation and economic distress (Cisneros and Cunningham, 2021). Much of the curr