In [1]:
import os
from litellm import completion
from dotenv import load_dotenv
from tqdm import tqdm
import textwrap
from GrobidArticleExtractor import GrobidArticleExtractor
from langchain.prompts import PromptTemplate
from entity_extraction import ChatOpenRouter, parse_pdf,create_chain, run_chain_on_small_chunks,batch_extract_and_format

In [2]:
# Load variables from a local .env file (if any) into the process environment
# before they are read below.
load_dotenv()
# Indexing os.environ (rather than .get) makes a missing variable fail here
# with a KeyError instead of surfacing later as a None value.
# NOTE(review): OPENROUTER_API_KEY is not referenced again in the visible
# cells; presumably ChatOpenRouter reads it from the environment — confirm.
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"] 
GROBID_SERVER_URL = os.environ["GROBID_SERVER_URL"]

In [3]:
# Article extractor bound to the GROBID server URL taken from the environment;
# it is handed to parse_pdf in the next cell.
grobid_client = GrobidArticleExtractor(GROBID_SERVER_URL)


In [4]:
# Parse the Phillips 2023 PDF through the GROBID client. The result is the
# input every NER chain below is run on.
# NOTE(review): the path is relative to the notebook's working directory, and
# the return shape of parse_pdf (plain text vs. per-section structure) is not
# visible here — confirm against entity_extraction.parse_pdf.
phillips_text = parse_pdf('data/Phillips_2023_11pg.pdf', grobid_client)

In [5]:
# Prompt shared by every model run below. It asks the model to return the
# input text unchanged except for in-place annotations of the form
# (entity)[LABEL]; no fixed label set is supplied, only the UPPERCASE,
# underscore-style naming rule.
# NOTE(review): the worked example tags "neuromodulator" as HORMONE — confirm
# this is the label you want the models to imitate.
in_place_tagging_prompt = PromptTemplate(
    # The single placeholder in the template; filled with the text to annotate.
    input_variables=["neuroscience_text"],
    # The backslash after the opening quotes drops the leading newline, and
    # textwrap.dedent strips the common 4-space indent so the prompt starts at
    # column 0 when sent to the model.
    template=textwrap.dedent("""\
    **Background**
    You are a highly skilled neuroscience expert with extensive experience in named entity recognition and research publication entity annotation.
    
    **Task**
    Meticulously extract and classify all relevant entities into the most appropriate and specific category from the provided neuroscience text. 
    The list of entities extracted should be exhaustive and comprehensive. You should identify and label all entities, including duplicates.                                                             
    Please ensure that all entities are accurately identified and classified according to their role within the neuroscience domain. 
    IMPORTANT: DO NOT modify the input text in any way. Use the text as is, without any alterations or corrections. 
    
    **Output Format**
    Return only the full text with in‑place annotations.
    The format for the annotations is as follows:
    (entity)[LABEL]
    Where entity is the extracted entity (not altered or modified), and LABEL is an UPPERCASE, underscore‑style category.
    All extracted entities should be in parentheses, followed by the label in square brackets.
    
    **Example**
    Input Text: "Histamine is a conserved neuromodulator"
    Output: "(Histamine)[AMINE] is a conserved (neuromodulator)[HORMONE]"
                                 
    **Input Text**:
    {neuroscience_text}
    """
    )
)          

In [6]:
# One ChatOpenRouter client per model under comparison. The model_name values
# are the identifiers passed to ChatOpenRouter; each client is consumed by its
# own NER cell further down.
llm_gpt_4o_mini = ChatOpenRouter(model_name="openai/gpt-4o-mini")
llm_gemini_20_flash = ChatOpenRouter(model_name="google/gemini-2.0-flash-001")
llm_claude_37_sonnet = ChatOpenRouter(model_name="anthropic/claude-3.7-sonnet")
# The ":free" suffix is part of the model identifier string used for this client.
llm_deepseek_v3 = ChatOpenRouter(model_name="deepseek/deepseek-chat-v3-0324:free")


In [None]:
# GPT-4o-mini run: pair the model with the shared tagging prompt, run the
# chain over the parsed Phillips paper, then pass the raw output to
# batch_extract_and_format with saving enabled.
# NOTE(review): this cell has no execution count or recorded output in the
# saved notebook — re-run it so all four models have results.
gpt_ner_chain = create_chain(llm=llm_gpt_4o_mini, prompt=in_place_tagging_prompt)
phillips_gpt_ner_output = run_chain_on_small_chunks(phillips_text, gpt_ner_chain)
batch_extract_and_format(
    phillips_gpt_ner_output,
    model_name="gpt-4o-mini",
    file_prefix="phillips",
    save_to_file=True,
    prompt=in_place_tagging_prompt,
)

In [7]:
# Gemini 2.0 Flash run: pair the model with the shared tagging prompt, run the
# chain over the parsed Phillips paper, then pass the raw output to
# batch_extract_and_format with saving enabled.
gemini_ner_chain = create_chain(llm=llm_gemini_20_flash, prompt=in_place_tagging_prompt)
phillips_gemini_ner_output = run_chain_on_small_chunks(phillips_text, gemini_ner_chain)
batch_extract_and_format(
    phillips_gemini_ner_output,
    model_name="gemini-2.0-flash-001",
    file_prefix="phillips",
    save_to_file=True,
    prompt=in_place_tagging_prompt,
)

Running NER on sections: 100%|██████████| 12/12 [01:30<00:00,  7.55s/it]

Results saved to phillips_gemini-2.0-flash-001_2025-05-08_23-42-35.json





In [8]:
# Claude 3.7 Sonnet run: pair the model with the shared tagging prompt, run the
# chain over the parsed Phillips paper, then pass the raw output to
# batch_extract_and_format with saving enabled.
claude_ner_chain = create_chain(llm=llm_claude_37_sonnet, prompt=in_place_tagging_prompt)
phillips_claude_ner_output = run_chain_on_small_chunks(phillips_text, claude_ner_chain)
batch_extract_and_format(
    phillips_claude_ner_output,
    model_name="claude-3.7-sonnet",
    file_prefix="phillips",
    save_to_file=True,
    prompt=in_place_tagging_prompt,
)

Running NER on sections: 100%|██████████| 12/12 [04:01<00:00, 20.16s/it]

Results saved to phillips_claude-3.7-sonnet_2025-05-08_23-47-46.json





In [9]:
# DeepSeek V3 run: pair the model with the shared tagging prompt, run the
# chain over the parsed Phillips paper, then pass the raw output to
# batch_extract_and_format with saving enabled.
deepseek_ner_chain = create_chain(llm=llm_deepseek_v3, prompt=in_place_tagging_prompt)
phillips_deepseek_ner_output = run_chain_on_small_chunks(phillips_text, deepseek_ner_chain)
batch_extract_and_format(
    phillips_deepseek_ner_output,
    model_name="deepseek-chat-v3-0324",
    file_prefix="phillips",
    save_to_file=True,
    prompt=in_place_tagging_prompt,
)

Running NER on sections: 100%|██████████| 12/12 [20:48<00:00, 104.03s/it]

Results saved to phillips_deepseek-chat-v3-0324_2025-05-09_00-09-01.json



