In [2]:
import os
from litellm import completion
from dotenv import load_dotenv
from tqdm import tqdm
import textwrap
from GrobidArticleExtractor import GrobidArticleExtractor
from langchain.prompts import PromptTemplate
from entity_extraction import ChatOpenRouter, parse_pdf, get_model_output, process_and_save_output, create_chain, run_chain_on_small_chunks, process_and_save_output_multiple

In [3]:
# Load variables from a local .env file (if any) into the process environment,
# then read the two settings this notebook needs. Indexing os.environ directly
# raises KeyError immediately when a variable is missing, so a misconfigured
# environment fails here rather than midway through a long model run.
load_dotenv()
# NOTE(review): OPENROUTER_API_KEY is not passed to anything in the visible
# cells; presumably ChatOpenRouter reads the key from the environment itself
# and this line only acts as an early presence check — TODO confirm.
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"] 
GROBID_SERVER_URL = os.environ["GROBID_SERVER_URL"]

In [4]:
# Client for the GROBID server configured via GROBID_SERVER_URL; it is handed
# to parse_pdf() below to extract the article text from the PDF.
grobid_client = GrobidArticleExtractor(GROBID_SERVER_URL)


In [None]:
# Parse the Lin 2023 paper with GROBID. `lin_text` is the shared input for
# every model run below, so this cell must be executed before any of them.
# NOTE(review): the path is relative to the notebook's working directory, and
# the structure of the return value is defined by entity_extraction.parse_pdf
# (not visible here) — the progress bars below report 10 sections.
lin_text = parse_pdf('data/Lin_2023_12pg.pdf', grobid_client)

In [6]:
# Prompt shared by every model run in this notebook. It asks the model to act
# as a neuroscience NER annotator and to return spaCy-style JSON
# (text + list of {entity, start, end, label}).
#
# Formatting notes:
#   * `{neuroscience_text}` is the only template variable; the doubled braces
#     (`{{` / `}}`) are escapes that render as literal JSON braces under
#     PromptTemplate's default f-string formatting.
#   * textwrap.dedent strips the common 4-space source indentation, and the
#     backslash after the opening quotes suppresses a leading blank line.
#
# Fixes relative to the previous template text:
#   * the label placeholder was unterminated ("<ENTITY_TYPE") and is now
#     "<ENTITY_TYPE>", consistent with the other placeholders;
#   * the trailing comma after the last key (plus a stray blank line) made the
#     example object invalid JSON, which models tend to imitate;
#   * the "**Input Text:**" line carried one extra space of indentation, so it
#     survived dedent with a leading space unlike the other section headers.
#
# NOTE(review): outputs saved from runs that used the earlier template text
# are not strictly comparable with runs that use this corrected one.
ner_prompt = PromptTemplate(
    input_variables=["neuroscience_text"],
    template=textwrap.dedent("""\
    **Background**
    You are a highly skilled neuroscience expert with extensive experience in named entity recognition and research publication entity annotation.

    **Task**
    Meticulously extract and classify all relevant entities into the most appropriate and specific category from the provided neuroscience text. 
    The list of entities extracted should be exhaustive and comprehensive. You should identify and label all entities, including duplicates.                                                             
    Please ensure that all entities are accurately identified and classified according to their role within the neuroscience domain. 
    Be precise with the start and end character positions.

    **Output Format**
    The final output should be formatted as a training-ready JSON object suitable for spaCy's Named Entity Recognition (NER) training.
    Each object in the 'entities' list should contain the 'entity', 'start', 'end', and 'label' keys.
    All entity labels must be in UPPERCASE and use underscores for spaces (e.g., 'BRAIN_REGION' not 'brain region').
    Return the result in the following JSON format:
    ```json
    {{
      "text": "<original_input_text>",
      "entities": [
        {{
          "entity": "<surface_form>",
          "start": <start_index>,
          "end": <end_index>,
          "label": "<ENTITY_TYPE>"
        }},
        ...
      ]
    }}
    ```

    **Input Text:** {neuroscience_text}
    """
    )
)

In [12]:
# --- OpenAI GPT-4o-mini (via OpenRouter) ---
# Wrap the model, bind it to the shared NER prompt, run the chain over the
# parsed paper, then post-process and persist the results. The prompt is
# passed to the save step as well.
llm_gpt_4o_mini = ChatOpenRouter(
    model_name='openai/gpt-4o-mini',
)
ner_chain_gpt = create_chain(
    llm_gpt_4o_mini,
    ner_prompt,
)
lin_output_gpt = run_chain_on_small_chunks(
    lin_text,
    ner_chain_gpt,
)
lin_formatted_output_gpt = process_and_save_output_multiple(
    lin_output_gpt,
    file_prefix='lin',
    prompt=ner_prompt,
)

Running NER on sections: 100%|██████████| 10/10 [10:53<00:00, 65.40s/it]

Saved 30 outputs to lin_openai_gpt-4o-mini_20250507_192255.json





In [13]:
# --- Google Gemini 2.0 Flash (via OpenRouter) ---
# Same pipeline as the other model cells: model -> chain -> run -> save.
llm_gemini_20_flash = ChatOpenRouter(
    model_name='google/gemini-2.0-flash-001',
)
ner_chain_gemini = create_chain(
    llm_gemini_20_flash,
    ner_prompt,
)
lin_output_gemini = run_chain_on_small_chunks(
    lin_text,
    ner_chain_gemini,
)
lin_formatted_output_gemini = process_and_save_output_multiple(
    lin_output_gemini,
    file_prefix='lin',
    prompt=ner_prompt,
)

Running NER on sections: 100%|██████████| 10/10 [04:27<00:00, 26.71s/it]

Saved 30 outputs to lin_google_gemini-2.0-flash-001_20250507_192808.json





In [14]:
# --- DeepSeek V3 0324, free tier (via OpenRouter) ---
# Same pipeline as the other model cells: model -> chain -> run -> save.
# NOTE(review): the ':free' suffix of the model id ends up in the saved file
# name (see the recorded output of this cell); ':' is not a valid filename
# character on Windows — confirm how process_and_save_output_multiple builds
# the name if this notebook needs to run there.
llm_deepseek_v3 = ChatOpenRouter(
    model_name='deepseek/deepseek-chat-v3-0324:free',
)
ner_chain_deepseek = create_chain(
    llm_deepseek_v3,
    ner_prompt,
)
lin_output_deepseek = run_chain_on_small_chunks(
    lin_text,
    ner_chain_deepseek,
)
lin_formatted_output_deepseek = process_and_save_output_multiple(
    lin_output_deepseek,
    file_prefix='lin',
    prompt=ner_prompt,
)

Running NER on sections: 100%|██████████| 10/10 [34:40<00:00, 208.08s/it]

Saved 30 outputs to lin_deepseek_deepseek-chat-v3-0324:free_20250507_200333.json





In [30]:
# --- Anthropic Claude 3.7 Sonnet (via OpenRouter) ---
# Same pipeline as the other model cells: model -> chain -> run -> save.
llm_claude_37_sonnet = ChatOpenRouter(
    model_name='anthropic/claude-3.7-sonnet',
)
ner_chain_claude = create_chain(
    llm_claude_37_sonnet,
    ner_prompt,
)
lin_output_claude = run_chain_on_small_chunks(
    lin_text,
    ner_chain_claude,
)
lin_formatted_output_claude = process_and_save_output_multiple(
    lin_output_claude,
    file_prefix='lin',
    prompt=ner_prompt,
)

Running NER on sections: 100%|██████████| 10/10 [15:25<00:00, 92.59s/it]

Saved 30 outputs to lin_anthropic_claude-3.7-sonnet_20250507_214630.json





In [None]:
# --- Qwen3 32B (via OpenRouter) ---
# Builds the model wrapper, binds it to the shared NER prompt and runs the
# chain over the parsed paper, mirroring the other model cells.
# NOTE(review): this cell has no execution count, and no save step
# (process_and_save_output_multiple) is visible for it in this view.
llm_qwen_32b = ChatOpenRouter(model_name='qwen/qwen3-32b')
ner_chain_qwen = create_chain(llm_qwen_32b, ner_prompt)
lin_output_qwen = run_chain_on_small_chunks(lin_text, ner_chain_qwen)