LLM-based extraction

In [None]:
import os
import re
import json
from docling.document_converter import DocumentConverter
import time
import sys
import openai

# Where the protocol PDFs are read from, and the JSONL file the results go to.
folder = "data/included_protocols_after_place_revision"
output_path = "llm_extraction.jsonl"

# Endpoint the OpenAI client is pointed at, and the model identifiers sent to it.
# EMBED_MODEL is not referenced by the code visible in this cell.
BASE_URL = "https://api.marketplace.novo-genai.com/v1"
CHAT_MODEL = "anthropic_claude_3_7_sonnet_v1_0"
EMBED_MODEL = "openai_text_embedding_3_small"

# Credential taken from the API_KEY environment variable (None when unset).
OPENAI_KEY = os.environ.get("API_KEY")

# Set the module-level key as well as building an explicit client; the
# extraction loop only talks to `openai_client`.
openai.api_key = OPENAI_KEY
openai_client = openai.OpenAI(base_url=BASE_URL, api_key=OPENAI_KEY)

# Wall-clock start, used for the "Total time" line printed at the end of the run.
start_time = time.time()

# Extraction targets: each entry pairs a section title with the instruction
# that is placed in front of the protocol text when the model is queried.

# Wording shared by the sections that are copied verbatim from a single place
# in the protocol; only the section name differs.
_VERBATIM_PROMPT = (
    "Extract and print the full {title} section (not a summary) from the context below. "
    "Use only the information from that section of the protocol, nothing else. "
    "Do not output any other information, just the text."
)

# The laboratory section gets its own instruction because its content may be
# split between the protocol body and an appendix.
_LAB_PROMPT_SENTENCES = (
    "Extract and print the full 'Clinical safety laboratory assessments (or tests)' section from the protocol text below.",
    "The detailed information and tables may appear either in the main body of the protocol (usually in section 8 or 9) or in an appendix (for example, Appendix 2).",
    "If the main section refers to an appendix for details, make sure to find and include the detailed appendix content as well.",
    "If the full section is split between the main section and an appendix, combine all relevant information, tables, and lists into a single, coherent section.",
    "Do not summarize or skip anything, extract the section as it appears in the protocol. Do not output any other information, just the text.",
)

sections = [
    (
        "Objectives and endpoints",
        # The trailing space is part of this prompt's text.
        _VERBATIM_PROMPT.format(title="Objectives and endpoints") + " ",
    ),
    (
        "Statistical considerations (or analysis)",
        _VERBATIM_PROMPT.format(title="Statistical considerations"),
    ),
    (
        "Adverse events (AEs) requiring additional data collection",
        _VERBATIM_PROMPT.format(
            title="Adverse events (AEs) requiring additional data collection"
        ),
    ),
    (
        "Clinical safety laboratory assessments (or tests)",
        # Sentences are separated by one space and the prompt ends with one.
        " ".join(_LAB_PROMPT_SENTENCES) + " ",
    ),
]

# Regex locating the numbered markdown heading of the "Objectives and
# endpoints" section (or a variant naming estimands / research question);
# everything well before the first hit is treated as front matter and dropped.
#
# The whole heading must sit on ONE line.  `\s` also matches "\n", so a
# pattern built on `\s` can start at an earlier numbered heading and run on
# into the following lines until it meets a keyword, anchoring the cut at the
# wrong heading.  Whitespace is therefore limited to "anything but newline".
keywords = r"objective|endpoint|estimand|research question"
_hspace = r"[^\S\n]"                    # whitespace other than a line break
_title_char = r"(?:[\w,()]|[^\S\n])"    # characters allowed in the heading text
pattern = re.compile(
    rf"^#+{_hspace}*\d+{_hspace}*{_title_char}*?({keywords}){_title_char}*?$",
    re.IGNORECASE | re.MULTILINE,
)

# ---------------------------------------------------------------------------
# Main extraction run.
#
# For every PDF in `folder`:
#   1. convert it to markdown with Docling,
#   2. drop the front matter that precedes the objectives/endpoints heading,
#   3. ask the chat model for each entry in `sections`,
#   4. append one JSON line {"filename", "text"} to `output_path`.
# A PDF that cannot be converted is skipped; a failed LLM call leaves that
# section's text empty so the remaining sections are still written.
# ---------------------------------------------------------------------------

# Heading written above each extracted section, chosen by the first keyword
# (checked in this order) contained in the lower-cased section title.
_HEADER_BY_KEYWORD = (
    ("clinical safety", "Clinical laboratory section"),
    ("objectives", "Endpoints Section"),
    ("statistical", "Statistical considerations section"),
    ("adverse events", "Adverse events section"),
)


def _section_header(section):
    """Return the output heading for the section title *section*.

    Falls back to the text before the first "(" (stripped and capitalized)
    followed by " Section" when no known keyword occurs in the title.
    """
    lowered = section.lower()
    for keyword, header in _HEADER_BY_KEYWORD:
        if keyword in lowered:
            return header
    return section.split("(")[0].strip().capitalize() + " Section"


# Reuse a single converter for all files; creating one per PDF repeats its
# set-up work for every document.
converter = DocumentConverter()

# os.listdir returns names in arbitrary order; sorting makes the order of the
# records in the output file reproducible between runs.
pdf_names = sorted(
    name for name in os.listdir(folder) if name.lower().endswith(".pdf")
)

processed_count = 0
with open(output_path, "w", encoding="utf-8") as fout:
    for filename in pdf_names:
        file_path = os.path.join(folder, filename)
        print(f"Processing: {filename}")

        try:
            md = converter.convert(file_path).document.export_to_markdown()
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole batch.
            print(f"Could not process {filename}: {e}")
            continue

        # Remove front matter before objectives/endpoints/estimands
        matches = list(pattern.finditer(md))
        print("Matches found:")
        for match in matches:
            print(match.group())
        if matches:
            # Keep 300 characters ahead of the first heading as context.
            start_pos = max(0, matches[0].start() - 300)
            md = md[start_pos:]

        section_texts = []
        for section, prompt_prefix in sections:
            prompt = f"{prompt_prefix}\n\nProtocol text:\n\n{md}"

            try:
                res = openai_client.chat.completions.create(
                    model=CHAT_MODEL,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )
                result_text = res.choices[0].message.content.strip()
            except Exception as e:
                # Best effort: record an empty section and carry on.
                print(f"Failed LLM call for {section} in {filename}: {e}")
                result_text = ""

            section_texts.append(
                f"### {_section_header(section)}\n\n{result_text}"
            )

        json_entry = {
            "filename": filename,
            "text": "\n\n".join(section_texts),
        }
        fout.write(json.dumps(json_entry, ensure_ascii=False) + "\n")
        # A full run takes hours; flush each record so an interrupted run
        # keeps everything completed so far.
        fout.flush()
        processed_count += 1

print("Done! Wrote:", output_path)
print(f"Processed {processed_count} files.")
print(f"Total time: {time.time() - start_time:.2f} seconds")


  from .autonotebook import tqdm as notebook_tqdm
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Processing: 4300-protocol-version-4.pdf


Downloading recognition model, please wait. This may take several minutes depending upon your network connection.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Matches found:
## 4 Objectives and endpoints
Processing: 4303-protocol-version-3.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4309-protocol-version-3.0.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4316-protocol-version-4 - final.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4338-protocol-version-4.0.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4373-protocol-version-4.0.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4378-protocol-version-3.0.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4379-protocol-version-4.0.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4386-protocol-version-1.0.pdf




Matches found:
## 4 Objectives and endpoints 415
Processing: 4451-protocol-version-1.0.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4462-protocol-version-6.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4486-protocol-version-7.0.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4492-protocol-version-3.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4518-protocol-version-4.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4601-protocol  version 4.0.pdf




Matches found:
## 4 Objectives and endpoints
Processing: 4669-protocol-version-7.0.pdf




Matches found:
## 3 Objectives and endpoints
Processing: 4748-protocolv3.pdf




Matches found:
## 3 Objectives and endpoints
Processing: 4774-protocol-v3.0.pdf




Matches found:
## 3 Objectives and endpoints
Processing: 4885-protocol-version-4.pdf




Matches found:
## 3 Objectives and endpoints
Processing: 4921-protocol-version-3.0.pdf




Matches found:
## 3 Objectives and endpoints
Processing: 4924-protocol-v1.0.pdf




Matches found:
## 3 Objectives and endpoints
Processing: 7611 protocol v1.0_22FEB2024.pdf




Matches found:
## 3 Objectives and endpoints
Processing: 7663-protocol-version-1.0.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn1218-4357.pdf




Matches found:
## 4 Objectives and endpoints
Processing: nn1436-4479.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn1436-4480.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn1436-4570.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn1436-4571.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn1436-4572.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn1436-4909.pdf




Matches found:
## 3 Objectives, endpoints and estimand
Processing: nn1436-7724.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn1471-4612.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn1471-4752.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn1535-4591.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn1535-4592.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn1535-4710.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn1535-4988.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn6018-4889.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn6018-4951.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn6019-4940.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn6022-7683.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn6435-4697.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn6435-4749.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn6435-4826.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn6535-7519.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn6537-7650.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn6582-4838.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7088-4595.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7533-4470.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7533-7587.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7535-7702.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7535-7703.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7535-7704.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn7535-7807.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn7535-7976.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7614-7656.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7769-4516.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7769-4532.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn7769-4992.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn7999-4670.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn8022-4179.pdf




Matches found:
## 4 Objectives and endpoints
Processing: nn8640-4469.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9388-4895.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9388-7637.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9388-7700.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9389-4606.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: NN9389-4679-protocol-3.0.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9389-4680.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9389-4681.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9389-4682.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9487-5022.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9487-7573.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9487-7612.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9487-7980.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9490-7678.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9500-4620.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9500-4621.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9500-4796.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9500-4932.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9501-4869.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9501-5006.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9515-7675.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9535-4430.pdf




Matches found:
## 4 Objectives and endpoints
Processing: nn9535-4533.pdf




Matches found:
## 3 Objectives and endpoints
Processing: NN9535-4801 protocol v2.0.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9535-4820.pdf




Matches found:
## 3 Objectives, endpoints and estimand
Processing: nn9535-7560.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9536-4576.pdf




Matches found:
## 4 Objectives and endpoints
Processing: nn9536-4578.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9536-4707.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9536-4741.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9536-4999.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9536-7545.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9541-4922.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9541-4923.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9541-4945.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9541-5015.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9650-5027.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9662-7694sad.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9775-4708b.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9838-4615.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9838-4672.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9838-4695.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9838-4862.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9838-7832.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9838-8259.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9904-4825sad.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9924-4556.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9924-4891.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9924-4977.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9932-4737.pdf




Matches found:
## 3 Objectives and endpoints
Processing: nn9932-4861.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: nn9932-4873.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: Protocol  4468-trial-protocol-final-version-1.0.pdf




Matches found:
## 3 Objectives and endpoints
Processing: Protocol  4569 protocol vers 4.0.pdf




Matches found:
## 3 Objectives and endpoints
Processing: Protocol  4663-protocol-version-3.0.pdf




Matches found:
## 3 Objectives and endpoints
Processing: Protocol  4738 protocol v2.0.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: Protocol  NN8022-4392 Trial protocol ver.4.0.pdf




Matches found:
## 5 Objectives, endpoints and estimands
Processing: Protocol  NN9838-4609 protocol version 7.0.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: Protocol  REAL4 protocol v10.pdf




Matches found:
## 4 Objectives and endpoints
Processing: Protocol Amendment  NN9838-4608 protocol v5.0.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: Protocol Amendment  NN9838-4762 protocol v2.0.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: protocol-4706-version-1.0.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: Trial 4662 protocol_v3.0.pdf
Matches found:
## 3 Objectives and endpoints
Processing: _ Protocol  4910 protocol.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: _ Protocol  4954 protocol v.1.0.pdf




Matches found:
## 3 Objectives, endpoints and estimands
Processing: _ Protocol Amendment  protocol version 2.0 (1).pdf




Matches found:
## 3 Objectives and endpoints
Done! Wrote: llm_extraction.jsonl
Processed 127 files.
Total time: 41540.51 seconds
