Rule-based extraction



In [None]:
import pdfplumber
import pandas as pd
from pdfplumber.utils import extract_text, get_bbox_overlap, obj_to_bbox
import re
import os
import json
import pdb

def regex_ignore_whitespace(pattern):
    r"""Build a regex source string that matches ``pattern`` with flexible spacing.

    The pattern is split on whitespace, each resulting word is passed through
    ``re.escape`` (so words are matched literally, not as regex syntax), and
    the words are rejoined with ``\s*`` so that any amount of whitespace,
    including none, may separate them.
    """
    escaped_words = [re.escape(word) for word in pattern.split()]
    return r"\s*".join(escaped_words)

# Header removal
def remove_header_chars(page, header_height=100):
    """Return the characters of ``page`` lying strictly below the header band.

    A character is kept when its ``top`` value is greater than
    ``header_height``; characters with ``top <= header_height`` are dropped.
    A new list is returned.
    """
    def below_header(glyph):
        return glyph['top'] > header_height

    return [glyph for glyph in page.chars if below_header(glyph)]

def _find_residual_header_end(chars, line_threshold=7):
    """Return the index of the last character of a residual "Page x of y" header.

    Only the first run of characters containing "P", "a", "g", "e" is
    examined. From there the next "o", "f" pair is located, and the header is
    taken to end just before the first following character that either sits
    more than ``line_threshold`` below the "Page" line or is neither numeric
    nor whitespace. Returns ``None`` when no such header is recognised.
    """
    for k in range(len(chars) - 3):
        spells_page = (
            "P" in chars[k]["text"] and "a" in chars[k + 1]["text"]
            and "g" in chars[k + 2]["text"] and "e" in chars[k + 3]["text"]
        )
        if not spells_page:
            continue
        page_line_top = chars[k]["top"]
        for i in range(k, len(chars) - 1):
            if not ("o" in chars[i]["text"] and "f" in chars[i + 1]["text"]):
                continue
            for j in range(i + 2, len(chars)):
                on_new_line = chars[j]["top"] - page_line_top > line_threshold
                text = chars[j]["text"]
                if on_new_line or not (text.isnumeric() or text.isspace()):
                    return j - 1
        # Only the first "Page" occurrence is considered.
        return None
    return None


# This function returns the cleaned page text (with header removed)
def get_cleaned_page_text(page):
    """Return the layout-preserving text of ``page`` with header and tables cleaned.

    Steps:
      1. Drop the characters in the top 100 units via ``remove_header_chars``.
      2. If a residual "Page x of y" marker is recognised, delete every
         character up to and including the end of that marker.
      3. For each table found on the page, remove the characters overlapping
         the table's bounding box and add one synthetic character (a copy of
         the table's first character) whose text is the table rendered as
         markdown.
      4. Render the remaining characters with ``extract_text`` and normalise
         the private-use bullet glyph U+F0B7 to "•".
    """
    chars = remove_header_chars(page, header_height=100)

    # Remove residual header info (e.g. "Page xx of ...")
    end_header = _find_residual_header_end(chars)
    if end_header is not None:
        del chars[: end_header + 1]

    # Process tables: filter out table regions and append table data as markdown.
    # The filtering is applied to the running ``chars`` list (not to a fresh
    # copy of the page), so the header stripping above and the placeholders of
    # earlier tables are preserved when a page holds one or more tables.
    table_chars = []
    for table in page.find_tables():
        cropped = page.crop(table.bbox)
        # Check if there are any characters in the cropped area
        if not cropped.chars:
            continue  # Skip this table if no characters found
        outside_table = [
            char for char in chars
            if get_bbox_overlap(obj_to_bbox(char), table.bbox) is None
        ]
        if len(outside_table) == len(chars):
            # None of the table's characters survived the header stripping,
            # so the table lies in the removed region; do not re-add it.
            continue
        chars = outside_table
        markdown = pd.DataFrame(table.extract()).to_markdown(index=False)
        table_char = cropped.chars[0].copy()
        table_char["text"] = markdown
        table_chars.append(table_char)
    # Placeholders are added after all filtering so that a later table's
    # bounding box cannot remove an earlier table's placeholder.
    chars.extend(table_chars)

    page_text = extract_text(chars, layout=True, y_tolerance=6)
    # NOTE(review): the original chained a second replace whose search string
    # is empty as seen in this source (possibly a private-use glyph lost in
    # export). str.replace("", x) inserts x between every character, so it is
    # omitted here; if another glyph was intended, add it with a \u escape.
    return page_text.replace("\uf0b7", "•")



# Section detection patterns
# Every entry is turned into a regex by regex_ignore_whitespace(), which
# escapes each word: entries are matched literally (case-insensitively, with
# flexible whitespace) and regex syntax inside an entry has no effect.
# Endpoints section
endpoints_sections = [
    "2 study objectives",
    "2. study objectives",
    "3 study endpoints",
    "3 objectives and endpoints",
    "3 objectives, endpoints and estimands",
    "3 objectives, endpoints and estimand",
    "3 objectives, endpoints",
    "3. study endpoints",
    "3. objectives and endpoints",
    "3. objectives, endpoints and estimands",
    "3. objectives, endpoints and estimand",
    "3. objectives, endpoints",
    "4 objectives and endpoints",
    "4 objective(s) and endpoint(s)",
    "4. objectives and endpoints",
    "4. objective(s) and endpoint(s)",
    "4.1 objective(s)",
    "4 objective (s) and endpoint (s)",
    # A former "4 objective.*?endpoint" entry was dropped: after escaping it
    # could only match the literal characters ".*?", i.e. never in practice.
    "5 objectives, endpoints and estimands",
    "5. objectives, endpoints and estimands",
    "8 research question and objectives",
    "8. research question and objectives",
]

# Headings that mark the end of the endpoints section
post_endpoints_sections = [
    "3 study description",
    "3. study description",
    "4 study design",
    "4 trial design",
    "4. study design",
    "4. trial design",
    "5 trial design",
    "5. trial design",
    "6 trial design",
    "6. trial design",
    "9.2 setting",
    "9.2. setting",
]

# Statistics section
# Headings that open the statistics section. Most headings are listed twice,
# once as "N title" and once as "N. title".
stats_sections = [
    "6 statistical methods",
    "6. statistical methods",
    "9 statistics",
    "9 statistical considerations",
    "9.7.2 statistical methods",
    "9.7.2. statistical methods",
    "9. statistics",
    "9. statistical considerations",
] + [
    f"{number}{dot} statistical considerations"
    for number in (10, 11, 16, 17)
    for dot in ("", ".")
]

# Headings that follow the statistics section and therefore close it.
post_stats_sections = [
    "7 adverse event collection",
    "7 adverse event (ae) reporting",
    "7. adverse event collection",
    "7. adverse event (ae) reporting",
    "9.8 quality control",
    "10 supporting documentation and operational considerations",
    "10 data management and record keeping",
    "10. supporting documentation and operational considerations",
    "10. data management and record keeping",
    "11 references",
    "11 appendices",
    "11. references",
    "11. appendices",
] + [
    f"{number}{dot} {title}"
    for number, title in ((12, "appendices"), (17, "ethics"), (18, "ethics"))
    for dot in ("", ".")
]

# Clinical laboratory section
# Headings that open the clinical laboratory section.
clinical_lab_sections = [
    "10.2 Appendix 2: Clinical laboratory tests",
    "Appendix 2 Clinical laboratory tests",
    "Appendix 2: Clinical laboratory tests",
    "Appendix 2 Clinical laboratory assessments",
    "Appendix 2: Clinical laboratory assessments",
    "8.5.17 Laboratory assessments",
    "8.5 Laboratory assessments",
    "8.3.11 Laboratory assessments for safety",
    "8.4.6 Laboratory assessments for safety",
    "8.5.7 Blood samples for safety assessments",
]

# Fallback markers for documents where the word "Appendix" is not readable in
# the extracted text. They are only consulted once the statistics section has
# been found.
clinical_lab_special_sections = [
    "Clinical laboratory tests",
]


# Headings that close the clinical laboratory section (each listed once).
post_clinical_lab_sections = [
    "8.2.7 Pregnancy testing",
    "10.3 Appendix 3: Adverse Events and Serious Adverse Events: Definitions and procedures for recording, evaluating, follow-up, and reporting",
    "9.6 Pharmacokinetics",
    "Appendix 3 Trial governance considerations",
    "8.2.6 Pregnancy testing",
    "10.3 Appendix 3: Adverse events: Definitions and procedures for recording, evaluation, follow-up, and reporting",
    "8.5.18 Body weight and height",
    "8.9 Subject compliance",
    "8.3.12 Pregnancy testing",
    "8.6 Other assessments",
    "8.4.7 Pregnancy testing",
    "9.4.6 Immunogenicity assessments",
    "9.5 Pharmacokinetics",
    "9.4.4 Eye examination",
    "9.4.8 Immunogenicity assessments",
    "9.6.9 Immunogenicity assessments",
    "8.2.5 Self-measured plasma glucose",
    "9.4.6 Injection site reactions",
    "9.4.5 Immunogenicity assessments",
    "8.3 Adverse events and serious adverse events",
    "8.3 Adverse events and other safety reporting",
    "8.2.5 Pregnancy testing",
    "8.2.5 Injection site reactions",
    "8.5.8 Anti-liraglutide antibodies",
    "Trial governance considerations",
    "Appendix 3: Adverse Events and Serious Adverse Events: Definitions and procedures for recording, evaluating, follow-up, and reporting",
    "Appendix 3: Adverse events: Definitions and procedures for recording, evaluation, follow-up, and reporting",
]

# Adverse events section
# Headings / table captions that open the adverse-events section.
ae_sections = [
    "10.3.3 Description of AEs requiring additional data collection and other events requiring collection of additional information",
    "Table 8-1 AEs requiring additional data collection (serious and non-serious AEs)",
    "Description of AEs requiring additional data collection (via specific event form)",
    "10.3.3 Description of AEs requiring additional data collection",
    "12.1.5 Adverse events requiring additional data collection",
    "8.4.1.2 Adverse events requiring additional data collection",
    "8.4.1.1 Adverse events requiring additional data collection",
    "Table 9-1 AEs requiring additional data collection",
    "Table 9-1 AEs requiring additional data collection (via specific event form) and events for adjudication",
    "Table 9-1 AEs requiring additional data collection (via specific event form)",
    "Table 8-1 AEs requiring additional data collection",
    "Table 8-2 AEs requiring additional data collection and other events requiring collection of additional information",
    "Table 8-1 AEs requiring additional data collection and other events requiring additional data collection",
    "Table 8-1 AEs requiring additional data collection, events for adjudication and other events requiring collection of additional information",
    "Table 8-1 AEs requiring additional data collection (serious and non-serious AEs) and AESIs",
    "Table 8-3 AEs requiring additional data collection (serious and non-serious AEs) and AESIs",
    "Table 8-2 AEs requiring additional data collection (serious and non-serious AEs) and events for adjudication",
    "Table 6 AEs requiring additional data collection",
    "Table 8-1 AEs requiring additional data collection and other events requiring collection of additional information",
    "Table 8-1 Adverse events requiring additional data collection",
    "Table 8-2 AEs requiring additional data collection, events for adjudication, AESIs and other events requiring collection of additional information",
    "Table 8-1 AEs requiring additional data collection and AESIs",
    "8.6.3 Adverse events requiring specific event forms in the Ecrf",
    "Table 8-4 AEs requiring additional data collection",
    # An entry wrapped in literal double quotes used to sit here; it could
    # only match text containing those quote characters and, unquoted, is the
    # same as the "Table 8-2 ... collection of additional information" entry
    # above, so it was removed.
    "Table 8-3 AEs requiring additional data collection and other events requiring collection of additional information",
    "Table 9-1 AEs requiring additional data collection (via specific event forms)",
    "Table 8-1 Events requiring additional data collection",
    "Table 8-2 AEs requiring additional data collection, events for adjudication, and other events requiring collection of additional information",
    "Table 10-2 AEs requiring additional data collection (via specific event form)",
    "Table 8-2 AEs requiring additional data collection and events for adjudication",
    "Table 4 AEs requiring additional data collection (via specific event form)",
    "Table 8-1 AEs requiring additional data collection and events for adjudication in main phase",
]

# Headings that close the adverse-events section (each listed once).
post_ae_sections = [
    "10.3.4 Recording and follow-up of AE and/or SAE",
    "8.3.1 Time period and frequency for collecting AE information",
    "AE and SAE recording",
    "8.3.1 Time period and frequency for collecting AE and SAE information",
    "12.1.6 Technical complaints",
    "8.4.2 Hypoglycaemic episodes",
    "8.4.2 Physical examination",
    "9.2.1.1 Events for adjudication",
    "9.2.1.1 Event for adjudication",
    "9.2.2 Method of detecting AEs and SAEs",
    "9.4.2 Method of detecting AEs and SAEs",
    "8.4.1 Time period and frequency for collecting AE and SAE information",
    "8.5.1 Time period and frequency for collecting AE and SAE information",
    "8.4.1 Time period and frequency for collecting AE information",
    "8.3.1 Time period and frequency for collecting adverse event information",
    "8.7 Other assessments",
    "8.5.1 Time period and frequency for collecting AE information",
    "9.2.1 Time period and frequency for collecting AE and SAE information",
    "10.2.2 Method of detecting AEs and SAEs",
    "8.4.1.3 Assessments in case of suspicion of acute pancreatitis",
]

def process_pdf_sections(pdf_path):
    """Extract the endpoints, statistics, clinical-lab and adverse-event sections of a PDF.

    Pages are scanned in order. Every page up to and including the first page
    that contains one of ``post_stats_sections`` is skipped (used as a proxy
    for the table of contents). After that, a section starts at the first
    start marker found on a page and runs until one of its end markers is
    found; the text in between is collected, preceded by a ``### ...`` heading
    per section start.

    Start markers are tried in this order: endpoints, statistics, clinical lab
    and adverse events (both only once the endpoints section has been found),
    then the clinical-lab fallback markers (only once the statistics section
    has been found). At most one section start is detected per page.

    Args:
        pdf_path: Path of the PDF file to open with ``pdfplumber``.

    Returns:
        A tuple ``(extracted, found_sections)``: ``extracted`` is the collected
        text joined with blank lines, or ``"Not Found"`` when nothing was
        collected; ``found_sections`` maps ``"endpoints"``, ``"statistics"``,
        ``"clinical_lab"`` and ``"adverse_events"`` to whether a start marker
        for that section was seen.
    """
    def compile_markers(patterns):
        # Compile once per call instead of rebuilding every regex on every page.
        return [
            re.compile(regex_ignore_whitespace(pattern), re.IGNORECASE)
            for pattern in patterns
        ]

    toc_markers = compile_markers(post_stats_sections)

    # (section key, heading emitted on start, start markers,
    #  section that must already have been found or None)
    start_rules = [
        ("endpoints", "### Endpoints Section\n",
         compile_markers(endpoints_sections), None),
        ("statistics", "### Statistical Considerations Section\n",
         compile_markers(stats_sections), None),
        ("clinical_lab", "### Clinical Laboratory Section\n",
         compile_markers(clinical_lab_sections), "endpoints"),
        ("adverse_events", "### Adverse Events Section\n",
         compile_markers(ae_sections), "endpoints"),
        ("clinical_lab", "### Clinical Laboratory Section\n",
         compile_markers(clinical_lab_special_sections), "statistics"),
    ]

    endpoints_or_stats_end = compile_markers(post_endpoints_sections + post_stats_sections)
    end_markers = {
        "endpoints": endpoints_or_stats_end,
        "statistics": endpoints_or_stats_end,
        "clinical_lab": compile_markers(post_clinical_lab_sections),
        "adverse_events": compile_markers(post_ae_sections),
    }

    collected_text = []  # store extracted text
    collecting_section = None  # can be: endpoints, statistics, clinical_lab, adverse_events
    toc_skipped = False  # flag to skip TOC pages

    # Dictionary to track if sections were found
    found_sections = {"endpoints": False, "statistics": False, "clinical_lab": False, "adverse_events": False}

    # The context manager closes the PDF even if a page fails to parse.
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Markers are searched case-insensitively in the cleaned text
            # itself, so match offsets can be used to slice it directly.
            cleaned_text = get_cleaned_page_text(page)

            # Skip TOC pages until a marker is found (using post_stats_sections
            # as proxy); the page that contains the marker is skipped as well.
            if not toc_skipped:
                toc_skipped = any(marker.search(cleaned_text) for marker in toc_markers)
                continue

            start = 0            # offset at which this page's contribution begins
            search_end_from = 0  # end markers are only looked for from here on

            # If not already in a section, check for a section start marker in order.
            if collecting_section is None:
                for section, heading, markers, prerequisite in start_rules:
                    if prerequisite is not None and not found_sections[prerequisite]:
                        continue
                    # First marker (in list order) that occurs on the page.
                    hit = next(
                        filter(None, (marker.search(cleaned_text) for marker in markers)),
                        None,
                    )
                    if hit is None:
                        continue
                    collecting_section = section
                    found_sections[section] = True
                    collected_text.append(heading)
                    start = hit.start()
                    # Ignore end markers that precede the section heading.
                    search_end_from = hit.end()
                    break
                if collecting_section is None:
                    continue  # nothing to collect on this page

            # Inside a section: cut at the earliest end marker, if any.
            end_positions = [
                hit.start()
                for hit in (
                    marker.search(cleaned_text, search_end_from)
                    for marker in end_markers[collecting_section]
                )
                if hit
            ]
            if end_positions:
                page_to_append = cleaned_text[start:min(end_positions)]
                collecting_section = None
            else:
                # No end marker found: keep the page from the start offset on.
                page_to_append = cleaned_text[start:]

            collected_text.append(page_to_append.strip())

    extracted = "\n\n".join(collected_text) if collected_text else "Not Found"
    return extracted, found_sections

# iterate through all files and write JSONL
folder = os.path.join("data", "included_protocols_after_place_revision")
output_jsonl = "fine_tuning_data.jsonl"

# List the folder before opening the output so that a missing folder does not
# truncate an existing output file. os.listdir() returns entries in arbitrary
# order, so the names are sorted to make the JSONL record order reproducible.
pdf_filenames = sorted(
    name for name in os.listdir(folder) if name.lower().endswith(".pdf")
)

with open(output_jsonl, "w", encoding="utf-8") as out_f:
    for filename in pdf_filenames:
        print("Processing:", filename)
        pdf_path = os.path.join(folder, filename)
        extracted_text, found_sections = process_pdf_sections(pdf_path)

        # Check if any required section is not found
        missing = [section for section, found in found_sections.items() if not found]
        if missing:
            print("  Missing sections:", ", ".join(missing))
        # One JSON object per line: source file name and its extracted text.
        record = {"filename": filename, "text": extracted_text}
        out_f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"JSONL file created: {output_jsonl}")


Processing: 4300-protocol-version-4.pdf
Processing: 4303-protocol-version-3.pdf
Processing: 4309-protocol-version-3.0.pdf
Processing: 4316-protocol-version-4 - final.pdf
Processing: 4338-protocol-version-4.0.pdf
Processing: 4373-protocol-version-4.0.pdf
Processing: 4378-protocol-version-3.0.pdf
Processing: 4379-protocol-version-4.0.pdf
Processing: 4386-protocol-version-1.0.pdf
Processing: 4451-protocol-version-1.0.pdf
Processing: 4462-protocol-version-6.pdf
Processing: 4486-protocol-version-7.0.pdf
Processing: 4492-protocol-version-3.pdf
Processing: 4518-protocol-version-4.pdf
Processing: 4601-protocol  version 4.0.pdf
Processing: 4669-protocol-version-7.0.pdf
Processing: 4748-protocolv3.pdf
Processing: 4774-protocol-v3.0.pdf
Processing: 4885-protocol-version-4.pdf
Processing: 4921-protocol-version-3.0.pdf
Processing: 4924-protocol-v1.0.pdf
Processing: 7611 protocol v1.0_22FEB2024.pdf
Processing: 7663-protocol-version-1.0.pdf
Processing: nn1218-4357.pdf
Processing: nn1436-4479.pdf
Proc