In [None]:
import os
import re
from pathlib import Path
import pandas as pd

input_dir = Path(r"C:\Users\FahRe\Desktop\Apotheker-Agents\texts")
output_dir = Path(r"C:\Users\FahRe\Desktop\Apotheker-Agents\texts_cleaned")
output_dir.mkdir(exist_ok=True)

metadata_path = Path(r"C:\Users\FahRe\Downloads\Medication Guides.csv")
df_metadata = pd.read_csv(metadata_path)
df_metadata.set_index("Appl. No.", inplace=True)

SECTION_HEADERS = [
    "HIGHLIGHTS OF PRESCRIBING INFORMATION",
    "RECENT MAJOR CHANGES",
    "INDICATIONS AND USAGE",
    "DOSAGE AND ADMINISTRATION",
    "DOSAGE FORMS AND STRENGTHS",
    "CONTRAINDICATIONS",
    "WARNINGS AND PRECAUTIONS",
    "ADVERSE REACTIONS",
    "DRUG INTERACTIONS",
    "USE IN SPECIFIC POPULATIONS",
    "DRUG ABUSE AND DEPENDENCE",
    "OVERDOSAGE",
    "DESCRIPTION",
    "CLINICAL PHARMACOLOGY",
    "NONCLINICAL TOXICOLOGY",
    "HOW SUPPLIED/STORAGE AND HANDLING",
    "PATIENT COUNSELING INFORMATION",
]

FUZZY_REPLACEMENTS = {
    r"\bMAJ OR\b": "MAJOR",
    r"\bCOUN SELING\b": "COUNSELING",
    r"\bCONTRA INDICATIONS\b": "CONTRAINDICATIONS",
    r"\bNONCLIN ICAL\b": "NONCLINICAL",
    r"\bSUP PLIED\b": "SUPPLIED",
    r"\bADMIN IST RATION\b": "ADMINISTRATION",
    r"\bINFOR MATION\b": "INFORMATION",
    r"\bUSE IN SPECIFIC POPUL ATIONS\b": "USE IN SPECIFIC POPULATIONS",
}
section_patterns = [
    re.compile(rf"(?:^|\n)[#\-\s]*{re.escape(header)}[#\-\s]*\n?", re.IGNORECASE)
    for header in SECTION_HEADERS
]

def clean_and_tag_text(text: str) -> str:
    text = re.sub(r"Reference ID: \d+", "", text)
    text = re.sub(r"This label may not be the latest.*?drugsatfda", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"(\S)\n(\S)", r"\1 \2", text) 
    text = re.sub(r"\n{3,}", " ", text)  
    text = re.sub(r"[ \t]{3,}", " ", text)  #
    for broken, fixed in FUZZY_REPLACEMENTS.items():
        text = re.sub(broken, fixed, text, flags=re.IGNORECASE)
    for pattern, header in zip(section_patterns, SECTION_HEADERS):
        text = pattern.sub(f"\n\n### {header.upper()} ###\n\n", text)
    return text.strip()

results = []
for txt_file in input_dir.glob("*.txt"):
    appl_no = txt_file.stem
    try:
        with open(txt_file, "r", encoding="utf-8") as f:
            raw_text = f.read()

        if not raw_text.strip() or raw_text.startswith("[Error"):
            continue

        cleaned_text = clean_and_tag_text(raw_text)
        output_file = output_dir / f"{appl_no}.txt"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        if int(appl_no) in df_metadata.index:
            row = df_metadata.loc[int(appl_no)]
            results.append({
                "application_no": appl_no,
                "cleaned_path": str(output_file),
                "drug_name": row["Drug Name"],
                "active_ingredient": row["Active Ingredient"],
                "form": row["Form; Route"],
                "company": row["Company"],
                "date": row["Date"],
                "url": row["URL"]
            })

    except Exception as e:
        print(f"Error processing {txt_file.name}: {e}")

df_results = pd.DataFrame(results)



In [29]:
df_results

Unnamed: 0,application_no,cleaned_path,drug_name,active_ingredient,form,company,date,url
0,10187,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,RITALIN,METHYLPHENIDATE HYDROCHLORIDE,TABLET;ORAL,NOVARTIS,02/04/2025,https://www.accessdata.fda.gov/drugsatfda_docs...
1,103000,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,Appl. No. 103000 BOTOX 103000 B...,Appl. No. 103000 OnabotulinumtoxinA 103000 ...,Appl. No. 103000 VIAL; SINGLE-USE 103000 ...,Appl. No. 103000 ALLERGAN 103000 ALLERGA...,Appl. No. 103000 08/10/2023 103000 08/10...,Appl. No. 103000 https://www.accessdata.fda...
2,103132,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,INTRON A,INTERFERON ALFA-2B,VIAL,SCHERING,11/26/2021,https://www.accessdata.fda.gov/drugsatfda_docs...
3,103234,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,Appl. No. 103234 EPOGEN 103234 PROCRIT ...,Appl. No. 103234 EPOETIN ALFA 103234 EPO...,Appl. No. 103234 VIAL; SINGLE-USE 103234 ...,Appl. No. 103234 AMGEN 103234 AMGEN Name...,Appl. No. 103234 04/30/2024 103234 04/30...,Appl. No. 103234 https://www.accessdata.fda...
4,103471,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,BETASERON,INTERFERON BETA-1B,VIAL; SUBCUTANEOUS,BAYER HEALTHCARE PHARMS,07/28/2023,https://www.accessdata.fda.gov/drugsatfda_docs...
...,...,...,...,...,...,...,...,...
865,87313,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,LINDANE,LINDANE,LOTION;TOPICAL,OLTA PHARMS,09/04/2001,https://www.accessdata.fda.gov/drugsatfda_docs...
866,8762,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,Appl. No. 8762 DILANTIN-125 8762 DILANT...,Appl. No. 8762 PHENYTOIN 8762 PHENYTOIN ...,Appl. No. 8762 SUSPENSION;ORAL 8762 SUSP...,Appl. No. 8762 UPJOHN 8762 UPJOHN Name: ...,Appl. No. 8762 03/03/2022 8762 03/03/202...,Appl. No. 8762 https://www.accessdata.fda.g...
867,87846,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,Appl. No. 87846 IMIPRAMINE HYDROCHLORIDE 87...,Appl. No. 87846 IMIPRAMINE HYDROCHLORIDE 87...,Appl. No. 87846 TABLET;ORAL 87846 TABLET...,Appl. No. 87846 SPECGX LLC 87846 SPECGX ...,Appl. No. 87846 06/29/2017 87846 06/29/2...,Appl. No. 87846 https://www.accessdata.fda....
868,9170,C:\Users\FahRe\Desktop\Apotheker-Agents\texts_...,MYSOLINE,PRIMIDONE,TABLET;ORAL,VALEANT,06/10/2020,https://www.accessdata.fda.gov/drugsatfda_docs...
