# FDA Text Cleaning Notebook

This notebook loads FDA regulation text files for Drugs, Medical Devices, and Biologics, performs cleaning, and saves outputs for analysis.

Run the cells top-to-bottom. If you need dependencies, use the next cell to install them in your environment.


## Import Libraries

In [17]:
from pathlib import Path
import os
import glob
import pandas as pd
import sys, subprocess

import re
import unicodedata
from typing import List
import pandas as pd
from tqdm.auto import tqdm

subprocess.check_call([sys.executable, "-m", "pip", "install", "tqdm"])
from tqdm.auto import tqdm
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow"])









0

In [10]:
# Setup and paths
BASE_DIR = Path("/Users/Kay Michnicki/AllCode/FDA Data Scraping")
TEXT_ROOT = BASE_DIR / "fda_output"
CATEGORIES = ["Drugs", "MedicalDevices", "Biologics"]
OUTPUT_DIR = BASE_DIR / "fda_output_cleaned"
OUTPUT_DIR.mkdir(exist_ok=True)

pd.set_option("display.max_colwidth", 200)
print("Base:", BASE_DIR)
print("Text root:", TEXT_ROOT)
print("Output:", OUTPUT_DIR)
import re
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "tqdm"])
from tqdm.auto import tqdm


Base: /Users/Kay Michnicki/AllCode/FDA Data Scraping
Text root: /Users/Kay Michnicki/AllCode/FDA Data Scraping/fda_output
Output: /Users/Kay Michnicki/AllCode/FDA Data Scraping/fda_output_cleaned


In [11]:
# Discover text files

def list_text_files():
    rows = []
    for cat in CATEGORIES:
        txt_dir = TEXT_ROOT / cat / "text"
        if not txt_dir.exists():
            continue
        for p in txt_dir.glob("*.txt"):
            rows.append({
                "category": cat,
                "file_path": str(p),
                "filename": p.name,
            })
    return pd.DataFrame(rows)

files_df = list_text_files()
print("Files found:", len(files_df))
files_df.head()


Files found: 1880


Unnamed: 0,category,file_path,filename
0,Drugs,/Users/Kay Michnicki/AllCode/FDA Data Scraping/fda_output/Drugs/text/PDF-239-KBPDF-239-KB-of-Certain-Ophthalmic-Products-Policy-Regarding-Compliance-With-21-CFR-Part-4-Guidance-for-Industry-Guidan...,PDF-239-KBPDF-239-KB-of-Certain-Ophthalmic-Products-Policy-Regarding-Compliance-With-21-CFR-Part-4-Guidance-for-Industry-Guidance-for-Industry_20251026_190614.txt
1,Drugs,/Users/Kay Michnicki/AllCode/FDA Data Scraping/fda_output/Drugs/text/PDF-21288-KBPDF-21288-KB-of-SUPAC-Manufacturing-Equipment-Addendum_20251026_191638.txt,PDF-21288-KBPDF-21288-KB-of-SUPAC-Manufacturing-Equipment-Addendum_20251026_191638.txt
2,Drugs,/Users/Kay Michnicki/AllCode/FDA Data Scraping/fda_output/Drugs/text/PDF-15726-KBPDF-15726-KB-of-Considerations-When-Transferring-Clinical-Investigation-Oversight-to-Another-IRB-Guidance-for-IRBs-...,PDF-15726-KBPDF-15726-KB-of-Considerations-When-Transferring-Clinical-Investigation-Oversight-to-Another-IRB-Guidance-for-IRBs-Clinical-Investigators-and-Sponsors_20251026_191705.txt
3,Drugs,/Users/Kay Michnicki/AllCode/FDA Data Scraping/fda_output/Drugs/text/PDF-9794-KBPDF-9794-KB-of-E18-Genomic-Sampling-and-Management-of-Genomic-Data-Guidance-for-Industry_20251026_191257.txt,PDF-9794-KBPDF-9794-KB-of-E18-Genomic-Sampling-and-Management-of-Genomic-Data-Guidance-for-Industry_20251026_191257.txt
4,Drugs,/Users/Kay Michnicki/AllCode/FDA Data Scraping/fda_output/Drugs/text/PDF-24089-KBPDF-24089-KB-of-Cross-Labeling-Oncology-Drugs-in-Combination-Regimens_20251026_190429.txt,PDF-24089-KBPDF-24089-KB-of-Cross-Labeling-Oncology-Drugs-in-Combination-Regimens_20251026_190429.txt


In [18]:
# Cleaning utilities
from typing import List


tqdm.pandas()

URL_PAT = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
PAGE_NUM_PAT = re.compile(r"^\s*(page|p\.)\s*\d+(\s*/\s*\d+)?\s*$", re.IGNORECASE)
MULTI_WS_PAT = re.compile(r"[ \t]{2,}")
HARD_BREAK_HYPHEN = re.compile(r"-\n")
INLINE_HYPHEN_WRAP = re.compile(r"(\w)-\n(\w)")


def normalize_unicode(text: str):
    return unicodedata.normalize("NFKC", text)


def strip_headers_footers(lines: List[str]):
    cleaned = []
    for ln in lines:
        if PAGE_NUM_PAT.match(ln.strip()):
            continue
        cleaned.append(ln)
    return cleaned


def clean_text(raw: str):
    x = normalize_unicode(raw)
    x = INLINE_HYPHEN_WRAP.sub(r"\1\2", x)
    x = HARD_BREAK_HYPHEN.sub("", x)
    x = URL_PAT.sub(" ", x)

    lines = x.splitlines()
    lines = strip_headers_footers(lines)
    x = "\n".join(lines)

    x = MULTI_WS_PAT.sub(" ", x)
    x = re.sub(r"\n{3,}", "\n\n", x)
    x = x.strip()
    return x


def read_text(path: str):
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()


def build_clean_dataframe(df: pd.DataFrame):
    df = df.copy()
    df["raw_text"] = df["file_path"].progress_apply(read_text)
    df["clean_text"] = df["raw_text"].progress_apply(clean_text)
    df["char_len"] = df["clean_text"].str.len()
    df["word_count"] = df["clean_text"].str.split().str.len()
    return df

clean_df = build_clean_dataframe(files_df)
clean_df[["category","filename","word_count","char_len"]].head()


  0%|          | 0/1880 [00:00<?, ?it/s]

  0%|          | 0/1880 [00:00<?, ?it/s]

Unnamed: 0,category,filename,word_count,char_len
0,Drugs,PDF-239-KBPDF-239-KB-of-Certain-Ophthalmic-Products-Policy-Regarding-Compliance-With-21-CFR-Part-4-Guidance-for-Industry-Guidance-for-Industry_20251026_190614.txt,2812,19347
1,Drugs,PDF-21288-KBPDF-21288-KB-of-SUPAC-Manufacturing-Equipment-Addendum_20251026_191638.txt,10725,63151
2,Drugs,PDF-15726-KBPDF-15726-KB-of-Considerations-When-Transferring-Clinical-Investigation-Oversight-to-Another-IRB-Guidance-for-IRBs-Clinical-Investigators-and-Sponsors_20251026_191705.txt,7425,48079
3,Drugs,PDF-9794-KBPDF-9794-KB-of-E18-Genomic-Sampling-and-Management-of-Genomic-Data-Guidance-for-Industry_20251026_191257.txt,5421,37538
4,Drugs,PDF-24089-KBPDF-24089-KB-of-Cross-Labeling-Oncology-Drugs-in-Combination-Regimens_20251026_190429.txt,2419,16427


In [19]:
# Save cleaned outputs
tqdm.pandas()

from pathlib import Path

parquet_path = OUTPUT_DIR / "fda_cleaned.parquet"
cols = ["category","filename","file_path","clean_text","word_count","char_len"]
clean_df[cols].to_parquet(parquet_path, index=False)
print("Saved:", parquet_path)

# Optional: also write cleaned .txt files mirroring structure
WRITE_CLEAN_TXT = False  # set True to enable

def write_clean_texts(df: pd.DataFrame, root: Path):
    for cat, grp in df.groupby("category"):
        out_dir = root / cat / "text"
        out_dir.mkdir(parents=True, exist_ok=True)
        for _, row in grp.iterrows():
            out_path = out_dir / row["filename"]
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(row["clean_text"])

if WRITE_CLEAN_TXT:
    write_clean_texts(clean_df, OUTPUT_DIR)
    print("Clean text files written under:", OUTPUT_DIR)


Saved: /Users/Kay Michnicki/AllCode/FDA Data Scraping/fda_output_cleaned/fda_cleaned.parquet


In [20]:
# Verification and sampling
# Shortest docs
short_df = clean_df.sort_values("word_count").head(10)[["category","filename","word_count"]]
print(short_df)

# Sample a few cleaned texts
sample = clean_df.sample(min(3, len(clean_df)), random_state=42)
for _, r in sample.iterrows():
    print("===", r["category"], r["filename"]) 
    print(r["clean_text"][:1000], "\n")

            category  \
347            Drugs   
1857       Biologics   
1112  MedicalDevices   
1390  MedicalDevices   
146            Drugs   
1666       Biologics   
710            Drugs   
1076  MedicalDevices   
1326  MedicalDevices   
1072  MedicalDevices   

                                                                                                                                                         filename  \
347                                                         PDF-4285-KBPDF-4285-KB-of-Industry-Supported-Scientific-and-Educational-Activites_20251026_192513.txt   
1857                                                        PDF-4285-KBPDF-4285-KB-of-Industry-Supported-Scientific-and-Educational-Activites_20251026_185758.txt   
1112                                                        PDF-4285-KBPDF-4285-KB-of-Industry-Supported-Scientific-and-Educational-Activites_20251026_194148.txt   
1390                                 PDF-4285-KBPDF-4285-KB-of-Industry-Sup