In [None]:
# IMPORTANT:
# This code requires the WRDS-hosted Jupyter environment to run.
# It will NOT work on a local machine because it relies on WRDS internal paths
# and direct filesystem access to SEC archives.
#
# Link: https://wrds-jupyter.wharton.upenn.edu/
#
# Before you start, create your own folder under /scratch/oregonstate/.
# This code uses "kuohsu" as the folder name; use find-and-replace (Ctrl + F) to swap in your own folder name.
#
# Required input:
#   fname_list.csv  → must contain column: "path" OR "fname" (SEC public paths)
#
# Output (CLEAN ONLY):
#   /scratch/oregonstate/kuohsu/filings_text/
#   - Only readable .txt files (not raw)
#   - Keeps primary filing doc + finance/legal exhibits (EX-10.*, EX-99.*)
#   - Drops junk docs (GRAPHIC, EX-101.*, XML/XBRL, PDF/ZIP/EXCEL, etc.)
#
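# Archives:
#   /scratch/oregonstate/kuohsu/filings_text.tar.gz
#   /scratch/oregonstate/kuohsu/filings_text.zip  (plus .z01, .z02, ... parts when the ZIP is split)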
#
# Notes:
# - Files in /scratch are temporary (~1 week retention)
# - Only errors are printed (NO MATCH / MISSING / FAILED)

import os
import re
import glob
import html as ihtml
import shutil
import pandas as pd
import wrds
from bs4 import BeautifulSoup
import subprocess



# HTML -> text cleanup
def strip_html_to_text(s: str) -> str:
    """Convert HTML markup into readable plain text.

    Non-content elements (script/style/noscript/svg/img) are removed, line
    structure is kept by turning ``<br>`` and the end of common block-level
    tags into newlines, and whitespace is normalised afterwards.

    Args:
        s: HTML document or fragment. ``None`` and ``""`` are accepted.

    Returns:
        The extracted text, stripped of leading/trailing whitespace; ``""``
        for empty input.
    """
    if not s:
        return ""

    # The markup is intentionally NOT run through html.unescape() before
    # parsing: doing so turns escaped text such as "&lt;b&gt;" into real tags
    # (which the parser then swallows) and decodes "&amp;lt;" twice.  The
    # parser converts entities to characters on its own.
    try:
        # lxml is faster/more robust; fall back to the built-in parser when
        # it is not installed or fails on this input.
        soup = BeautifulSoup(s, "lxml")
    except Exception:
        soup = BeautifulSoup(s, "html.parser")

    # Remove elements that never contribute readable text.
    for tag in soup(["script", "style", "noscript", "svg", "img"]):
        tag.decompose()

    # Preserve structure: explicit line breaks become newlines ...
    for br in soup.find_all("br"):
        br.replace_with("\n")

    # ... and every block-level element ends with a newline.
    block_tags = ["p", "div", "tr", "li", "h1", "h2", "h3", "h4", "h5", "h6", "table"]
    for block in soup.find_all(block_tags):
        block.append("\n")

    text = soup.get_text(separator=" ", strip=False)

    # Collapse runs of spaces/tabs/non-breaking spaces (from &nbsp;) into one
    # ordinary space, but keep newlines so the line structure survives.
    text = re.sub(r"[ \t\xa0]+", " ", text)

    # Re-attach percentages split onto their own line ("12.5\n%" -> "12.5% ").
    text = re.sub(r"(?m)(\d+(?:\.\d+)?)\s*\n\s*%\s*", r"\1% ", text)

    # Collapse excessive blank lines (3+ newlines -> one blank line).
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)

    # Remove trailing whitespace on each line.
    text = "\n".join(line.rstrip() for line in text.splitlines())

    return text.strip()


# Header lines that mark the start of the human-readable filing.  Each one
# must sit alone on its own line (hence the surrounding newlines).
_FILING_START_MARKERS = (
    "\nUNITED STATES\n",
    "\nSECURITIES AND EXCHANGE COMMISSION\n",
    "\nFORM 8-K\n",
    "\nFORM 10-K\n",
    "\nFORM 10-Q\n",
    "\nSCHEDULE 14A\n",
    "\nDEFINITIVE PROXY STATEMENT\n",
    "\nCURRENT REPORT\n",
)

# One case-insensitive alternation; re.search returns the leftmost hit, i.e.
# the earliest marker of any kind.
_FILING_START_RE = re.compile(
    "|".join(re.escape(marker) for marker in _FILING_START_MARKERS),
    re.IGNORECASE,
)


def drop_leading_xbrl_gunk(text: str) -> str:
    """Cut inline-XBRL / XML-ish junk that precedes the first filing header.

    Some newer filings have such junk at the beginning of <TEXT>.  When one of
    the known header lines is found, everything before the earliest one is
    discarded.

    The search runs on ``text`` itself rather than on ``text.upper()``:
    upper-casing can change the string length (e.g. "ß" -> "SS"), which would
    make an offset found in the upper-cased copy point at the wrong character
    of the original.

    Args:
        text: Document body (plain text or markup).

    Returns:
        ``text`` from the earliest marker onward with leading whitespace
        removed, or ``text`` unchanged when no marker is found or the marker
        is already at position 0.
    """
    if not text:
        return text

    hit = _FILING_START_RE.search(text)
    if hit is None or hit.start() == 0:
        return text
    return text[hit.start():].lstrip()



# SEC parsing helpers

# Delimiters of one embedded document inside a full-submission text file.
DOC_START = re.compile(r"<DOCUMENT>\s*", re.IGNORECASE)
DOC_END   = re.compile(r"</DOCUMENT>\s*", re.IGNORECASE)

# Document type = the rest of the <TYPE> line (up to the next tag or line
# end).  Form types may contain spaces and slashes ("DEF 14A", "10-K/A"), so
# the capture must not stop at the first blank.  Callers strip the value.
TYPE_LINE = re.compile(r"<TYPE>[ \t]*([^<\r\n]+)", re.IGNORECASE)

# Delimiters of the SEC header block.
SEC_HDR_START = re.compile(r"<SEC-HEADER>", re.IGNORECASE)
SEC_HDR_END   = re.compile(r"</SEC-HEADER>", re.IGNORECASE)

# Submission type = the rest of the "CONFORMED SUBMISSION TYPE:" header line,
# for the same reason as TYPE_LINE ("8-K/A", "SC 13D/A", ...).
SUBMISSION_TYPE_LINE = re.compile(
    r"CONFORMED SUBMISSION TYPE:[ \t]*([^\r\n]+)", re.IGNORECASE
)

# Junk doc types we never want in "readable text"
SKIP_TYPES = {
    "GRAPHIC", "ZIP", "EXCEL", "PDF", "JSON", "XML",
    "XBRL", "IDEA", "SCHEMA", "CAL", "DEF", "LAB", "PRE"
}

def get_primary_form_from_header(raw: str) -> str:
    """Return the submission type declared in the filing's SEC header.

    The header is the text between the first SEC_HDR_START match and the
    first SEC_HDR_END match.  The value captured by SUBMISSION_TYPE_LINE
    inside it is returned stripped and upper-cased.

    Returns "" when the header delimiters are missing or out of order, or
    when the header has no submission-type line.
    """
    opening = SEC_HDR_START.search(raw)
    closing = SEC_HDR_END.search(raw)
    if opening is None or closing is None:
        return ""
    if closing.start() <= opening.end():
        return ""

    header = raw[opening.end():closing.start()]
    found = SUBMISSION_TYPE_LINE.search(header)
    if found is None:
        return ""
    return found.group(1).strip().upper()

def is_xbrl_aux(doc_type: str) -> bool:
    """Tell whether a document type names an EX-101 (XBRL) auxiliary document.

    The comparison ignores case and surrounding whitespace; ``None`` is
    treated as an empty type.  Matches "EX-101", "101" and anything starting
    with "EX-101.".
    """
    normalized = (doc_type or "").upper().strip()
    if normalized in ("EX-101", "101"):
        return True
    return normalized.startswith("EX-101.")

def should_keep_doc_type(doc_type: str, primary_form: str) -> bool:
    """Decide whether a document of the given type belongs in the clean output.

    Kept are:
      * the primary filing document, i.e. a type equal to ``primary_form``
        (8-K, DEFA14A, 10-K, etc.);
      * finance/legal exhibits of the EX-10 and EX-99 families
        ("EX-10", "EX-10.1", "EX-99.2", ...).

    Both arguments are compared case-insensitively after stripping
    whitespace; ``None`` is treated as empty.

    Args:
        doc_type: Type of the embedded document.
        primary_form: Submission type of the filing; may be empty.

    Returns:
        True when the document should be kept, False otherwise (including
        when ``doc_type`` is empty).
    """
    t = (doc_type or "").upper().strip()
    p = (primary_form or "").upper().strip()

    if not t:
        return False

    # Keep the primary filing document.
    if p and t == p:
        return True

    # Keep EX-10 / EX-99 exhibits.  A plain startswith("EX-10") would also
    # accept EX-100 ... EX-109 (e.g. the XBRL exhibits EX-101.* and EX-104),
    # so the family number must not be followed by another digit.
    for family in ("EX-10", "EX-99"):
        if t.startswith(family) and not t[len(family):len(family) + 1].isdigit():
            return True

    return False


# Case-insensitive <TEXT> delimiters.  They are matched against the document
# itself (not an upper-cased copy) so the offsets are valid slice positions.
_TEXT_OPEN_RE = re.compile(r"<TEXT>", re.IGNORECASE)
_TEXT_CLOSE_RE = re.compile(r"</TEXT>", re.IGNORECASE)

# Heuristic for "this body is HTML": an opening tag typical of HTML filings.
_HTML_TAG_RE = re.compile(r"(?i)<\s*(html|body|div|p|br|table|tr|td|font)\b")


def _extract_text_section(doc_text: str) -> str:
    """Return what lies between the first <TEXT> and the last </TEXT> after it.

    Falls back to the whole document when either delimiter is missing.
    """
    opener = _TEXT_OPEN_RE.search(doc_text)
    if opener is None:
        return doc_text

    closer = None
    for closer in _TEXT_CLOSE_RE.finditer(doc_text, opener.end()):
        pass  # keep only the last match
    if closer is None:
        return doc_text

    return doc_text[opener.end():closer.start()]


def clean_sec_filing(raw: str, keep_header: bool = True) -> str:
    """Build a human-readable text version of a full-submission filing.

    - Keep <SEC-HEADER> (when ``keep_header`` is true)
    - Keep the primary <DOCUMENT> and key exhibits (EX-10.*, EX-99.*)
    - Drop GRAPHIC/base64 and XBRL auxiliary docs
    - Extract <TEXT> when present
    - Strip HTML tags when detected
    - Remove leading inline-XBRL "junk" in <TEXT> when found

    Args:
        raw: Complete text of the submission file.
        keep_header: Whether to copy the SEC header block to the output.

    Returns:
        The header (optional) followed by one
        "----- DOCUMENT TYPE: <type> -----" section per kept, non-empty
        document, joined with newlines and stripped.  A document whose
        closing </DOCUMENT> is missing ends the scan.
    """
    primary_form = get_primary_form_from_header(raw)
    out_parts = []

    if keep_header:
        hdr_open = SEC_HDR_START.search(raw)
        hdr_close = SEC_HDR_END.search(raw)
        if hdr_open and hdr_close and hdr_close.start() > hdr_open.end():
            out_parts.append(raw[hdr_open.start():hdr_close.end()].strip())
            out_parts.append("")

    cursor = 0
    total = len(raw)

    while cursor < total:
        opened = DOC_START.search(raw, cursor)
        if opened is None:
            break
        closed = DOC_END.search(raw, opened.end())
        if closed is None:
            break

        doc_text = raw[opened.end():closed.start()]  # inside <DOCUMENT>...</DOCUMENT>
        cursor = closed.end()

        type_hit = TYPE_LINE.search(doc_text)
        doc_type = type_hit.group(1).strip().upper() if type_hit else ""

        # Skip junk, XBRL auxiliaries and everything that is neither the
        # primary document nor a wanted exhibit.
        if (
            doc_type in SKIP_TYPES
            or is_xbrl_aux(doc_type)
            or not should_keep_doc_type(doc_type, primary_form)
        ):
            continue

        # The offsets come from searching doc_text itself.  Searching an
        # upper-cased copy is unsafe because upper() can lengthen the string
        # (e.g. "ß" -> "SS"), shifting the </TEXT> offset and leaving part of
        # the closing tag in the body.
        body = _extract_text_section(doc_text).strip()

        # Drop leading XBRL-ish junk (works even before HTML strip)
        body = drop_leading_xbrl_gunk(body)

        # Detect HTML-ish content: a known opening tag or any closing tag.
        if _HTML_TAG_RE.search(body) is not None or "</" in body:
            body = strip_html_to_text(body)

        body = body.strip()
        if not body:
            continue

        label = doc_type or "UNKNOWN"
        out_parts.append(f"----- DOCUMENT TYPE: {label} -----")
        out_parts.append(body)
        out_parts.append("")

    return "\n".join(out_parts).strip()


#  Connect to WRDS
db = wrds.Connection()


#  Load SEC public paths
df = pd.read_csv("fname_list.csv")  # must contain column: path OR fname

# Accept either column name; the rest of the script works on "path".
if "path" not in df.columns and "fname" in df.columns:
    df = df.rename(columns={"fname": "path"})

if "path" not in df.columns:
    raise ValueError(f"CSV must contain column 'path' or 'fname'. Found: {list(df.columns)}")

print("Mapping SEC public paths → WRDS internal paths...\n")


# Query WRDS mappings
# Blank rows are ignored; they can never match a WRDS entry.
public_paths_tuple = tuple(str(p) for p in df["path"].dropna().unique())

if public_paths_tuple:
    # The path list is passed as a bound parameter instead of being formatted
    # into the SQL text.  The repr of a Python tuple is not valid SQL for a
    # single element ("('x',)") and breaks on paths containing quotes.
    query = """
        SELECT DISTINCT fname, wrdsfname
        FROM wrdssec.wrds_forms
        WHERE fname IN %(fnames)s
    """
    mapping = db.raw_sql(query, params={"fnames": public_paths_tuple})
else:
    # "IN ()" is not valid SQL; with nothing to look up, skip the query.
    mapping = pd.DataFrame(columns=["fname", "wrdsfname"])

# Left join: paths without a WRDS match keep a missing "wrdsfname".
df = df.merge(mapping, how="left", left_on="path", right_on="fname")

print("Mapping completed.\n")


# Output directory (CLEAN ver ONLY)
out_dir = "/scratch/oregonstate/kuohsu/filings_text" # replace kuohsu with your own folder name

# Always start from an empty folder so files from an earlier run cannot leak
# into the archives created from it.
if os.path.exists(out_dir):
    print(f"Removing existing folder: {out_dir}")
    shutil.rmtree(out_dir)

os.makedirs(out_dir, exist_ok=True)
print(f"Saving CLEAN filings to: {out_dir}\n")


# Extract from WRDS filesystem → write CLEAN ONLY
print("Extracting filings from WRDS filesystem...\n")

# Outcome counters, reported in the summary below.
ok = 0
no_match = 0
missing = 0
failed = 0

for _, row in df.iterrows():
    wrdsfname = row["wrdsfname"]

    # The public path had no entry in the WRDS mapping.
    if pd.isna(wrdsfname):
        print("NO MATCH:", row["path"])
        no_match += 1
        continue

    file_path = f"/wrds/sec/warchives/{wrdsfname}"

    if not os.path.exists(file_path):
        print("MISSING:", file_path)
        missing += 1
        continue

    try:
        # Explicit encoding keeps the result independent of the session
        # locale; undecodable bytes are dropped.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            raw = f.read()

        clean_text = clean_sec_filing(raw, keep_header=True)

        # Output name: original basename + _clean.txt.  Only a trailing
        # ".txt" (any case) is removed; str.replace would rewrite every
        # ".txt" inside the name and miss upper-case extensions.
        base = os.path.basename(wrdsfname)
        if base.lower().endswith(".txt"):
            base = base[:-len(".txt")]
        out_name = base + "_clean.txt"

        out_path = os.path.join(out_dir, out_name)

        with open(out_path, "w", encoding="utf-8", errors="ignore") as out:
            out.write(clean_text)

        ok += 1  # silent success

    except Exception as e:
        # Best effort: one unreadable filing must not stop the whole batch.
        print("FAILED:", file_path, e)
        failed += 1


# Summary
print("\nDone!")
print(f"OK: {ok}")
print(f"NO MATCH: {no_match}")
print(f"MISSING INPUT FILE: {missing}")
print(f"FAILED: {failed}")
print(f"All CLEAN files saved under: {out_dir}")



# Create tar.gz archive
# Count the regular files in the output folder; sub-directories are ignored.
try:
    files = list(
        filter(
            lambda entry: os.path.isfile(os.path.join(out_dir, entry)),
            os.listdir(out_dir),
        )
    )
except Exception as e:
    print("ERROR listing output directory:", e)
    file_count = 0
else:
    file_count = len(files)

print(f"\nOutput file count (clean only): {file_count}")

if file_count:
    # Split ".../kuohsu/filings_text" into its parent and the folder name so
    # the archive holds paths relative to the parent directory.
    parent_dir, folder_name = os.path.split(out_dir)
    base_name = os.path.join(parent_dir, folder_name)  # /scratch/.../filings_text
    tar_gz_path = f"{base_name}.tar.gz"

    # Replace an archive left over from a previous run.
    if os.path.exists(tar_gz_path):
        print(f"Removing existing archive: {tar_gz_path}")
        os.remove(tar_gz_path)

    print(f"Creating archive: {tar_gz_path}")
    shutil.make_archive(
        base_name,
        format="gztar",
        root_dir=parent_dir,
        base_dir=folder_name,
    )

    size_mb = os.path.getsize(tar_gz_path) / (1024 * 1024)
    print(f"Archive created ({size_mb:.2f} MB)")
else:
    print("ERROR: Output folder is empty. Archive not created.")



# final step: compressed file download
# Create ZIP files
#   - Single:  filings_text.zip
#   - Split:   filings_text.z01, filings_text.z02, ... , filings_text.zip
#   - Extract on Windows: keep all parts in same folder, open filings_text.zip
#
# Each resulting file is then copied to the home directory one at a time, and
# the script waits for the user to confirm before staging the next one.
if file_count != 0:
    PART_MB = 3000  # 3GB per part

    parent_dir = os.path.dirname(out_dir)
    folder_name = os.path.basename(out_dir)  # filings_text
    zip_base = os.path.join(parent_dir, folder_name)  # /scratch/.../filings_text
    zip_path = zip_base + ".zip"                      # /scratch/.../filings_text.zip

    # check zip exists
    if shutil.which("zip") is None:
        print("\nERROR: 'zip' command not found on WRDS.")
        print("Fallback: use tar.gz + .part001 splitting (your old method).")
        raise RuntimeError("zip not installed")

    # remove old split-zip outputs; leftovers from a previous run would get
    # mixed up with the new set
    for old in glob.glob(zip_base + ".z[0-9][0-9]") + glob.glob(zip_path):
        try:
            os.remove(old)
        except OSError as e:
            print(f"WARNING: could not remove old archive file {old}: {e}")

    print(f"\nCreating split ZIP (~{PART_MB} MB per part if needed)...")

    cmd = ["zip", "-r", f"-s{PART_MB}m", zip_path, folder_name]

    # zip's progress output is discarded, but stderr is captured so that a
    # failure can be reported instead of surfacing as a bare exit status.
    try:
        subprocess.run(
            cmd,
            cwd=parent_dir,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE
        )
    except subprocess.CalledProcessError as e:
        detail = (e.stderr or b"").decode(errors="replace").strip()
        print(f"\nERROR: zip exited with status {e.returncode}.")
        if detail:
            print(detail)
        raise

    print("\nZIP creation finished.")

    # Numbered parts first, the final .zip last.
    zip_parts = sorted(glob.glob(zip_base + ".z[0-9][0-9]"))
    all_zip_files = zip_parts + ([zip_path] if os.path.exists(zip_path) else [])

    print(f"Created {len(all_zip_files)} ZIP file(s):")
    for p in all_zip_files:
        print(" -", p)

    # stage to home one by one
    home_dir = os.path.expanduser("~")

    for p in all_zip_files:
        name = os.path.basename(p)
        home_p = os.path.join(home_dir, name)

        # Remove a stale copy first; if that fails, copy2 below either
        # overwrites the file or raises.
        if os.path.exists(home_p):
            try:
                os.remove(home_p)
            except OSError:
                pass

        print(f"\nCopying {name} to home: {home_p}")
        shutil.copy2(p, home_p)

        size_mb = os.path.getsize(home_p) / (1024 * 1024)
        print(f" Staged {name} ({size_mb:.2f} MB) in home directory.")
        print(" Download it now (Jupyter UI download or scp from your local machine).")
        print(" IMPORTANT: AFTER downloading to your local device, delete it from home to free the quota.")

        # user instructions
        if len(all_zip_files) == 1 and name.endswith(".zip"):
            print("\nNOTE: Single ZIP file (not split). You can extract directly in WinRAR/7-Zip.")
        else:
            print("\nNOTE: Split ZIP set for WinRAR/7-Zip:")
            print(" - Download ALL .z01/.z02/... AND the .zip into the SAME folder.")
            print(" - Open ONLY the .zip file (e.g., filings_text.zip) to extract.")
            print("   (Do NOT open .z01/.z02 directly.)")

        # Block until the user confirms, so only one part occupies the home
        # directory at a time.
        while True:
            ans = input("Type 'ok' after you downloaded AND deleted the file from home: ").strip().lower()
            if ans == "ok":
                break
            print("Please type exactly 'ok' to continue.")

    # Printed once, after every part has been handled (it used to be printed
    # after each part).
    print("\nTask Complete.")

Loading library list...
Done
Mapping SEC public paths → WRDS internal paths...

Mapping completed.

Removing existing folder: /scratch/oregonstate/kuohsu/filings_text
Saving CLEAN filings to: /scratch/oregonstate/kuohsu/filings_text

Extracting filings from WRDS filesystem...


Done!
OK: 32784
NO MATCH: 0
MISSING INPUT FILE: 0
FAILED: 0
All CLEAN files saved under: /scratch/oregonstate/kuohsu/filings_text

Output file count (clean only): 32784
Removing existing archive: /scratch/oregonstate/kuohsu/filings_text.tar.gz
Creating archive: /scratch/oregonstate/kuohsu/filings_text.tar.gz
Archive created (343.77 MB)

Creating split ZIP (~3000 MB per part if needed)...
ZIP creation finished.
Created 1 ZIP file(s):
 - /scratch/oregonstate/kuohsu/filings_text.zip

Copying filings_text.zip to home: /home/oregonstate/kuohsu/filings_text.zip
 Staged filings_text.zip (393.08 MB) in home directory.
 Download it now (Jupyter UI download or scp from your local machine).
 IMPORTANT: AFTER downloading, d