# In [None]:  (Jupyter cell marker left over from the notebook export)
"""
SEC Topic Modeling (BERTopic) — single-file script that runs in Jupyter.

How to use in a Jupyter notebook:
1) Put this file in the same folder as your notebook (e.g., sec_bertopic_pipeline.py)
2) In a notebook cell run:
   %run sec_bertopic_pipeline.py

Dependencies (run once):
   !pip install bertopic[all] sentence-transformers pandas

What it does:
- Loads SEC .txt filings (EDGAR full submission format)
- Extracts <DOCUMENT> blocks and their <TEXT> sections
- Cleans + chunks the narrative text
- Runs BERTopic and forces EXACTLY 10 topics (by reduce_topics)
- Prints demo-ready topic summaries and writes CSV outputs
"""

from __future__ import annotations

import re
import html
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple

import pandas as pd

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer



# EDGAR full-submission .txt files to load; each entry is passed to
# parse_edgar_submission() by build_corpus().
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own filing(s) before running.
FILES = [
    "/Users/rithikreddynibbaragandlla/PycharmProjects/JupyterProject/0001193125-19-235916.txt"
]

# If these doc TYPES exist in the filing, we prioritize them; otherwise we fall back to all types.
# Compared by exact string equality against each block's <TYPE> value.
PREFERRED_TYPES = {"10-K", "10-Q", "8-K", "S-1", "S-3", "S-4", "424B5", "424B2", "424B3", "DEF 14A"}

# Chunking behavior (character counts; these are the defaults of chunk_text)
MIN_CHUNK_CHARS = 450      # chunks shorter than this are discarded
TARGET_CHUNK_CHARS = 1100  # sentences are accumulated up to roughly this size
MAX_CHUNK_CHARS = 1600     # a single sentence longer than this is hard-split

# Topic modeling
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # name handed to SentenceTransformer(...)
TARGET_TOPICS = 10                         # passed as nr_topics to BERTopic.reduce_topics

# Output file names (relative, so they resolve against the current working directory)
OUT_TOPICS_CSV = "sec_topics_bertopic.csv"
OUT_DOCS_CSV = "sec_docs_with_topics.csv"


# General English stopwords, all lowercase.
# preprocess_text() tokenizes with [A-Za-z]+ and keeps only tokens longer than
# 3 characters, so the entries containing apostrophes (e.g. "aren't") and the
# entries of 3 letters or fewer can never be the deciding filter there; they
# are harmless but redundant for that caller.
BASE_STOPWORDS = {
    "a","about","above","after","again","against","all","am","an","and","any","are",
    "aren't","as","at","be","because","been","before","being","below","between","both",
    "but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't",
    "doing","don't","down","during","each","few","for","from","further","had","hadn't",
    "has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here",
    "here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll",
    "i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me",
    "more","most","mustn't","my","myself","no","nor","not","of","off","on","once",
    "only","or","other","ought","our","ours","ourselves","out","over","own","same",
    "shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such",
    "than","that","that's","the","their","theirs","them","themselves","then","there",
    "there's","these","they","they'd","they'll","they're","they've","this","those",
    "through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll",
    "we're","we've","were","weren't","what","what's","when","when's","where","where's",
    "which","while","who","who's","whom","why","why's","with","won't","would","wouldn't",
    "you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"
}

# Finance-specific stopwords added on top: filing/legal boilerplate terms and
# generic instrument vocabulary that would otherwise dominate every topic.
FIN_EXTRA = {
    "section","herein","hereby","thereof","exhibit","prospectus","form",
    "llc","inc","corporation","company","issuer","tender","exchange",
    "may","shall","pursuant","filing","statement","table","contents",
    "security","securities","notes","debentures","shares","rate","interest",
    "benchmark","maturity","coupon","floating","fixed","redemption",
    "rule","holders","offering","registrant","corporate","document"
}

# Union of both sets; this is the set preprocess_text() filters tokens against.
FIN_STOPWORDS = BASE_STOPWORDS | FIN_EXTRA


def preprocess_text(text: str) -> str:
    """
    Reduce raw filing text to a space-separated string of lowercase content words.

    Steps, in order: strip HTML tags / decode entities, remove boilerplate
    markers, blank out numeric runs, then keep only purely alphabetic tokens
    that are longer than 3 characters and not in FIN_STOPWORDS.

    The result contains no punctuation or digits.
    """
    plain = normalize_boilerplate(strip_html_tags_keep_text(text))

    # remove numbers, CUSIP codes, % signs, $ amounts etc.
    without_numbers = re.sub(r"\$?\d[\d,\.%]*", " ", plain)

    content_words = (
        word
        for word in re.findall(r"[A-Za-z]+", without_numbers.lower())
        if len(word) > 3 and word not in FIN_STOPWORDS
    )
    return " ".join(content_words)

@dataclass
class SecDocBlock:
    """One <DOCUMENT> block extracted from an EDGAR full-submission file."""
    source_file: str  # file name (not full path) of the submission the block came from
    doc_type: str     # value of the block's <TYPE> tag; "UNKNOWN" when the tag is missing
    filename: str     # value of the block's <FILENAME> tag; "" when the tag is missing
    raw_text: str     # unprocessed contents between <TEXT> and </TEXT>


# Any angle-bracket tag: "<" followed by one or more non-">" characters and ">".
_TAG_RE = re.compile(r"<[^>]+>")
# One or more consecutive whitespace characters (used to collapse runs to a single space).
_WS_RE = re.compile(r"\s+")
# A whole <DOCUMENT>...</DOCUMENT> block; group 1 is the block body.
# Case-insensitive, non-greedy, and DOTALL so the body may span many lines.
_DOC_BLOCK_RE = re.compile(r"<DOCUMENT>(.*?)</DOCUMENT>", re.IGNORECASE | re.DOTALL)


def _read_text(path: Path) -> str:
    # EDGAR can be messy; ignore decode errors rather than fail
    return path.read_text(errors="ignore")


def _extract_tag_value(text: str, tag: str) -> Optional[str]:
    # Match <TAG>value up to end of line or next tag
    m = re.search(rf"<{tag}>\s*([^\n<]+)", text, flags=re.IGNORECASE)
    return m.group(1).strip() if m else None


def parse_edgar_submission(file_path: str) -> List[SecDocBlock]:
    """
    Split a full EDGAR submission .txt into its <DOCUMENT> blocks.

    For every block that contains a <TEXT>...</TEXT> section, a SecDocBlock is
    produced with the block's <TYPE> ("UNKNOWN" if absent), its <FILENAME>
    ("" if absent) and the raw text section. Blocks without a <TEXT> section
    are skipped.
    """
    submission = Path(file_path)
    contents = _read_text(submission)

    parsed: List[SecDocBlock] = []
    for doc_match in _DOC_BLOCK_RE.finditer(contents):
        body = doc_match.group(1)
        text_section = re.search(r"<TEXT>(.*?)</TEXT>", body, flags=re.IGNORECASE | re.DOTALL)
        if text_section is None:
            continue
        parsed.append(
            SecDocBlock(
                source_file=submission.name,
                doc_type=_extract_tag_value(body, "TYPE") or "UNKNOWN",
                filename=_extract_tag_value(body, "FILENAME") or "",
                raw_text=text_section.group(1),
            )
        )

    return parsed


def strip_html_tags_keep_text(text: str) -> str:
    """
    Remove HTML-ish tags and decode entities. Many SEC <TEXT> blocks are HTML.

    HTML comments, <script> elements and <style> elements are removed together
    with their contents; every other tag is replaced by a space so the text on
    either side does not run together. Entities are decoded after tag removal,
    NUL characters are blanked, and whitespace runs collapse to single spaces.

    Returns:
        The visible text as a single line with no leading/trailing whitespace.
    """
    # Comments must go first and as a unit: the generic tag pattern stops at the
    # first ">" and would leave the tail of a comment such as "<!-- a > b -->".
    text = re.sub(r"(?s)<!--.*?-->", " ", text)
    text = re.sub(r"(?is)<script.*?>.*?</script>", " ", text)
    text = re.sub(r"(?is)<style.*?>.*?</style>", " ", text)

    text = _TAG_RE.sub(" ", text)
    # Decode entities only after tags are gone, so "&lt;b&gt;" stays literal text.
    text = html.unescape(text)
    text = text.replace("\x00", " ")
    return _WS_RE.sub(" ", text).strip()


def looks_like_table_or_numbers(text: str) -> bool:
    """
    Decide whether a chunk should be skipped as table-like / mostly numeric.

    Returns True when the text is shorter than 200 characters, or when more
    than 18% of its characters are digits; False otherwise.
    """
    total_chars = len(text)
    if total_chars < 200:
        return True

    digit_count = sum(map(str.isdigit, text))
    # Heavily numeric text is likely a table or footnote that harms topics.
    return digit_count / total_chars > 0.18


def normalize_boilerplate(text: str) -> str:
    """
    Light boilerplate cleanup.

    Blanks out "table of contents" (any case), "S-<digits>" markers and
    "Page <digits>" markers, then collapses whitespace runs to single spaces
    and trims the ends. Finance vocabulary is left untouched.
    """
    noise_patterns = (
        r"(?i)\btable of contents\b",
        r"\bS-\d+\b",        # common page footer
        r"\bPage\s+\d+\b",
    )
    for noise in noise_patterns:
        text = re.sub(noise, " ", text)

    return _WS_RE.sub(" ", text).strip()


_SENT_SPLIT_RE = re.compile(r"(?<=[\.\!\?])\s+")


def _hard_split(sentence: str, max_chars: int) -> List[str]:
    """Cut an over-long sentence into pieces of at most max_chars, preferring to cut at a space."""
    pieces: List[str] = []
    start = 0
    total = len(sentence)
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            # Last space inside the window (index `end` itself included) keeps
            # words intact; start + 1 guarantees the loop always advances.
            cut = sentence.rfind(" ", start + 1, end + 1)
            if cut != -1:
                end = cut
        piece = sentence[start:end].strip()
        if piece:
            pieces.append(piece)
        start = end
    return pieces


def chunk_text(text: str,
               min_chars: int = MIN_CHUNK_CHARS,
               target_chars: int = TARGET_CHUNK_CHARS,
               max_chars: int = MAX_CHUNK_CHARS) -> List[str]:
    """
    Sentence-ish chunking:
    - Split to sentences (after ".", "!" or "?" followed by whitespace)
    - Accumulate into ~target_chars chunks
    - Enforce min/max lengths

    A sentence longer than max_chars is cut into pieces of at most max_chars,
    at a space whenever the window contains one (a fixed-offset cut is only the
    fallback for a single unbroken run). Any chunk or piece shorter than
    min_chars is discarded.

    Args:
        text: text to chunk; sentence punctuation must still be present for
            sentence splitting to have any effect.
        min_chars: minimum length of an emitted chunk.
        target_chars: soft upper bound used while accumulating sentences.
        max_chars: hard upper bound for a single sentence before it is cut.

    Returns:
        The chunks in document order (possibly empty).

    Raises:
        ValueError: if max_chars is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive number of characters")

    chunks: List[str] = []
    buf: List[str] = []
    buf_len = 0

    def flush() -> None:
        # Emit the buffered sentences as one chunk (if long enough) and reset.
        nonlocal buf, buf_len
        if not buf:
            return
        chunk = " ".join(buf).strip()
        buf = []
        buf_len = 0
        if len(chunk) >= min_chars:
            chunks.append(chunk)

    for s in _SENT_SPLIT_RE.split(text):
        s = s.strip()
        if not s:
            continue

        # If one sentence is enormous, hard split it.
        if len(s) > max_chars:
            # flush current buffer first so chunk order follows the document
            flush()
            chunks.extend(part for part in _hard_split(s, max_chars) if len(part) >= min_chars)
            continue

        # +1 accounts for the joining space between sentences.
        if buf_len + len(s) + 1 <= target_chars:
            buf.append(s)
            buf_len += len(s) + 1
        else:
            flush()
            buf.append(s)
            buf_len = len(s) + 1

    flush()
    return chunks


def build_corpus(files: List[str],
                 preferred_types: set[str] = PREFERRED_TYPES) -> Tuple[List[str], List[Dict]]:
    """
    Turn EDGAR submissions into text chunks ready for BERTopic.

    Pipeline per document block: strip HTML and boilerplate, split the readable
    text into sentence-based chunks, drop table-like / numeric chunks, and only
    then reduce each surviving chunk to content words with preprocess_text.
    Chunking and the numeric filter need punctuation and digits, which
    preprocess_text removes, so the token cleanup has to come last.

    Args:
        files: paths of EDGAR full-submission .txt files.
        preferred_types: document <TYPE> values to keep. If none of them occurs
            in any parsed block (across all files), every block is kept.

    Returns:
      docs: list[str] chunks for BERTopic (lowercase content words)
      meta: list[dict] per chunk with provenance, parallel to docs. Keys:
            source_file, doc_type, doc_filename, chunk_id (index of the chunk
            within its block before filtering), chunk_chars (length of the
            emitted doc string).
    """
    all_blocks: List[SecDocBlock] = []
    for fp in files:
        all_blocks.extend(parse_edgar_submission(fp))

    # If any preferred types exist, keep only those; otherwise keep everything
    existing_types = {b.doc_type for b in all_blocks}
    use_types = set(preferred_types) & existing_types
    filtered = [b for b in all_blocks if b.doc_type in use_types] if use_types else all_blocks

    docs: List[str] = []
    meta: List[Dict] = []

    for b in filtered:
        # Readable text: tags removed, punctuation and digits still present.
        readable = normalize_boilerplate(strip_html_tags_keep_text(b.raw_text))
        if len(readable) < 1000:
            continue  # too little narrative text to be worth chunking

        for idx, ch in enumerate(chunk_text(readable)):
            if looks_like_table_or_numbers(ch):
                continue
            doc = preprocess_text(ch)
            if not doc:
                continue  # nothing but stopwords / short tokens survived
            docs.append(doc)
            meta.append({
                "source_file": b.source_file,
                "doc_type": b.doc_type,
                "doc_filename": b.filename,
                "chunk_id": idx,
                "chunk_chars": len(doc),
            })

    return docs, meta


#Start of BERTopic

def fit_bertopic_force_10_topics(docs: List[str]):
    """
    Fit BERTopic on *docs* and reduce the result to TARGET_TOPICS topics.

    Args:
        docs: text chunks to model; at least 20 are required.

    Returns:
        (topic_model, topics, probs) where topics and probs are the fitted
        model's per-document assignments after reduction (``topics_`` and
        ``probabilities_``), aligned with *docs*.

    Raises:
        ValueError: if fewer than 20 chunks are supplied.
    """
    if len(docs) < 20:
        raise ValueError(f"Not enough text chunks: {len(docs)} detected.")

    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

    # First run: loose clustering (min_topic_size small so topics form)
    topic_model = BERTopic(
        embedding_model=embedding_model,
        min_topic_size=5,
        verbose=True
    )
    topic_model.fit_transform(docs)

    # reduce_topics updates the fitted model in place; its return value is not
    # relied upon, so the model can never be replaced by None.
    topic_model.reduce_topics(docs, nr_topics=TARGET_TOPICS)

    # Read the post-reduction assignments straight from the fitted model.
    # Calling transform(docs) again would re-embed every chunk and may assign
    # topics that disagree with the counts reported by get_topic_info().
    topics = topic_model.topics_
    probs = topic_model.probabilities_

    return topic_model, topics, probs



def topic_words(topic_model: BERTopic, topic_id: int, top_n: int = 10) -> List[str]:
    """
    Return up to *top_n* words of a topic, in the order the model reports them.

    An empty list is returned when the model has nothing (or a falsy value)
    for *topic_id*.
    """
    scored_terms = topic_model.get_topic(topic_id)
    if not scored_terms:
        return []
    return [term for term, _score in scored_terms[:top_n]]


def _format_topic_output(topic_model, n_words=20):
    """
    Format BERTopic topics to look like LDA:
    (topic_id, '0.027*"word1" + 0.023*"word2" + ...')

    The outlier topic (-1) is skipped; at most n_words terms are used per topic.
    """
    formatted = []
    for topic_id, word_scores in topic_model.get_topics().items():
        if topic_id == -1:  # Skip outlier topic
            continue
        weighted_terms = [
            f'{round(score, 3)}*"{word}"'
            for word, score in word_scores[:n_words]
        ]
        formatted.append((topic_id, " + ".join(weighted_terms)))
    return formatted


def main():
    """
    Run the whole pipeline: build the corpus, fit the topic model, print the
    topic summaries, and write the two CSV outputs.

    Writes (to the current working directory):
        OUT_TOPICS_CSV: one row per reported topic (topic_id, count, auto_name, top_words).
        OUT_DOCS_CSV: one row per chunk with its provenance, assigned topic and text.

    Raises:
        RuntimeError: if no usable text chunks were extracted.
        ValueError: propagated from the fitting step when there are too few chunks.
    """
    # --- Build corpus
    docs, meta = build_corpus(FILES, PREFERRED_TYPES)
    print(f"Chunks produced for modeling: {len(docs)}")
    if not docs:
        raise RuntimeError("No usable text chunks were extracted. "
                           "Try lowering MIN_CHUNK_CHARS, or check that the filings are valid EDGAR .txt files.")

    # --- Fit topic model
    topic_model, topics, _probs = fit_bertopic_force_10_topics(docs)

    # --- Topic summary (exclude outlier topic -1 if it exists)
    info = topic_model.get_topic_info()
    info_non_outlier = info[info["Topic"] != -1]

    # If reduce_topics worked, we should have exactly TARGET_TOPICS here (sometimes topic -1 exists in addition).
    top_topics = info_non_outlier.head(TARGET_TOPICS)

    rows = [
        {
            "topic_id": int(r["Topic"]),
            "count": int(r["Count"]),
            "auto_name": r["Name"],
            "top_words": ", ".join(topic_words(topic_model, int(r["Topic"]), top_n=12)),
        }
        for _, r in top_topics.iterrows()
    ]

    # Explicit columns keep sort_values from raising KeyError when every chunk
    # ended up in the outlier topic and `rows` is empty.
    topics_df = pd.DataFrame(
        rows, columns=["topic_id", "count", "auto_name", "top_words"]
    ).sort_values(["count"], ascending=False)

    print("\n=== Demo-ready topic list (10 topics) ===")
    for _, r in topics_df.iterrows():
        print(f"\nTopic {r['topic_id']} | chunks={r['count']}")
        print(f"Label: {r['auto_name']}")
        print(f"Top words: {r['top_words']}")

    # --- Persist results: the topic table plus every chunk with its assignment
    topics_df.to_csv(OUT_TOPICS_CSV, index=False)
    docs_df = pd.DataFrame(meta)
    docs_df["topic"] = list(topics)  # topics is aligned with docs/meta, one entry per chunk
    docs_df["text"] = docs
    docs_df.to_csv(OUT_DOCS_CSV, index=False)
    print(f"\nWrote {OUT_TOPICS_CSV} and {OUT_DOCS_CSV}")

    # --- LDA-style weighted term listing
    for topic in _format_topic_output(topic_model, n_words=20):
        print(topic)





# Entry point: run the pipeline when this file is executed as a script, which
# includes `%run` in Jupyter. Importing the module does not trigger main().
if __name__ == "__main__":
    main()
