In [None]:
import re
import random
from collections import Counter, defaultdict
import math

# ----------------------------
# 1) Offline cleaning utilities
# ----------------------------

def preprocess_text(text):
    """Lowercase the input, blank out every character that is not a-z or
    whitespace, and split the result on whitespace.
    Design note: deliberately offline and dependency-free; an NLTK/spaCy-based
    tokenizer could replace this later."""
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    return letters_only.split()

# General-purpose English stopwords (whitespace-separated, parsed once at import).
simple_stopwords = {*"""
a an the and or if in on at to for of this that is was were be been it with as by from into up out 
do does did doing have has had not so such these those he she they them we you i but which will would
should can could may might also about over under between during per each eachother
""".split()}

# Design: stopwords specific to SEC / financial disclosures.
# Grow this set as you learn which boilerplate terms dominate your filings.
domain_stopwords = {
    # Examples / placeholders - replace or extend with real disclosure-specific terms:
    "inc", "ltd", "corp", "company", "llc", "plc",
    "form", "sec", "statement",
}

# Fold the domain-specific terms into the single set used for filtering.
simple_stopwords.update(domain_stopwords)

def remove_stopwords(tokens):
    """Return, in original order, the tokens that are not members of the
    module-level ``simple_stopwords`` set."""
    kept = []
    for token in tokens:
        if token in simple_stopwords:
            continue
        kept.append(token)
    return kept

# Words whose singular and plural forms are identical; stripping "s" corrupts them.
_INVARIANT_S_WORDS = frozenset({"series", "species"})

# Endings that almost never mark a plural ("business", "prospectus", "analysis").
_NON_PLURAL_S_ENDINGS = ("ss", "us", "is")


def perform_lemmatization(tokens):
    """Very simple rule-based lemmatization.

    Rules, applied per token (first match wins):
      1. "...ing" (length > 4)  -> drop "ing"
      2. "...ed"  (length > 3)  -> drop "ed"
      3. words in the invariant list, or ending in "ss"/"us"/"is" -> unchanged
      4. "...ies" (length > 4)  -> replace "ies" with "y"
      5. "...s"   (length > 3)  -> drop "s"
      6. anything else          -> unchanged

    Rules 3 and 4 exist because a blanket "drop trailing s" turns
    "prospectus" into "prospectu", "miscellaneous" into "miscellaneou" and
    "securities" into "securitie", which splits counts across bogus word forms.

    Design note: This is intentionally lightweight and offline, and it is
    still a heuristic (e.g. nouns ending in "-ie" such as "movies" become
    "movy"). In the future, this function is the hook where we could plug in
    NLTK/spaCy lemmatization for better accuracy/efficiency.

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmas = []
    for t in tokens:
        if t.endswith("ing") and len(t) > 4:
            lemmas.append(t[:-3])
        elif t.endswith("ed") and len(t) > 3:
            lemmas.append(t[:-2])
        elif t in _INVARIANT_S_WORDS or t.endswith(_NON_PLURAL_S_ENDINGS):
            # Not a regular plural: keep the token intact.
            lemmas.append(t)
        elif t.endswith("ies") and len(t) > 4:
            # "securities" -> "security", "subsidiaries" -> "subsidiary"
            lemmas.append(t[:-3] + "y")
        elif t.endswith("s") and len(t) > 3:
            lemmas.append(t[:-1])
        else:
            lemmas.append(t)
    return lemmas

def clean_text(text):
    """Run the full cleaning pipeline on raw text and return the token list:
    tokenize via ``preprocess_text``, filter via ``remove_stopwords``, then
    normalize via ``perform_lemmatization``."""
    return perform_lemmatization(remove_stopwords(preprocess_text(text)))

# ----------------------------
# 2) Load file (change filename if needed)
# ----------------------------
FILENAME = "Disney.txt"  # point this at the disclosure text you want to analyse

# Read the whole file into memory as UTF-8 text.
with open(FILENAME, encoding="utf-8") as source_file:
    raw = source_file.read()

# Full cleaning pipeline: tokenize, drop stopwords, lemmatize.
tokens = clean_text(raw)

# Quick sanity check on what survived cleaning.
print(f"Total tokens after cleaning: {len(tokens)}")
print("Sample tokens:", tokens[:40])

# With no tokens there is nothing to model, so stop the script here.
if not tokens:
    print("No tokens after cleaning; nothing to model.")
    raise SystemExit

# Frequency and ASCII word cloud (top 30)
freq = Counter(tokens)
most_common_30 = freq.most_common(30)

print("\nTop 30 most common words:")
for w, c in most_common_30:
    print(f"{w}: {c}")

print("\nASCII Word Cloud:")
max_width = 50
# Scale every bar relative to the most frequent word. Simply clamping the raw
# count at max_width would draw an identical full-width bar for every word
# occurring max_width times or more, which hides all differences in frequency.
top_count = most_common_30[0][1] if most_common_30 else 1
for word, count in most_common_30:
    # At least one "#" so that rare words remain visible.
    bar_length = max(1, round(count * max_width / top_count))
    bar = "#" * bar_length
    print(f"{word:15} {bar}")

# ----------------------------
# 3) Build pseudo-documents
#    Design/correctness: Only chunk large disclosures; small ones stay as a single doc.
# ----------------------------
TOTAL_TOKENS = len(tokens)

# "Target" chunk size for large disclosures
DOC_CHUNK_SIZE = 200

# Threshold: below this, treat as one document instead of forcing multiple small chunks
MIN_TOKENS_FOR_MULTI_DOC = 800  # design parameter you can tune


def _chunk_tokens(token_list, chunk_size):
    """Split token_list into consecutive slices of at most chunk_size tokens."""
    return [token_list[start: start + chunk_size]
            for start in range(0, len(token_list), chunk_size)]


if TOTAL_TOKENS == 0:
    documents = []
elif TOTAL_TOKENS < MIN_TOKENS_FOR_MULTI_DOC:
    # Small/medium disclosure: keep as a single document
    documents = [tokens]
else:
    # Large disclosure: chunk into pseudo-documents
    documents = _chunk_tokens(tokens, DOC_CHUNK_SIZE)

    # If we still ended up with very few docs, slightly reduce chunk size
    # to get a bit more doc-level variation for LDA.
    if len(documents) < 5:
        DOC_CHUNK_SIZE = max(50, TOTAL_TOKENS // 8)
        documents = _chunk_tokens(tokens, DOC_CHUNK_SIZE)

num_docs = len(documents)
if num_docs > 1:
    print(f"\nBuilt {num_docs} pseudo-documents (effective chunk size ~{DOC_CHUNK_SIZE}).")
else:
    # No chunking happened on this path, so quoting DOC_CHUNK_SIZE would mislead.
    print(f"\nBuilt {num_docs} pseudo-documents "
          f"(input of {TOTAL_TOKENS} tokens kept whole; no chunking applied).")

# ----------------------------
# 4) Prepare vocabulary and ids
# ----------------------------
# Alphabetically sorted unique tokens; a word's position in this list is its id.
vocab = sorted(set(tokens))
V = len(vocab)
id2word = dict(enumerate(vocab))
word2id = {word: idx for idx, word in id2word.items()}

# Re-express each pseudo-document as a list of integer word ids.
docs_wids = [list(map(word2id.__getitem__, doc)) for doc in documents]

# ----------------------------
# 5) Simple collapsed Gibbs sampler for LDA
# ----------------------------
NUM_TOPICS = 6         # how many topics to fit
ALPHA = 0.1            # smoothing added to document-topic counts
BETA = 0.01            # smoothing added to topic-word counts
ITERATIONS = 400       # number of full Gibbs sweeps over the corpus
TOP_N_WORDS = 15       # how many words to list per topic

# Fixed seed so repeated runs produce the same topics.
random.seed(0)

K = NUM_TOPICS
D = len(docs_wids)

# Count tables maintained by the sampler:
#   doc_topic_counts[d][k]  - tokens in document d currently assigned to topic k
#   topic_word_counts[k][w] - occurrences of word id w currently assigned to topic k
#   topic_counts[k]         - total tokens currently assigned to topic k
#   doc_lengths[d]          - number of tokens in document d
doc_lengths = list(map(len, docs_wids))
doc_topic_counts = [defaultdict(int) for _ in range(D)]
topic_word_counts = [defaultdict(int) for _ in range(K)]
topic_counts = [0] * K

# Start from a uniformly random topic for every token and fill the count tables.
topic_assignments = []
for d, doc in enumerate(docs_wids):
    initial_topics = [random.randrange(K) for _ in doc]
    for word_id, topic in zip(doc, initial_topics):
        doc_topic_counts[d][topic] += 1
        topic_word_counts[topic][word_id] += 1
        topic_counts[topic] += 1
    topic_assignments.append(initial_topics)

# Denominator smoothing term shared by every topic-word probability.
V_beta = V * BETA

def sample_topic(d, w, current_z):
    """Resample the topic of one token and return the newly drawn topic index.

    d         -- index of the document containing the token
    w         -- word id of the token
    current_z -- topic currently assigned to the token

    The token's current assignment is first removed from the module-level
    count tables (doc_topic_counts, topic_word_counts, topic_counts), a new
    topic is drawn in proportion to
        (doc-topic count + ALPHA) * (topic-word count + BETA) / (topic total + V_beta),
    and the tables are then updated with the new assignment. The caller is
    responsible for storing the returned topic in topic_assignments.
    """
    doc_counts = doc_topic_counts[d]
    old_topic_words = topic_word_counts[current_z]

    # remove current assignment (dropping zero entries keeps the dicts sparse)
    doc_counts[current_z] -= 1
    if doc_counts[current_z] == 0:
        del doc_counts[current_z]
    old_topic_words[w] -= 1
    if old_topic_words[w] == 0:
        del old_topic_words[w]
    topic_counts[current_z] -= 1

    # compute full conditional for each topic; the document-side denominator
    # does not depend on k, so it is computed once.
    doc_denominator = doc_lengths[d] - 1 + K * ALPHA
    probs = []
    for k in range(K):
        left = (doc_counts.get(k, 0) + ALPHA) / doc_denominator
        right = (topic_word_counts[k].get(w, 0) + BETA) / (topic_counts[k] + V_beta)
        probs.append(left * right)

    # normalize (fall back to uniform if every weight underflowed to zero)
    s = sum(probs)
    if s == 0:
        probs = [1.0 / K] * K
    else:
        probs = [p / s for p in probs]

    # draw new topic by walking the cumulative distribution
    r = random.random()
    cum = 0.0
    # If rounding leaves the final cumulative sum just below r, the draw
    # belongs to the last topic, not to topic 0.
    new_z = K - 1
    for k, p in enumerate(probs):
        cum += p
        if r <= cum:
            new_z = k
            break

    # add back new assignment
    doc_counts[new_z] += 1
    topic_word_counts[new_z][w] += 1
    topic_counts[new_z] += 1

    return new_z

# Gibbs sampling iterations: each sweep resamples the topic of every token once.
print("\nRunning Gibbs sampling...")
for it in range(1, ITERATIONS + 1):
    for d, doc in enumerate(docs_wids):
        assignments = topic_assignments[d]
        for i, w in enumerate(doc):
            assignments[i] = sample_topic(d, w, assignments[i])

    # Progress report on the first sweep and every 100th sweep.
    if it == 1 or it % 100 == 0:
        print(f"Iteration {it}/{ITERATIONS}")

print("Gibbs sampling finished.")

# -----------------------------
# 6) Compute topic-word distributions (phi) and print topics.
# ----------------------------
# phi[k] maps word id -> smoothed probability of that word under topic k.
# Only words currently assigned to topic k at least once get an entry.
phi = []
for k in range(K):
    denom = topic_counts[k] + V_beta
    if denom == 0:
        phi.append({})
    else:
        phi.append({
            w_id: (count + BETA) / denom
            for w_id, count in topic_word_counts[k].items()
        })

print("\nTopics (top words with weights):")
for topic_number, topic_dist in enumerate(phi, start=1):
    if not topic_dist:
        print(f"\nTopic {topic_number}: (empty)")
        continue

    # Rank this topic's words by descending probability and keep the head.
    ranked = sorted(
        ((id2word[w_id], prob) for w_id, prob in topic_dist.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(f"\nTopic {topic_number}:")
    for word, prob in ranked[:TOP_N_WORDS]:
        print(f"{prob:.3f} {word}")

# ----------------------------
# 7) Prompt-friendly topic output for AI usage
# ----------------------------
print("\nTopics formatted for prompt input (comma-separated keywords):")
for topic_number, topic_dist in enumerate(phi, start=1):
    if not topic_dist:
        print(f"Topic {topic_number}: (empty)")
        continue
    # Eight highest-probability word ids for this topic, best first.
    best_ids = sorted(topic_dist, key=topic_dist.get, reverse=True)[:8]
    keywords = ', '.join(id2word[w_id] for w_id in best_ids)
    print(f"Topic {topic_number}: {keywords}")


Total tokens after cleaning: 38540
Sample tokens: ['document', 'txt', 'header', 'hdr', 'sgml', 'acceptance', 'datetime', 'accession', 'number', 'conform', 'submission', 'type', 's', 'public', 'document', 'count', 'fil', 'date', 'date', 'change', 'filer', 'data', 'conform', 'name', 'walt', 'disney', 'co', 'central', 'index', 'key', 'standard', 'industrial', 'classification', 'service', 'miscellaneou', 'amusement', 'recreation', 'irs', 'number', 'state']

Top 30 most common words:
note: 2916
exchange: 1205
due: 1064
any: 777
date: 609
amount: 586
original: 570
principal: 567
interest: 512
securitie: 489
offer: 380
act: 317
payment: 306
aggregate: 304
holder: 301
register: 292
other: 285
are: 257
disney: 240
indenture: 222
all: 208
registration: 201
serie: 196
tender: 195
redemption: 194
its: 193
prospectu: 181
transfer: 181
includ: 176
s: 164

ASCII Word Cloud:
note            ##################################################
exchange        #############################################