In [7]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# ============================================================
# 1. Load the WRDS combined text file
# ============================================================

FILENAME = "wrds_text_data.txt"   # <-- this is your file!

with open(FILENAME, "r", errors="ignore") as f:
    text_data = f.read()

print("Loaded:", FILENAME)
print("Length:", len(text_data), "characters")


# ============================================================
# 2. Clean the full text
# ============================================================

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

cleaned_text = clean_text(text_data)


# ============================================================
# 3. Vectorize ONE giant document
# ============================================================

vectorizer = CountVectorizer(
    max_df=1.0,
    min_df=1,               # important for 1-document LDA
    stop_words="english",
)

doc_matrix = vectorizer.fit_transform([cleaned_text])

print("Vocabulary size:", len(vectorizer.get_feature_names_out()))


# ============================================================
# 4. Fit LDA model
# ============================================================

N_TOPICS = 10

lda_model = LatentDirichletAllocation(
    n_components=N_TOPICS,
    max_iter=20,
    learning_method="online",
    random_state=42
)

lda_model.fit(doc_matrix)

feature_names = vectorizer.get_feature_names_out()


# ============================================================
# 5. Print topics
# ============================================================

def print_topics(model, feature_names, n_top_words=15):
    for idx, topic in enumerate(model.components_):
        print(f"\n===== Topic {idx} =====")
        top_indices = topic.argsort()[-n_top_words:][::-1]
        for i in top_indices:
            print(feature_names[i], end=", ")
        print("\n")

print_topics(lda_model, feature_names)


Loaded: wrds_text_data.txt
Length: 35917232 characters
Vocabulary size: 27176

===== Topic 0 =====
font, td, style, px, pt, nbsp, size, div, family, valign, align, padding, roman, margin, times, 


===== Topic 1 =====
font, td, style, size, pt, nbsp, family, px, div, valign, padding, align, left, text, roman, 


===== Topic 2 =====
enormous, distributions, affecting, convenience, workiva, style, lelewqzykmmeo, flj, lrpjq, oss, eiq, avoidability, fbr, usxl, family, 


===== Topic 3 =====
font, td, style, pt, size, px, nbsp, valign, family, padding, align, div, times, margin, new, 


===== Topic 4 =====
font, td, size, style, pt, nbsp, valign, px, div, family, padding, align, new, roman, tr, 


===== Topic 5 =====
font, td, style, pt, px, size, family, valign, nbsp, div, padding, align, roman, new, text, 


===== Topic 6 =====
td, font, size, style, px, nbsp, pt, align, valign, div, family, padding, times, left, text, 


===== Topic 7 =====
font, td, pt, size, style, nbsp, px, family, va