In [3]:
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np


# 1. Load your financial text
filename = "0000320187-25-000053.txt"
with open(filename, "r", encoding="utf-8") as f:
    text_data = f.read()



# 2. Basic text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = text.split()
    return " ".join(tokens)


cleaned_text = clean_text(text_data)



# 3. Load your financial keywords (from exmaples file: topics_v1, topics_v2)
def load_topic_words(path):
    words = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                words.append(line.lower())
    return words

topic_v1 = load_topic_words("topics_v1.txt")
topic_v2 = load_topic_words("topics_v2.txt")
financial_seed_words = list(set(topic_v1 + topic_v2))

print(f"Loaded {len(financial_seed_words)} financial keywords.")


# =========================================
# 4. Vectorize the document
# =========================================
vectorizer = CountVectorizer(
    max_df=1.0,
    min_df=1,
    stop_words="english",
)

doc_matrix = vectorizer.fit_transform([cleaned_text])



# 5. Train small LDA model
n_topics = 8  # change as needed

lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=20,
    learning_method="online",
    random_state=42
)

lda_model.fit(doc_matrix)

feature_names = vectorizer.get_feature_names_out()



# 6. Print topics (similar to gensim output)
def print_topics(model, feature_names, n_top_words=10):
    for idx, topic in enumerate(model.components_):
        print(f"\nTopic {idx}:")
        top_indices = topic.argsort()[-n_top_words:][::-1]
        for i in top_indices:
            weight = topic[i] / topic.sum()
            print(f"  {weight:.3f} * '{feature_names[i]}'")


print_topics(lda_model, feature_names)


Loaded 9 financial keywords.

Topic 0:
  0.003 * 'type'
  0.003 * 'entityfilenumber'
  0.003 * 'prefix'
  0.003 * 'nke'
  0.003 * 'dei'
  0.003 * 'sec'
  0.003 * 'sequence'
  0.003 * 'conformed'
  0.003 * 'pre'
  0.003 * 'xbrl'

Topic 1:
  0.003 * 'type'
  0.003 * 'balance'
  0.003 * 'dei'
  0.003 * 'securities'
  0.003 * 'number'
  0.003 * 'role'
  0.003 * 'xbrli'
  0.003 * 'film'
  0.003 * 'data'
  0.003 * 'htm'

Topic 2:
  0.003 * 'type'
  0.003 * 'details'
  0.003 * 'period'
  0.003 * 'false'
  0.003 * 'end'
  0.003 * 'beaverton'
  0.003 * 'report'
  0.003 * 'dei'
  0.003 * 'extension'
  0.003 * 'officer'

Topic 3:
  0.003 * 'voluntarily'
  0.003 * 'entityaddressstateorprovince'
  0.003 * 'title'
  0.003 * 'irs'
  0.003 * 'footwear'
  0.003 * 'assigned'
  0.003 * 'tradingsymbolitemtype'
  0.003 * 'limited'
  0.003 * 'available'
  0.003 * 'identification'

Topic 4:
  0.052 * 'type'
  0.033 * 'dei'
  0.027 * 'document'
  0.021 * 'definition'
  0.019 * 'entity'
  0.017 * 'number'
  0.