In [1]:
import os
from utils.pre_processor import processor_use_lemma_plus as processor
from utils.const.stopwords import STOPWORDS
from gensim.models import LdaModel
from gensim import corpora
import gensim
from utils.const import paths
from nltk.tokenize import word_tokenize
from utils.const.stopwords import html_stop_words

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import sys
from typing import List

In [2]:
def split_documents(documents: List[str]) -> List[List[str]]:
    """Tokenize each document on whitespace.

    Args:
        documents: Documents as plain strings.

    Returns:
        One list of whitespace-separated tokens per input document, in the
        same order as ``documents``.
    """
    tokenized = []
    for text in documents:
        tokenized.append(text.split())
    return tokenized


def read_documents_from_folder(folder_path: str, stop_words=None, encoding: str = "utf-8") -> List[str]:
    """Read every regular file in a folder and return its lines with stop words removed.

    Each line of each file becomes one document: the line is tokenized with
    ``word_tokenize``, tokens whose lower-cased form is a stop word are
    dropped, and the remaining tokens are re-joined with single spaces.

    Args:
        folder_path: Directory whose immediate regular files are read.
            Sub-directories are skipped.
        stop_words: Optional collection of lower-case words to remove.
            ``None`` (the default) disables filtering.
        encoding: Text encoding used to open each file.

    Returns:
        The cleaned lines of all files, in sorted-filename order and, within
        a file, in line order.
    """
    print("Reading documents")
    # None means "filter nothing"; a frozenset also gives O(1) membership tests
    # regardless of the collection type the caller passes in.
    excluded = frozenset(stop_words) if stop_words is not None else frozenset()
    documents = []
    # os.listdir() returns entries in arbitrary order; sort so the resulting
    # corpus order is the same on every run and filesystem.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        print("Reading year: {}".format(file_path))
        with open(file_path, 'r', encoding=encoding) as file:
            for line in file:
                words = word_tokenize(line)
                documents.append(
                    ' '.join(word for word in words if word.lower() not in excluded))
    return documents


def find_lda(texts: List[str], n_topics: int = 20, save: bool = True, save_path: str = None,
             random_state: int = None) -> "LdaModel":
    """Train a gensim LDA topic model on whitespace-tokenized documents.

    The documents are split with ``split_documents``, turned into a gensim
    dictionary and bag-of-words corpus, and used to train an ``LdaModel``
    with 10 passes. The topics returned by ``print_topics(num_words=10)``
    are printed.

    Args:
        texts: Documents as plain strings.
        n_topics: Number of topics to fit.
        save: Whether to persist the trained model with ``LdaModel.save``.
        save_path: Destination passed to ``LdaModel.save``; required when
            ``save`` is true.
        random_state: Optional seed forwarded to ``LdaModel`` so that runs
            can be reproduced. ``None`` keeps gensim's default behaviour.

    Returns:
        The trained ``LdaModel``.

    Raises:
        ValueError: If ``save`` is true but ``save_path`` is ``None``.
    """
    print("Finding LDA")
    # Validate before training so a missing path does not waste a full fit.
    if save and save_path is None:
        raise ValueError("save_path must be provided when save=True")

    tokenized = split_documents(texts)
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

    ldamodel = LdaModel(corpus, num_topics=n_topics,
                        id2word=dictionary, passes=10,
                        random_state=random_state)

    if save:
        ldamodel.save(save_path)
    for topic in ldamodel.print_topics(num_words=10):
        print(topic)

    return ldamodel

In [3]:
total_folder_path = paths.all_data
# Only sub-directories are treated as per-year folders: a stray file at this
# level (e.g. macOS ".DS_Store") would make os.listdir() fail inside
# read_documents_from_folder. Sorted so the corpus order is stable across runs.
year_folder = sorted(
    entry for entry in os.listdir(total_folder_path)
    if os.path.isdir(os.path.join(total_folder_path, entry))
)
total_documents = []
for year in year_folder:
    folder_path = os.path.join(total_folder_path, year)
    documents = read_documents_from_folder(folder_path, html_stop_words)
    total_documents.extend(documents)
# Train on every cleaned line from every year and persist the model.
lda_model = find_lda(total_documents, 20, save=True,
            save_path=paths.lda_model_save_path)

In [4]:
# Disabled smoke test: runs the pipeline on a single filing. The absolute path below is machine-specific.
# file_path = "/Users/weichentao/Documents/USC/2023fall/540/project/select_valuable/valuable/cleaned/2017/1750_000104746917004528_a2232622z10-k.htm.txt"
# with open(file_path, 'r') as file:
#     lines = file.readlines()
#     cleaned_text = processor(lines, STOPWORDS)
#     lda_model = find_lda(cleaned_text, 20, save=True,
#                 save_path=paths.lda_model_save_path)

(0, '0.057*"value" + 0.030*"carry" + 0.026*"impairment" + 0.024*"asset" + 0.022*"unit" + 0.020*"charge" + 0.019*"service" + 0.019*"complete" + 0.019*"report" + 0.016*"goodwill"')
(1, '0.047*"service" + 0.020*"customer" + 0.014*"inventory" + 0.014*"delivery" + 0.014*"condition" + 0.013*"support" + 0.009*"operation" + 0.007*"reduction" + 0.007*"future" + 0.007*"repair"')
(2, '0.043*"may" + 0.027*"market" + 0.025*"capital" + 0.023*"credit" + 0.021*"significant" + 0.015*"company" + 0.015*"risk" + 0.012*"control" + 0.012*"include" + 0.012*"defense"')
(3, '0.033*"customer" + 0.026*"nbsp" + 0.023*"lease" + 0.019*"term" + 0.018*"certain" + 0.017*"include" + 0.017*"defense" + 0.016*"future" + 0.016*"service" + 0.015*"product"')
(4, '0.033*"business" + 0.028*"aircraft" + 0.027*"service" + 0.023*"product" + 0.023*"also" + 0.016*"segment" + 0.015*"sale" + 0.015*"nbsp" + 0.015*"fiscal" + 0.014*"cost"')
(5, '0.029*"service" + 0.025*"also" + 0.023*"system" + 0.020*"provide" + 0.017*"design" + 0.015*"