In [82]:
import glob
import os

file_list = glob.glob(os.path.join(os.getcwd(), "Full_Text", "*.txt"))

corpus2015 = []
corpus2016 = []
corpus2017 = []
corpus2018 = []
corpus2019 = []


for file_path in file_list:
    with open(file_path, encoding="utf8") as f_input:
        if "2015" in file_path:
            corpus2015.append(f_input.read())
        elif "2016" in file_path:
            corpus2016.append(f_input.read())
        elif "2017" in file_path:
            corpus2017.append(f_input.read())  
        elif "2018" in file_path:
            corpus2018.append(f_input.read())
        elif "2019" in file_path:
            corpus2019.append(f_input.read())
            
len(corpus2019)

21

In [85]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def clean_text(corpus):

    cleaned_text = []

    for item in corpus:
        item.lower()
        text_nonum = re.sub(r'\d+','', item)

        text_nopunc = text_nonum.translate(str.maketrans("","", string.punctuation))
        text_nowhitespace = text_nopunc.strip()

        tokens = word_tokenize(text_nowhitespace)

        stop_words = set(stopwords.words('english'))
        text_nostopwords = [i for i in tokens if not i in stop_words]
        
        pos_tagged = nltk.pos_tag(text_nostopwords)
        pos_keep = ["NN", "NNS","VB", "VBD", "VBN", "VBP", "VBZ"]
        pos_cleaned = [tup[0] for tup in pos_tagged if tup[1] in pos_keep]

        text_str = " ".join(pos_cleaned)

        lemmatizer = WordNetLemmatizer() 
        text_lemmatized = lemmatizer.lemmatize(text_str)

        cleaned_text.append(text_lemmatized)
        
        text_str = " ".join(cleaned_text)
    
    return text_str

## LDA on 2015 Documents

In [95]:
cleaned_2015 = clean_text(corpus2015)

In [96]:
text_2015 = sent_tokenize(cleaned_2015)

In [97]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation as LDA

count_vectorizer = CountVectorizer()
count_data = count_vectorizer.fit_transform(text_2015)

In [98]:
number_topics = 5
number_words = 10

lda = LDA(n_components=number_topics, n_jobs=-1, random_state =33)
lda.fit(count_data)

def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
print_topics(lda, count_vectorizer, number_words)


Topic #0:
employers battery contravention restrain restrict convenes liquor bear coordination mail

Topic #1:
employers battery contravention restrain restrict convenes liquor bear coordination mail

Topic #2:
employers battery contravention restrain restrict convenes liquor bear coordination mail

Topic #3:
employers battery contravention restrain restrict convenes liquor bear coordination mail

Topic #4:
income company sales cash walmart operations ended year stores share


## LDA on 2016 Documents

In [99]:
cleaned_2016 = clean_text(corpus2016)
text_2016 = sent_tokenize(cleaned_2016)

In [100]:
count_data_2016 = count_vectorizer.fit_transform(text_2016)

lda = LDA(n_components=number_topics, n_jobs=-1, random_state =33)
lda.fit(count_data_2016)
        
print_topics(lda, count_vectorizer, number_words)


Topic #0:
ipad retaliation missouri bid mission resumed misinterpretation families miscalculation bestinclass

Topic #1:
plan walmart income company cash sales shares stock performance year

Topic #2:
ipad retaliation missouri bid mission resumed misinterpretation families miscalculation bestinclass

Topic #3:
ipad retaliation missouri bid mission resumed misinterpretation families miscalculation bestinclass

Topic #4:
ipad retaliation missouri bid mission resumed misinterpretation families miscalculation bestinclass


## LDA on 2017 Documents

In [101]:
cleaned_2017 = clean_text(corpus2017)
text_2017 = sent_tokenize(cleaned_2017)

count_data_2017 = count_vectorizer.fit_transform(text_2017)

lda = LDA(n_components=number_topics, n_jobs=-1, random_state =33)
lda.fit(count_data_2017)
        
print_topics(lda, count_vectorizer, number_words)


Topic #0:
sizes rp rst monro ellis rudoplh monies chairmans embrace moneyback

Topic #1:
securities notes debt company interest date series prospectus amount walmart

Topic #2:
sizes rp rst monro ellis rudoplh monies chairmans embrace moneyback

Topic #3:
sizes rp rst monro ellis rudoplh monies chairmans embrace moneyback

Topic #4:
sizes rp rst monro ellis rudoplh monies chairmans embrace moneyback


## LDA on 2018 Documents

In [102]:
cleaned_2018 = clean_text(corpus2018)
text_2018 = sent_tokenize(cleaned_2018)

count_data_2018 = count_vectorizer.fit_transform(text_2018)

lda = LDA(n_components=number_topics, n_jobs=-1, random_state =33)
lda.fit(count_data_2018)
        
print_topics(lda, count_vectorizer, number_words)


Topic #0:
zynga port populace ponds pollutant daytoday dbiie pol poison dbiif

Topic #1:
company notes securities date section stock plan interest shares walmart

Topic #2:
zynga port populace ponds pollutant daytoday dbiie pol poison dbiif

Topic #3:
zynga port populace ponds pollutant daytoday dbiie pol poison dbiif

Topic #4:
zynga port populace ponds pollutant daytoday dbiie pol poison dbiif


## LDA on 2019 Documents

In [103]:
cleaned_2019 = clean_text(corpus2019)
text_2019 = sent_tokenize(cleaned_2019)

count_data_2019 = count_vectorizer.fit_transform(text_2019)

lda = LDA(n_components=number_topics, n_jobs=-1, random_state =33)
lda.fit(count_data_2019)
        
print_topics(lda, count_vectorizer, number_words)


Topic #0:
lastmile salaried oo sabotage sac consultations construe saco gratia construct

Topic #1:
lastmile salaried oo sabotage sac consultations construe saco gratia construct

Topic #2:
securities debt company notes interest prospectus series income date walmart

Topic #3:
lastmile salaried oo sabotage sac consultations construe saco gratia construct

Topic #4:
lastmile salaried oo sabotage sac consultations construe saco gratia construct
