# Topic Modeling Text from 10ks

In [3]:
import re
import string
import os
import pickle
import mglearn
import spacy
spacy.prefer_gpu()

import pandas as pd
import numpy as np

import pyLDAvis.sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from tqdm.notebook import tqdm



In [5]:
# creating a list of the ciks we're interested in
f = open("final_ciks.txt", "r")
ciks = f.read().strip('[]').split(', ')
for cik in ciks:
    cik = str(cik).strip("'")
f.close()
ciks[:10]

["'1209028'",
 "'1069183'",
 "'825313'",
 "'1144980'",
 "'1800'",
 "'935036'",
 "'1113232'",
 "'796343'",
 "'7084'",
 "'1101215'"]

## Loading/Aggregating the Documents

In [13]:
try:
    hahahaha #don't run
    with open("combined_texts.p", "rb" ) as f:
        newtext = pickle.load(f)
except:
    merged_cik_texts = []
    for ind, cik in enumerate(tqdm(ciks)):
        # score_path = 'final_10ks/' + cik.strip("'") + '/rawtext/' + cik.strip("'") + '_sim_scores.csv'
        text_path = 'final_10ks/' + cik.strip("'") + '/merged_texts.p'
        with open(text_path, "rb" ) as f:
            cik_texts = pickle.load(f)
            merged_cik_texts += cik_texts
    print(f'The total number of documents is: {len(merged_cik_texts)}')
    newtext = []
    for entry in tqdm(merged_cik_texts):
        # getting rid of the terms seperately uses less memory
        entry = entry.replace('Form 10-K','')
        entry = entry.replace('FORM 10-K','')
        entry = entry.replace('10-K','')
        entry = re.sub(r'\d+', '', entry)#remove numbers
        entry = "".join([char.lower() for char in entry if char not in string.punctuation])
        #^^remove punctuation and make lowercase
        entry = "".join([char for char in entry if (not char.isdigit()) or (char == "'")])
        #^^remove numbers
        entry = entry.replace('table of contents','')
        entry = entry.replace('securities and exchange commission','')
        entry = entry.replace('annual report','')
        entry = entry.replace('loans','loan')
        entry = entry.replace('shares','share')
        entry = entry.replace('real estate','realestate')
        # entry = entry.replace('companys','company')
        #shouldn't be necessary if apostrophes are kept
        entry = entry.replace('million','')
        entry = re.sub('\s+', ' ', entry).strip() #remove doublespaces
        newtext.append(entry)
    with open("combined_texts.p", "wb" ) as f:
        pickle.dump(newtext, f)
    print(newtext[0][:400])

  and should_run_async(code)
  entry = re.sub('\s+', ' ', entry).strip() #remove doublespaces


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=303.0), HTML(value='')))


The total number of documents is: 5619


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5619.0), HTML(value='')))




## Lemmatizing, Removing Stop Words and Vectorizing

In [8]:
nlp = spacy.blank('en')
lemma_docs = []
doc_skip_count = 0
for document in tqdm(newtext):
    try:
        lemma_text = []
        lemma_doc = nlp(document)
        lemma_words = [token.lemma_.strip() for token in lemma_doc]
        for lemma_word in lemma_words:
            if (len(lemma_word) > 2):
                lemma_text.append(lemma_word)
        lemma_docs.append(' '.join(lemma_text))
    except:
        doc_skip_count += 1
        print(f'The document skip count is: {doc_skip_count}', end = '\r')
        
months = ['january','february','march','april','june','july','august','september','october','november','december']
freq_words = ['capital','management','creditors','brokerage','operating','product','products','business','value',
              'fair','statements','assets','operations','including','tax','consolidated','company', 'financial',
              'stock','cash','net','year', 'years','market','income','certain','related','costs','sales', 'new',
              'share', 'approximately', 'accounting', 'common', 'percent']
stop_words = text.ENGLISH_STOP_WORDS.union(months + freq_words)

vect=CountVectorizer(ngram_range=(1,1),stop_words=stop_words)
vectorized_corpus=vect.fit_transform(lemma_docs)
pd.DataFrame(final_corpus.toarray(),columns=vect.get_feature_names()).head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5619.0), HTML(value='')))

The document skip count is: 5


## Performing LDA and Visualizing the Results

In [10]:
lda=LatentDirichletAllocation(n_components=6)
lda.fit_transform(final_corpus)

array([[9.45686205e-06, 1.69000528e-02, 9.45964372e-06, 9.45245309e-06,
        9.68861284e-04, 9.82102717e-01],
       [1.40698002e-04, 1.83590385e-02, 9.68342057e-05, 7.77942074e-06,
        1.58700399e-02, 9.65525610e-01],
       [4.64137344e-04, 7.82188703e-02, 2.85209801e-04, 4.91869040e-06,
        6.38122304e-02, 8.57214633e-01],
       ...,
       [4.80914179e-06, 4.80962003e-06, 4.81657837e-06, 1.47466312e-04,
        4.81110070e-06, 9.99833287e-01],
       [4.39284309e-06, 9.68558478e-04, 1.62975719e-04, 3.13684756e-04,
        1.86430962e-03, 9.96686079e-01],
       [7.34412944e-05, 7.34660116e-05, 9.62999513e-03, 7.34459154e-05,
        1.16959929e-01, 8.73189722e-01]])

In [11]:
sorting=np.argsort(lda.components_)[:,::-1]
features=np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=range(6), feature_names=features,sorting=sorting, topics_per_chunk=6, n_words=20)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       
--------      --------      --------      --------      --------      --------      
development   properties    health        energy        fiscal        loan          
future        property      services      gas           ended         securities    
oil           ended         agreement     rate          plan          bank          
agreement     agreement     care          plan          services      credit        
research      partnership   ended         cost          results       losses        
ended         debt          notes         power         based         rate          
clinical      rate          plan          electric      revenue       total         
revenue       notes         based         total         credit        risk          
price         lease         information   ended         information   investment    
gas           filed         healthcare    credit        customers

In [12]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, vectorized_corpus, vect, mds='tsne')
dash