In [4]:
import re
import string
import os
import pickle
import spacy
spacy.prefer_gpu()

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS



In [5]:
# creating a list of the ciks we're interested in
f = open("final_ciks.txt", "r")
ciks = f.read().strip('[]').split(', ')
for cik in ciks:
    cik = str(cik).strip("'")
f.close()

In [6]:
ciks[:10]

["'1209028'",
 "'1069183'",
 "'825313'",
 "'1144980'",
 "'1800'",
 "'935036'",
 "'1113232'",
 "'796343'",
 "'7084'",
 "'1101215'"]

## Testing the workflow on a small subset of the texts

In [6]:
merged_cik_texts = []
for ind, cik in enumerate(tqdm(ciks)):
    # score_path = 'final_10ks/' + cik.strip("'") + '/rawtext/' + cik.strip("'") + '_sim_scores.csv'
    text_path = 'final_10ks/' + cik.strip("'") + '/merged_texts.p'
    with open(text_path, "rb" ) as f:
        cik_texts = pickle.load(f)
        merged_cik_texts += cik_texts
print(f'The total number of documents is: {len(merged_cik_texts)}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=303.0), HTML(value='')))




In [11]:
print(merged_cik_texts[0][:400])

 10-K 1 d10k.htm FORM 10-K  Form 10-K Table of Contents     UNITED STATES  SECURITIES AND EXCHANGE COMMISSION  Washington, D.C. 20549      FORM 10-K      (Mark One)    x ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934  For the fiscal year ended December 31, 2010  OR       ̈ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934


In [23]:
newtext = []
for entry in tqdm(merged_cik_texts[:3]):
    # getting rid of the terms seperately uses less memory
    entry = entry.replace('Form 10-K','')
    entry = entry.replace('FORM 10-K','')
    entry = entry.replace('10-K','')
    entry = re.sub(r'\d+', '', entry)#remove numbers
    entry = "".join([char.lower() for char in entry if char not in string.punctuation])
    #^^remove punctuation and make lowercase
    entry = entry.replace('table of contents','')
    entry = entry.replace('securities and exchange commission','')
    entry = entry.replace('annual report','')
    entry = entry.replace('real estate','realestate')
    entry = entry.replace('mayyet','')
    entry = "".join([char for char in entry if not char.isdigit()])
    #^^remove numbers
    entry = entry.replace('million','')
    entry = re.sub('\s+', ' ', entry).strip() #remove doublespaces
    newtext.append(entry)
print(newtext[0][:400])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))


dkhtm united states washington dc mark one x pursuant to section or d of the securities exchange act of for the fiscal year ended december or ̈ transition report pursuant to section or d of the securities exchange act of for the transition period from to commission file number arlington asset investment corp exact name of registrant as specified in its charter virginia state or other jurisdiction 


In [24]:
 
months = ['january','february','march','april','june','july','august','september','october','november','december']
stop_words = text.ENGLISH_STOP_WORDS.union(months + ['company', 'financial'])
vect=CountVectorizer(ngram_range=(1,1),stop_words=stop_words)
fin=vect.fit_transform(newtext)
#merged texts is the most recent one from the above generate files
pd.DataFrame(fin.toarray(),columns=vect.get_feature_names()).head()

Unnamed: 0,aa,aaa,aaaaa,aaarated,aada,aaic,aamount,ab,abilities,ability,...,yeartoyear,yes,yetpurchased,yield,yieldcost,yields,york,yorkbased,zero,zip
0,0,6,0,2,2,3,3,1,1,57,...,0,5,0,22,2,7,4,0,1,1
1,2,7,0,1,2,3,0,0,1,70,...,0,5,0,37,0,7,8,0,1,1
2,0,7,1,2,2,0,2,1,3,75,...,1,4,2,22,0,6,13,2,7,0


In [25]:
lda=LatentDirichletAllocation(n_components=5)
da_lda=lda.fit_transform(fin)
da_lda

array([[9.69794134e-06, 9.69794134e-06, 9.69794134e-06, 9.99960942e-01,
        9.96406167e-06],
       [8.07021700e-06, 8.07021700e-06, 8.07021700e-06, 9.99967508e-01,
        8.28103706e-06],
       [5.10535647e-06, 5.10535647e-06, 5.10535647e-06, 1.25419076e-03,
        9.98730493e-01]])

In [26]:
sorting=np.argsort(lda.components_)[:,::-1]
features=np.array(vect.get_feature_names())
import mglearn
mglearn.tools.print_topics(topics=range(5), feature_names=features,sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
anticipates   anticipates   anticipates   mbs           securities    
expresses     expresses     expresses     securities    investment    
reposition    reposition    reposition    value         capital       
unduly        unduly        unduly        income        mortgage      
single        single        single        net           value         
took          took          took          stock         fbr           
task          task          task          assets        market        
enforce       enforce       enforce       cash          income        
prohibit      prohibit      prohibit      market        net           
malfunctioningmalfunctioningmalfunctioninginvestment    loans         




## Getting the aggregate list of documents to do topic modelling on

In [7]:
try:
    hahahaha #don't run this time
    with open("combined_texts.p", "rb" ) as f:
        newtext = pickle.load(f)
except:
    merged_cik_texts = []
    for ind, cik in enumerate(tqdm(ciks)):
        # score_path = 'final_10ks/' + cik.strip("'") + '/rawtext/' + cik.strip("'") + '_sim_scores.csv'
        text_path = 'final_10ks/' + cik.strip("'") + '/merged_texts.p'
        with open(text_path, "rb" ) as f:
            cik_texts = pickle.load(f)
            merged_cik_texts += cik_texts
    print(f'The total number of documents is: {len(merged_cik_texts)}')
    newtext = []
    for entry in tqdm(merged_cik_texts):
        # getting rid of the terms seperately uses less memory
        entry = entry.replace('Form 10-K','')
        entry = entry.replace('FORM 10-K','')
        entry = entry.replace('10-K','')
        entry = re.sub(r'\d+', '', entry)#remove numbers
        entry = "".join([char.lower() for char in entry if char not in string.punctuation])
        #^^remove punctuation and make lowercase
        entry = "".join([char for char in entry if not char.isdigit()])
        #^^remove numbers
        entry = entry.replace('table of contents','')
        entry = entry.replace('securities and exchange commission','')
        entry = entry.replace('annual report','')
        entry = entry.replace('loans','loan')
        entry = entry.replace('shares','share')
        entry = entry.replace('companys','company')
        entry = entry.replace('million','')
        entry = re.sub('\s+', ' ', entry).strip() #remove doublespaces
        newtext.append(entry)
    print(newtext[0][:400])
    with open("combined_texts.p", "wb" ) as f:
        pickle.dump(newtext, f)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=303.0), HTML(value='')))


The total number of documents is: 5619


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5619.0), HTML(value='')))


dkhtm united states washington dc mark one x pursuant to section or d of the securities exchange act of for the fiscal year ended december or ̈ transition report pursuant to section or d of the securities exchange act of for the transition period from to commission file number arlington asset investment corp exact name of registrant as specified in its charter virginia state or other jurisdiction 


In [8]:
newtext[1][:50].split()

['formkhtm', 'arlington', 'asset', 'investment', 'corp', 'formkhtm']

In [9]:
months = ['january','february','march','april','june','july','august','september','october','november','december']
freq_words = ['capital','management','creditors','brokerage','operating','product','products','business','value',
              'fair','statements','assets','operations','including','tax','consolidated','company', 'financial',
              'stock','cash','net','year', 'years','market','income','certain','related','costs','sales', 'new',
              'share', 'approximately', 'accounting', 'common', 'percent']
stop_words = text.ENGLISH_STOP_WORDS.union(months + freq_words)

nlp = spacy.blank('en')
lemma_docs = []
doc_skip_count = 0
for document in tqdm(newtext):
    try:
        lemma_text = []
        lemma_doc = nlp(document)
        lemma_words = [token.lemma_.strip() for token in lemma_doc]
        for lemma_word in lemma_words:
            if (len(lemma_word) > 2):
                lemma_text.append(lemma_word)
        lemma_docs.append(' '.join(lemma_text))
    except:
        doc_skip_count += 1
        print(f'The document skip count is: {doc_skip_count}', end = '\r')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5619.0), HTML(value='')))

The document skip count is: 5


In [10]:
vect=CountVectorizer(ngram_range=(1,1),stop_words=stop_words)
final_corpus=vect.fit_transform(lemma_docs)
#merged texts is the most recent one from the above generate files
pd.DataFrame(final_corpus.toarray(),columns=vect.get_feature_names()).head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaarated,aaaaamortgagebacked,aaaaaobligations,aaaaaprerefunded,aaaaarated,...,μgkg,μgl,μl,μscm,上a,公o,司i,技z术o,有l限a,海c斯z丹o赛u
0,0,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
lda=LatentDirichletAllocation(n_components=6)
da_lda=lda.fit_transform(final_corpus)
da_lda

array([[9.25268683e-06, 9.23162132e-01, 9.24886725e-06, 9.24058972e-06,
        9.24485334e-06, 7.68008815e-02],
       [4.75151838e-03, 9.20341950e-01, 7.65297986e-06, 7.64772726e-06,
        7.65315594e-06, 7.48835780e-02],
       [6.04883900e-02, 8.14814712e-01, 4.84116950e-06, 4.84314957e-06,
        4.84057328e-06, 1.24682373e-01],
       ...,
       [4.70946163e-06, 9.99269638e-01, 4.70976002e-06, 5.25025103e-04,
        4.70695415e-06, 1.91211058e-04],
       [4.29888770e-06, 9.95434592e-01, 1.32722929e-03, 1.08099217e-03,
        4.29652410e-06, 2.14859141e-03],
       [5.48926489e-02, 8.84989063e-01, 4.34009739e-02, 7.17412979e-05,
        7.18256029e-05, 1.65737470e-02]])

In [12]:
sorting=np.argsort(lda.components_)[:,::-1]
features=np.array(vect.get_feature_names())
import mglearn
mglearn.tools.print_topics(topics=range(6), feature_names=features,sorting=sorting, topics_per_chunk=6, n_words=20)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       
--------      --------      --------      --------      --------      --------      
services      loan          share         gas           development   properties    
share         securities    fiscal        energy        share         share         
revenue       bank          plan          natural       agreement     ended         
ended         credit        ended         power         common        property      
customers     losses        credit        electric      clinical      common        
common        rate          rate          rate          future        partnership   
results       total         results       plan          research      loan          
fiscal        risk          based         cost          ended         real          
based         investment    notes         approximately health        agreement     
plan          share         accounting    oil           results  

In [13]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
#dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash = pyLDAvis.sklearn.prepare(lda, final_corpus, vect, mds='tsne')
#lda is good, data_vectorized
dash