In [7]:
import re
import string
import os
import pickle

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS



In [2]:
# creating a list of the ciks we're interested in
f = open("final_ciks.txt", "r")
ciks = f.read().strip('[]').split(', ')
for cik in ciks:
    cik = str(cik).strip("'")
f.close()

In [3]:
ciks[:10]

["'1209028'",
 "'1069183'",
 "'825313'",
 "'1144980'",
 "'1800'",
 "'935036'",
 "'1113232'",
 "'796343'",
 "'7084'",
 "'1101215'"]

In [6]:
# now to create an aggregate text body...
for ind, cik in enumerate(tqdm(ciks)):
    path = '10ks/' + cik.strip("'") + '/rawtext'
    if ind < 10:
        print('Processing contents of ' + path + ' :')
    f_list = os.listdir(path)
    merged_texts = []
    for entry, f_name in enumerate(f_list):
        if entry < 2 and ind < 5:
            print('\t' + str(f_name))
        f = open(path + '/' + f_name, "r")
        f = f.read()
        #print(f'length of doc: {len(f)}')
        merged_texts.append(f)
        #print(f'length of merged_doc: {len(merged_texts)}')
    if ind < 5:
        print('\t...')
    path = '10ks/' + cik.strip("'")
    if ind < 5:
        print('\tcreated: ' + path + "/merged_texts.p")
    pickle.dump(merged_texts, open(path + '/merged_texts.p', "wb" ) )
#     merged_doc_file = open(path + "/merged_texts.txt","w")
#     merged_doc_file.write(merged_texts)
#     merged_doc_file.close()
print('...')

HBox(children=(FloatProgress(value=0.0, max=303.0), HTML(value='')))

Processing contents of 10ks/1209028/rawtext :
	1209028_2011-02-11.txt
	1209028_2012-02-23.txt
	...
	created: 10ks/1209028/merged_texts.txt
Processing contents of 10ks/1069183/rawtext :
	1069183_2017-03-06.txt
	1069183_2011-03-14.txt
	...
	created: 10ks/1069183/merged_texts.txt
Processing contents of 10ks/825313/rawtext :
	825313_2016-02-11.txt
	825313_2003-03-27.txt
	...
	created: 10ks/825313/merged_texts.txt
Processing contents of 10ks/1144980/rawtext :
	1144980_2011-02-28.txt
	1144980_2008-02-29.txt
	...
	created: 10ks/1144980/merged_texts.txt
Processing contents of 10ks/1800/rawtext :
	1800_2008-02-19.txt
	1800_2011-02-18.txt
	...
	created: 10ks/1800/merged_texts.txt
Processing contents of 10ks/935036/rawtext :
	935036_2000-12-29.txt
	935036_2003-12-23.txt
	...
	created: 10ks/935036/merged_texts.txt
Processing contents of 10ks/1113232/rawtext :
	1113232_2010-03-15.txt
	1113232_2002-03-12.txt
	...
	created: 10ks/1113232/merged_texts.txt
Processing contents of 10ks/796343/rawtext :
	7

## Testing the workflow on a small subset of the texts

In [27]:
newtext = []
for entry in tqdm(merged_texts):
    # getting rid of the terms seperately uses less memory
    entry = entry.replace('Form 10-K','')
    entry = entry.replace('FORM 10-K','')
    entry = entry.replace('10-K','')
    entry = re.sub(r'\d+', '', entry)#remove numbers
    entry = "".join([char.lower() for char in entry if char not in string.punctuation])
    #^^remove punctuation and make lowercase
    entry = "".join([char for char in entry if not char.isdigit()])
    #^^remove numbers
    entry = "".join([char for char in entry if not char.isdigit()])
    #^^remove numbers
    entry = entry.replace('million','')
    entry = re.sub('\s+', ' ', entry).strip() #remove doublespaces
    newtext.append(entry)

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [40]:
from sklearn.feature_extraction import text 
months = ['january','february','march','april','june','july','august','september','october','november','december']
stop_words = text.ENGLISH_STOP_WORDS.union(months + ['company', 'financial'])
vect=CountVectorizer(ngram_range=(1,1),stop_words=stop_words)
fin=vect.fit_transform(newtext)
#merged texts is the most recent one from the above generate files
pd.DataFrame(fin.toarray(),columns=vect.get_feature_names()).head()

Unnamed: 0,aa,aaa,aaaaarated,aaarated,aarated,ab,abbreviationsaclallowance,abbreviationsascaccounting,abcp,abetted,...,zion,zionk,zions,zionsbank,zionw,zionxkhtm,zionz,zip,zmfu,zmsc
0,0,0,0,0,0,0,1,0,0,0,...,1,0,122,0,0,1,0,1,0,0
1,0,0,0,0,0,0,1,0,0,0,...,1,0,127,0,0,1,0,1,0,1
2,11,8,0,10,1,0,0,0,1,0,...,1,0,230,0,0,0,0,1,0,7
3,0,0,0,0,0,0,1,0,0,1,...,1,0,138,0,4,1,3,1,0,0
4,2,1,0,4,0,0,0,0,0,0,...,1,0,203,0,0,0,0,1,0,0


In [41]:
lda=LatentDirichletAllocation(n_components=5)
da_lda=lda.fit_transform(fin)
da_lda

array([[9.98117495e-01, 1.22809981e-03, 5.50534695e-06, 6.43277716e-04,
        5.62185245e-06],
       [9.96979596e-01, 2.75999102e-03, 5.69040728e-06, 2.48911186e-04,
        5.81182077e-06],
       [6.16682087e-06, 3.72186013e-05, 6.01336270e-06, 9.99944478e-01,
        6.12336050e-06],
       [9.97533436e-01, 1.48757181e-03, 5.59244593e-06, 9.67685094e-04,
        5.71419809e-06],
       [6.93913492e-06, 6.94507288e-06, 6.76274153e-06, 9.99972483e-01,
        6.87049130e-06],
       [8.51298712e-01, 7.13314669e-05, 6.98082674e-05, 1.48489067e-01,
        7.10806712e-05],
       [9.60359440e-01, 3.87665756e-02, 5.84160268e-06, 8.62172582e-04,
        5.96979628e-06],
       [4.73148644e-01, 5.26834882e-01, 5.39708229e-06, 5.57523393e-06,
        5.50161704e-06],
       [9.98544789e-01, 7.63992186e-05, 7.47990476e-05, 1.22835569e-03,
        7.56568744e-05],
       [6.87625392e-01, 3.11806496e-01, 5.47949544e-06, 5.57040540e-04,
        5.59177291e-06],
       [6.48152181e-01, 6.3803

In [42]:
sorting=np.argsort(lda.components_)[:,::-1]
features=np.array(vect.get_feature_names())
import mglearn
mglearn.tools.print_topics(topics=range(5), feature_names=features,sorting=sorting, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
capital       securities    availablefor  securities    securities    
bank          loans         excerpts      net           net           
loans         stock         resultsof     income        losses        
securities    value         andinterest   loans         loans         
loan          net           andfinancial  value         fair          
value         income        analytic      loan          loss          
risk          fair          staffs        bank          value         
stock         rate          question      zions         assets        
credit        loan          think         assets        total         
income        credit        asked         credit        loan          




## Getting the aggregate list of documents to do topic modelling on

In [44]:
newtext = []
for ind, cik in enumerate(tqdm(ciks)):
    path = '10ks/' + cik.strip("'") + '/merged_texts.p'
    cik_docs = pickle.load(open(path,"rb"))
    for entry in cik_docs:
        # getting rid of the terms seperately uses less memory
        entry = entry.replace('Form 10-K','')
        entry = entry.replace('FORM 10-K','')
        entry = entry.replace('10-K','')
        entry = re.sub(r'\d+', '', entry)#remove numbers
        entry = "".join([char.lower() for char in entry if char not in string.punctuation])
        #^^remove punctuation and make lowercase
        entry = "".join([char for char in entry if not char.isdigit()])
        #^^remove numbers
        entry = "".join([char for char in entry if not char.isdigit()])
        #^^remove numbers
        entry = entry.replace('million','')
        entry = re.sub('\s+', ' ', entry).strip() #remove doublespaces
        newtext.append(entry)
print(f'the number of documents is {len(newtext)}')

HBox(children=(FloatProgress(value=0.0, max=303.0), HTML(value='')))


the number of documents is 5619


In [70]:
print(u"Apples and oranges are similar. Boots and hippos aren't.")

Apples and oranges are similar. Boots and hippos aren't.


In [71]:
import spacy
spacy.prefer_gpu()
nlp = spacy.load('en')
lemma_docs = []

for doc in tqdm(newtext):
    lemma_text = []
    #for word in nlp(doc.split()):
    for word in nlp(doc):
        lemma_text.append(word.lemma_)
    lemma_docs.append(' '.join(lemma_text))

HBox(children=(FloatProgress(value=0.0, max=5619.0), HTML(value='')))




ValueError: [E088] Text of length 1015812 exceeds maximum of 1000000. The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [60]:
months = ['january','february','march','april','june','july','august','september','october','november','december']
freq_words = ['operating','product','products','business','value','fair','statements','assets','operations','including','tax','consolidated','company', 'financial', 'stock','cash','net','year', 'years','market','income','certain','related','costs']
stop_words = text.ENGLISH_STOP_WORDS.union(months + freq_words)
vect=CountVectorizer(ngram_range=(1,1),stop_words=stop_words)
fin=vect.fit_transform(newtext)
#merged texts is the most recent one from the above generate files
pd.DataFrame(fin.toarray(),columns=vect.get_feature_names()).head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaarated,aaaaamortgagebacked,aaaaaobligations,aaaaaprerefunded,aaaaarated,...,μgl,μl,μm,μscm,上a,公o,司i,技z术o,有l限a,海c斯z丹o赛u
0,0,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
lda=LatentDirichletAllocation(n_components=8)
da_lda=lda.fit_transform(fin)
da_lda

array([[1.01199838e-01, 6.70255610e-06, 8.97449354e-01, ...,
        6.70347905e-06, 6.70305811e-06, 1.31729366e-03],
       [8.73662711e-02, 5.54766558e-06, 8.87730861e-01, ...,
        5.54794995e-06, 5.54788537e-06, 2.48751289e-02],
       [9.26630629e-02, 3.48252718e-06, 7.97850576e-01, ...,
        3.48241997e-06, 3.48218225e-06, 1.09468949e-01],
       ...,
       [3.41499665e-06, 2.25864869e-04, 9.98685543e-01, ...,
        3.41308068e-06, 3.41291187e-06, 3.41312090e-06],
       [2.23621367e-03, 7.57057235e-04, 9.93620352e-01, ...,
        3.11389427e-06, 1.24793326e-03, 3.11428061e-06],
       [5.11462498e-05, 5.11402366e-05, 8.11735197e-01, ...,
        5.11501992e-05, 1.87957919e-01, 5.11515979e-05]])

In [62]:
sorting=np.argsort(lda.components_)[:,::-1]
features=np.array(vect.get_feature_names())
import mglearn
mglearn.tools.print_topics(topics=range(8), feature_names=features,sorting=sorting, topics_per_chunk=8, n_words=20)

topic 0       topic 1       topic 2       topic 3       topic 4       topic 5       topic 6       topic 7       
--------      --------      --------      --------      --------      --------      --------      --------      
properties    energy        loans         development   gas           sales         fiscal        services      
property      gas           loan          agreement     oil           plan          sales         revenue       
common        electric      securities    common        production    ended         plan          fiscal        
ended         power         bank          clinical      natural       agreement     incorporated  ended         
real          ppl           credit        research      reserves      rate          credit        customers     
debt          rate          capital       sales         health        credit        stores        sales         
shares        plan          losses        shares        future        notes         ended       

## Saved Cell(s) for Reference

In [16]:
# now the final doc
merged_text = []
for ind, cik in enumerate(tqdm(ciks)):
    f_path = '10ks/' + cik.strip("'") + '/merged_texts.p'
    f = open(f_path, "r")
    f = f.read()
    merged_text.append(f)
complete_doc_file = open("all_texts.txt","w") 
complete_doc_file.write(str(merged_texts))
complete_doc_file.close()

HBox(children=(FloatProgress(value=0.0, max=303.0), HTML(value='')))


