# NLP Analysis of Financial Returns of Public Firms

## Download Packages

In [0]:
import nltk
import pickle
import gensim
import pandas as pd
import requests
import string

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.stem.porter import *
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim import corpora
from sklearn.manifold import TSNE
from bokeh.io import output_notebook, output_file
from bokeh.plotting import show, figure
from bs4 import BeautifulSoup
from keras.preprocessing.text import one_hot

%matplotlib inline

Using TensorFlow backend.


In [0]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

## Retrieve Dataset

In [0]:
urls = []
urls.append("https://www.sec.gov/Archives/edgar/data/886982/000119312518056383/d480167d10k.htm")
urls.append("https://www.sec.gov/Archives/edgar/data/886982/000119312517056804/d308759d10k.htm")
urls.append("https://www.sec.gov/Archives/edgar/data/895421/000119312517059212/d328282d10k.htm")
urls.append("https://www.sec.gov/Archives/edgar/data/895421/000119312518060831/d500533d10k.htm")


tgtUrl = 'https://www.sec.gov/Archives/edgar/data/886982/000119312519050198/d669877d10k.htm'
pages = ''
for url in urls:
  pages += requests.get(url).text

In [0]:
# Read the HTML using BeautifulSoup. Fallback 'html.parser' in case lxml has challages \n",
soup = BeautifulSoup(pages, "lxml")  

In [0]:
# Find all 'div' and 'p' tags as these are the ones that contain data in our documents. Maintain the order of text.
tagTypes = ['div', 'p']
tags = soup.find_all(tagTypes)

In [0]:
origTxt = ''
for t in tags:
    origTxt += t.text
origTxt

## Data Cleanup

Now we have the raw text data. We need to clean it up to remove stop words, punctuations, and other common trivial patterns. 

This text has several "\xa0" characters which need to be replaced. Start data clean up with these characters. Refer to:

https://stackoverflow.com/questions/10993612/python-removing-xa0-from-string

In [0]:
intermediateTxt = origTxt.replace(u'\xa0', u' ')

Now, clean stop words.

In [0]:
cleanedTxt = ''
stopWords = set(stopwords.words('english') + list(string.punctuation))

In [0]:
intermediateTokens = nltk.word_tokenize(intermediateTxt)
len(intermediateTokens)

1374336

In [0]:
cleanTokens = []
for w in intermediateTokens:
    if w not in stopWords:
        cleanTokens.append(w.lower())
        cleanTokens.append(' ') # Need to append a single space for cases where words are losing space in between

len(cleanTokens)

1655098

In [0]:
cleanedTxt = ''
cleanedTxtLst = []
for token in cleanTokens:
  if token != ' ':
    cleanedTxtLst.append(token)
    cleanedTxt += (token)
cleanedTxt = cleanedTxt.replace('  ', ' ')  

It is important to first tokenize and then match individual word against stop words. If we simply search for a stop word in the entire string and remove it, we will lose some important information. Consider example of a word "I.R.S". If the logic is to remove stop words and punctuation from entire string in one go, then the dots within this word will get removed. On the other hand, if we tokenize first then the comparison will be with entire "I.R.S" word as a token and therefore the dots inside will not get removed. This is one simple example but I have seen better results when stop words removal is done after tokenization. If you need a single string, then simply concatenate all tokens in a list.

In [0]:
# Optional: remove numbers and any words containing numbers from our cleaned tokens:
# cleanedTxt = re.sub('\\w*\\d\\w*', '', cleanedTxt)

## Identifying Phrases in Corpus

A consistent challenge in NLP is to differentiate indivudla words from words appear in a phrase or as full form. Examples would be "New York" or "United State of America". Without any effort, these phrases or unabbreviated forms will show up as continuous but still different words. 

In order to identify these, we will have to start with original text, and divide it among sentences. Then use magic of gensim to highlight such phrases. Let us attempt it then.

## Clean and Arrange Original Text into a List of Sentences

In [0]:
# Start with original text scraped from web resources. Check that it wasn't inadvertently modified.
origTxt

In [0]:
intermediateTxt = origTxt.replace(u'\xa0', u' ')

In [0]:
from nltk.tokenize import PunktSentenceTokenizer
sents_tokenized = sent_tokenize(intermediateTxt)
sents_tokenized

['\nUNITED STATES SECURITIES AND EXCHANGE COMMISSION \nWashington, D.C. 20549   \n  Form 10-K  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF \nTHE SECURITIES EXCHANGE ACT OF 1934   \n   \n\n\n\n\n\n\n For the fiscal year ended December 31, 2017\n \nCommission File Number: 001-14965\n The Goldman Sachs Group, Inc. \n(Exact name of registrant as specified in its charter)   \n\n\n\n\n\n\nDelaware\n \n13-4019460\n\n (State or other jurisdiction of\nincorporation or organization)\n \n (I.R.S.',
 'Employer\nIdentification No.)',
 '200 West Street\n \n10282\n\n New York, N.Y.\n(Address of principal executive offices)\n \n(Zip Code)\n (212) 902-1000 \n(Registrant\x92s telephone number, including area code) \nSecurities registered pursuant to Section 12(b) of the Act:   \n\n\n\n\n\n\nTitle of each class:\n \nName of each exchange on which registered:\n\n Common stock, par value $.01 per share\n \nNew York Stock Exchange\n\n Depositary Shares, Each Representing 1/1,000th Interest in a Share o

So sentences are recognized. We still need to do some data cleaning here as well. For that, we will create tokens per sentence, clean punctuations at that point, and then create sentences again, and finally will push them into a list of sentences. 

In [0]:
sents_ClnTknzd = []
punctuations = list(string.punctuation) # only remove punctuations. Keep stop words for phrases and un-abbreviated forms

for sent in sents_tokenized:
  tempStr = ''
  tempTokens = nltk.word_tokenize(sent)
  for token in tempTokens:
    if token not in punctuations:
        tempStr += (token)
        tempStr += ' '
        #cleanTokens.append(' ') # Need to append a single space for cases where words are losing space in between
  
  sents_ClnTknzd.append(tempStr.strip())

sents_ClnTknzd

['UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington D.C. 20549 Form 10-K ANNUAL REPORT PURSUANT TO SECTION 13 OR 15 d OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 31 2017 Commission File Number 001-14965 The Goldman Sachs Group Inc. Exact name of registrant as specified in its charter Delaware 13-4019460 State or other jurisdiction of incorporation or organization I.R.S',
 'Employer Identification No',
 '200 West Street 10282 New York N.Y. Address of principal executive offices Zip Code 212 902-1000 Registrant\x92s telephone number including area code Securities registered pursuant to Section 12 b of the Act Title of each class Name of each exchange on which registered Common stock par value .01 per share New York Stock Exchange Depositary Shares Each Representing 1/1,000th Interest in a Share of Floating Rate Non-Cumulative Preferred Stock Series A New York Stock Exchange Depositary Shares Each Representing 1/1,000th Interest in a Share of 6.20 N

## Document Term Matrix 
Next, we will generate a document term matrix for a bag of words.

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit(sents_ClnTknzd)
data_cv
#data_dtm = pd.DataFrame(data_cv.toarray(), columns = cv.get_feature_names())
#data_dtm

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

### Identifying Phrases
Now that the sentences are nicely cleaned and segregated, let us look to identify a group of words that generally go together like "New York". That requires some tr
aining up-front. The following code is not functional right now. Enhance it. 

In [0]:
#texts = [[word for words in sent.split()] for sent in sents_ClnTknzd]
texts = []
for sent in sents_ClnTknzd:
  tempList = []
  words = sent.split()
  for word in words:
    tempList.append(word)
  texts.append(tempList)
  
texts  

[['UNITED',
  'STATES',
  'SECURITIES',
  'AND',
  'EXCHANGE',
  'COMMISSION',
  'Washington',
  'D.C.',
  '20549',
  'Form',
  '10-K',
  'ANNUAL',
  'REPORT',
  'PURSUANT',
  'TO',
  'SECTION',
  '13',
  'OR',
  '15',
  'd',
  'OF',
  'THE',
  'SECURITIES',
  'EXCHANGE',
  'ACT',
  'OF',
  '1934',
  'For',
  'the',
  'fiscal',
  'year',
  'ended',
  'December',
  '31',
  '2017',
  'Commission',
  'File',
  'Number',
  '001-14965',
  'The',
  'Goldman',
  'Sachs',
  'Group',
  'Inc.',
  'Exact',
  'name',
  'of',
  'registrant',
  'as',
  'specified',
  'in',
  'its',
  'charter',
  'Delaware',
  '13-4019460',
  'State',
  'or',
  'other',
  'jurisdiction',
  'of',
  'incorporation',
  'or',
  'organization',
  'I.R.S'],
 ['Employer', 'Identification', 'No'],
 ['200',
  'West',
  'Street',
  '10282',
  'New',
  'York',
  'N.Y.',
  'Address',
  'of',
  'principal',
  'executive',
  'offices',
  'Zip',
  'Code',
  '212',
  '902-1000',
  'Registrant\x92s',
  'telephone',
  'number',
  'in

In [0]:
#cleanTokens
dictGensim = corpora.Dictionary(texts)


In [0]:
print(dictGensim.token2id)

{'001-14965': 0, '10-K': 1, '13': 2, '13-4019460': 3, '15': 4, '1934': 5, '2017': 6, '20549': 7, '31': 8, 'ACT': 9, 'AND': 10, 'ANNUAL': 11, 'COMMISSION': 12, 'Commission': 13, 'D.C.': 14, 'December': 15, 'Delaware': 16, 'EXCHANGE': 17, 'Exact': 18, 'File': 19, 'For': 20, 'Form': 21, 'Goldman': 22, 'Group': 23, 'I.R.S': 24, 'Inc.': 25, 'Number': 26, 'OF': 27, 'OR': 28, 'PURSUANT': 29, 'REPORT': 30, 'SECTION': 31, 'SECURITIES': 32, 'STATES': 33, 'Sachs': 34, 'State': 35, 'THE': 36, 'TO': 37, 'The': 38, 'UNITED': 39, 'Washington': 40, 'as': 41, 'charter': 42, 'd': 43, 'ended': 44, 'fiscal': 45, 'in': 46, 'incorporation': 47, 'its': 48, 'jurisdiction': 49, 'name': 50, 'of': 51, 'or': 52, 'organization': 53, 'other': 54, 'registrant': 55, 'specified': 56, 'the': 57, 'year': 58, 'Employer': 59, 'Identification': 60, 'No': 61, '.01': 62, '1/1,000th': 63, '10282': 64, '12': 65, '200': 66, '212': 67, '405': 68, '5.50': 69, '6.20': 70, '6.30': 71, '6.375': 72, '902-1000': 73, '99.2': 74, 'A': 7

In [0]:

mycorpus = [dictGensim.doc2bow(doc, allow_update=True) for doc in tokenized_list]

NameError: ignored

In [0]:
phrases = Phrases(sents_ClnTknzd)
bigram = Phraser(phrases)
bigram.phrasegrams


In [0]:
bigram = Phrases(sents_ClnTknzd, min_count=1, threshold=2)
sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
print(bigram[sent])

['the', 'mayor', 'of', 'new', 'york', 'was', 'there']




## Word2Vec Model

In [0]:
model = Word2Vec(sentences=sents_ClnTknzd, size=64, sg=1, window=10, min_count=3, seed=42, workers=8)
model.save('LargeBanksNLP.w2v')

In [0]:
word_vectors = model.wv

## Sentiment Analysis

In [0]:
import textblob
from textblob import TextBlob

In [0]:
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

In [0]:
# Capture these values. 
for sentence in sents_ClnTknzd:
  TextBlob(sentence).sentiment

Rather than calculating sentiment of the entire corpus in one go, let us check the sentiment by sentence. 

## Pending / Possible Enhancements:

Priority Items


Low Priority Items
- P1: Train (retrieve?) the system for identiyfing key financial phrases: "Balance Sheet", " Cash Flow", 

- P1: Identify ways to capture NER without significant performance hit. 

- P1: Calculate Word2Vec and possible use-cases: words used in similar context. 

- P1: Capture verbiage for quarterly calls 
