In [5]:
import spacy

import re
import os
from time import gmtime, strftime
from datetime import datetime, timedelta
import unicodedata
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import requests
import bs4 as bs
from lxml import html
from tqdm import tqdm

# Spacy Test Run

In this notebook we'll get a feel for some basic spaCy functionality using Tesla's 2017 10-K. Based on the similarity score PDF [here](https://github.com/ruthlee/10K_analysis/blob/master/data/%20tsla.pdf), it looks like it'll be an interesting one. In particular, we want to focus on the "Risk Factors" section of the 10-K; according to [Lazy Prices](https://github.com/ruthlee/10K_analysis/blob/master/Research/initial_notes.md), that's the section to focus on. Luckily, we already have the raw text from our data scraping.

In [6]:
# Our raw-text folders are named by CIK, so we define this helper to map a ticker to its CIK.

def TickertoCIK(tickers, timeout=10):
    '''
    Looks up the SEC Central Index Key (CIK) for each ticker by scraping EDGAR's company browse page.

    tickers = iterable of ticker symbols (anything str() can render)
    timeout = seconds to wait for each EDGAR response; passed straight to requests.get

    Returns a dict mapping each lower-cased ticker to its 10-digit CIK string. Tickers whose
    response contains no CIK are simply left out of the dict.
    '''
    url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    cik_re = re.compile(r'.*CIK=(\d{10}).*')

    cik_dict = {}
    for ticker in tqdm(tickers): # Use tqdm lib for progress bar
        # Without a timeout, a stalled connection would block the notebook indefinitely.
        # NOTE(review): EDGAR may reject requests that lack a descriptive User-Agent header — confirm.
        response = requests.get(url.format(ticker), timeout=timeout)
        results = cik_re.findall(response.text)
        if results:
            cik_dict[str(ticker).lower()] = str(results[0])

    return cik_dict

In [8]:
# Resolve the 'tsla' ticker to its CIK; the bare name on the last line displays the resulting dict.
cik_dict = TickertoCIK(['tsla'])
cik_dict

100%|██████████| 1/1 [00:00<00:00,  8.53it/s]


{'tsla': '0001318605'}

In [39]:
cik = cik_dict['tsla']
date = '2017-03-01'
# Raw filings are stored as data/10K/<CIK>/rawtext/<CIK>_<date>.txt
raw_text_path = 'data/10K/{0}/rawtext/{0}_{1}.txt'.format(cik, date)
with open(raw_text_path, 'r') as myfile:
    K = myfile.read()

In [53]:
# K

In [34]:
# Number of characters in the raw filing text
print(len(K))

588050


In [57]:
# Peek at the first 100 characters of the filing
K[0:100]

' 10-K 1 tsla-10k_20161231.htm 10-K    tsla-10k_20161231.htm       UNITED STATES  SECURITIES AND EXCH'

The entire 10-K is probably too long for an effective NLP analysis because it contains a lot of extraneous information. Let's stick with the "Risk Factors" section, which is always "Item 1A".

In [139]:
def find_index(corpus, phrase):
    '''
    Returns the index of the first letter in a (unique) phrase within a corpus. Note that it will return the 
    index of the first instance of the phrase within a corpus, so make sure not to search something generic.
    
    corpus = string to search through
    phrase = string you're searching for 

    Raises ValueError if the phrase does not occur in the corpus.
    '''
    # Fail with a clear message instead of falling through to an unbound local variable.
    if phrase not in corpus:
        raise ValueError('phrase {!r} not found in corpus'.format(phrase))
    return corpus.index(phrase)
    
def isolate_risk(K, toc_marker='Overview', start_marker='RISK FACTORS', end_marker='ITEM 1B.'):
    '''
    Returns only the risk factor section of a 10K string. The default markers are the unique phrases that 
    mark the start and end of the risk factors section in the Tesla 10K; pass different markers for 
    filings that are laid out differently.
    
    K = raw text of 10K as a string. 
    toc_marker = first phrase after the table of contents (everything before it is ignored)
    start_marker = phrase that opens the risk factors section
    end_marker = phrase that opens the section right after the risk factors

    The returned text starts at start_marker (inclusive) and stops just before end_marker.
    Raises ValueError if any of the markers cannot be found.
    '''
    def locate(marker, search_from):
        # Search by offset so the (large) filing string is never copied.
        position = K.find(marker, search_from)
        if position == -1:
            raise ValueError('marker {!r} not found in 10K text'.format(marker))
        return position

    # skip the table of contents, which also mentions the section titles
    toc_end = locate(toc_marker, 0)

    # the section starts at the first start marker after the table of contents...
    start_index = locate(start_marker, toc_end)
    # ...and the end marker is only looked for after the section start, so an
    # earlier mention of it cannot produce an empty result
    end_index = locate(end_marker, start_index)

    return K[start_index:end_index]

# Pull the risk-factor section out of the full filing text and display it as the cell output
risk_factors = isolate_risk(K)
risk_factors

1022




In [146]:
# Run the spaCy pipeline over the risk-factor text.
# NOTE(review): the 'en' shortcut name only resolves on spaCy versions that support shortcut links — confirm against the installed version.
nlp = spacy.load('en')
corpus = nlp(risk_factors)

# Iterating the processed document yields one token at a time
tokens = list(corpus)

# Spacy can split by token/word and sentence 
sentences = list(corpus.sents)

In [147]:
# Show one sentence as a sample, then the total number of sentences found
print(sentences[100]) # example
print(len(sentences))

In order to meet these expectations, we may in the future be required to introduce on a regular basis new vehicle models as well as enhanced versions of existing vehicle models.
437


In [None]:
# we need to clean up the text by removing stopwords. 

