# 10-K Data Handing
## Located the following list and tokenize words from 10-K Data
- Business（Item 1）
- Risk Factors（Item 1A）
- Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities（Item 5）
- Management’s Discussion and Analysis of Financial Condition and Results of Operations（Item 7）

In [1]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import codecs
import pickle
import re
from bs4 import BeautifulSoup

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

stopwords = set(stopwords.words('english'))
symbols   = ['.', ',', '’','•','“','”','"', "''" ,'|',"'",'*','``','...', '**','$','%','&','#','-','--',"''",'""','?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"'s","'m","n't",'—']
meaningless = ['u','could','may','might','result','and/or']

stopwords.update(symbols)
stopwords.update(meaningless)


def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
        
def getTag(ItemNo):     
    
    # Find Item location
    Item_TAG = soup.find_all('div', string=re.compile(r"^I[tT][eE][mM]."+str(ItemNo)+"[\W]"))
    if len(Item_TAG) == 0:
        print('Item is not in <div>')
        Item_TAG = soup.find_all('p', string=re.compile(r"^I[tT][eE][mM]."+str(ItemNo)+"[\W]"))
        if len(Item_TAG) == 0:
            print('Item is not in <p>')
            Item_TAG = soup.find_all('font', string=re.compile(r"^I[tT][eE][mM]."+str(ItemNo)+"[\W]"))
            if len(Item_TAG) == 0:
                print('Item is not in <font>')
                print('Get Error !')
            
    # Return the last found tag 
    return Item_TAG[-1]

def getContentByBS4(soup,constraint=5000):
    Data,Item1,Item5,Item7 = [],[],[],[]
    lemmatizer = WordNetLemmatizer()
    content = None
    tagType='div'

    # Item 1           
    for tag in getTag(1):
        i = 0
        for element in tag.next_elements:
            if element.name == tagType and element.string != None and element.string != content:
                content = element.string
                Item1.append(content)
                Data.append([lemmatizer.lemmatize(word, pos="n")  for word in word_tokenize(content.lower()) 
                             if lemmatizer.lemmatize(word, pos="n") not in stopwords and not bool(re.search(r"\d", str(lemmatizer.lemmatize(word, pos="n")))) ])
                i += 1
            if bool(re.search(r"^I[tT][eE][mM].2[\W]", str(element.string))):
                print('Item 1 finished with',len(Item1),'elements')
                break
            if i > constraint:
                break    

    # Item 5           
    for tag in getTag(5):
        i = 0
        for element in tag.next_elements:
            if element.name == tagType and element.string != None and element.string != content:
                content = element.string
                Item5.append(content)
                Data.append([lemmatizer.lemmatize(word, pos="n")  for word in word_tokenize(content.lower()) 
                             if lemmatizer.lemmatize(word, pos="n") not in stopwords and not bool(re.search(r"\d", str(lemmatizer.lemmatize(word, pos="n")))) ])
                i += 1 
            if bool(re.search(r"^I[tT][eE][mM].6[\W]", str(element.string))):
                print('Item 5 finished with',len(Item5),'elements')
                break
            if i > constraint:
                break

    # Item 7
    for tag in getTag(7):
        i = 0
        for element in tag.next_elements:
            if element.name == tagType and element.string != None and element.string != content:
                content = element.string
                Item7.append(content)
                Data.append([lemmatizer.lemmatize(word, pos="n")  for word in word_tokenize(content.lower()) 
                             if lemmatizer.lemmatize(word, pos="n") not in stopwords and not bool(re.search(r"\d", str(lemmatizer.lemmatize(word, pos="n")))) ])
                i += 1 
            if bool(re.search(r"^I[tT][eE][mM].8[\W]", str(element.string))):
                print('Item 7 finished with',len(Item7),'elements')
                break
            if i > constraint:
                break
                
    Data = sum(Data, [])            
    print('All Data with',len(Data),'words')
    return Data,Item1,Item5,Item7

def getContentByText(soup):
    Data,Item1,Item5,Item7 = [],[],[],[]
    lemmatizer = WordNetLemmatizer()    
    text = soup.get_text()

    # Item 1
    Item1_i = [(m.start(0), m.end(0)) for m in re.finditer(r"I[tT][eE][mM].1[\W]", text)][-1]
    Item2_i = [(m.start(0), m.end(0)) for m in re.finditer(r"I[tT][eE][mM].2[\W]", text)][-1]
    Item1   = text[Item1_i[1]:Item2_i[0]]
    
    Data.append([lemmatizer.lemmatize(word, pos="n")  for word in word_tokenize(Item1.lower()) 
                             if lemmatizer.lemmatize(word, pos="n") not in stopwords and not bool(re.search(r"\d", str(lemmatizer.lemmatize(word, pos="n")))) ])
    

    # Item 5    
    Item5_i = [(m.start(0), m.end(0)) for m in re.finditer(r"Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities", text, re.IGNORECASE)][-1]
    Item6_i = [(m.start(0), m.end(0)) for m in re.finditer(r"Selected Financial Data", text, re.IGNORECASE)][-1]
    Item5   = text[Item5_i[1]:Item6_i[0]]
    
    Data.append([lemmatizer.lemmatize(word, pos="n")  for word in word_tokenize(Item5.lower()) 
                             if lemmatizer.lemmatize(word, pos="n") not in stopwords and not bool(re.search(r"\d", str(lemmatizer.lemmatize(word, pos="n")))) ])

    # Item 7
    Item7_i = [(m.start(0), m.end(0)) for m in re.finditer(r"Management’s Discussion and Analysis of Financial Condition and Results of Operations", text, re.IGNORECASE)][-1]
    Item8_i = [(m.start(0), m.end(0)) for m in re.finditer(r"Financial Statements and Supplementary Data", text, re.IGNORECASE)][-1]
    Item7   = text[Item7_i[1]:Item8_i[0]]
    
    Data.append([lemmatizer.lemmatize(word, pos="n")  for word in word_tokenize(Item7.lower()) 
                             if lemmatizer.lemmatize(word, pos="n") not in stopwords and not bool(re.search(r"\d", str(lemmatizer.lemmatize(word, pos="n")))) ])
                  
    Data = sum(Data, [])
    print('Item 1 with',len(Item1),'words')
    print('Item 5 with',len(Item5),'words')
    print('Item 7 with',len(Item7),'words')
    print('All Data with',len(Data),'words')
    return Data,Item1,Item5,Item7    
    

# Read Available Company 10-K Data

In [2]:
Company = []
#Company = ['GOOGL','AMZN','JPM']
Error   = []

with open('./Nomura_Report/Available.pkl', 'rb') as f:
    Company = pickle.load(f)

for company in Company:
    path    = './10-K Data/sec_edgar_filings/'+company
    dic     = {}

    for dirPath, dirNames, fileNames in os.walk(path):
        for f in fileNames:
            if bool(re.search(r"txt$",f)):
                year = re.findall(r"-\d\d-",f)[0]
                year = int('20'+ year[1]+year[2])-1
                dic[year] = os.path.join(dirPath, f)

    for year in dic:
        print('===========',year,''+company+' ===========')
        try:
            file = codecs.open(dic[year], "r", "utf-8")
            soup = BeautifulSoup(file.read(), "lxml")
            Data,Item1,Item5,Item7 = getContentByBS4(soup)
            if len(Data) != 0:
                with open(path+'/'+str(year)+'.pkl', 'wb') as f:
                    pickle.dump(Data, f)
            else:
                print('No Data at',year)
                Error.append([year,company])
        except:
            print('Error at',year)
            Error.append([year,company])


Item 1 finished with 426 elements
Item 5 finished with 179 elements
Item 7 finished with 1162 elements
All Data with 8542 words
Item 1 finished with 469 elements
Item 5 finished with 182 elements
Item 7 finished with 1029 elements
All Data with 9393 words
Item 1 finished with 396 elements
Item 5 finished with 131 elements
Item 7 finished with 1065 elements
All Data with 9005 words
Item 1 finished with 441 elements
Item 5 finished with 171 elements
Item 7 finished with 829 elements
All Data with 9607 words
Item 1 finished with 483 elements
Item 5 finished with 113 elements
Item 7 finished with 809 elements
All Data with 10784 words
Item 1 finished with 423 elements
Item 5 finished with 123 elements
Item 7 finished with 1391 elements
All Data with 12631 words
Item 1 finished with 415 elements
Item 5 finished with 137 elements
Item 7 finished with 1354 elements
All Data with 10685 words
Item 1 finished with 341 elements
Item 5 finished with 144 elements
Item 7 finished with 1320 elements


Item 7 finished with 1633 elements
All Data with 15913 words
Item 1 finished with 796 elements
Item 5 finished with 101 elements
Item 7 finished with 1305 elements
All Data with 13806 words
Item 1 finished with 779 elements
Item 5 finished with 176 elements
Item 7 finished with 1316 elements
All Data with 14219 words
Item 1 finished with 926 elements
Item 5 finished with 156 elements
Item 7 finished with 1333 elements
All Data with 14553 words
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2019
Item 1 finished with 465 elements
Item 5 finished with 133 elements
Item 7 finished with 5 elements
All Data with 11898 words
Item 1 finished with 442 elements
Item 5 finished with 140 elements
Item 7 finished with 5 elements
All Data with 10093 words
Item 1 finished with 398 elements
Item 5 finished with 115 elements
Item 7 finished with 5 elements
All

Item is not in <div>
Item is not in <p>
Item 1 finished with 0 elements
Item is not in <div>
Item is not in <p>
Item 5 finished with 0 elements
Item is not in <div>
Item is not in <p>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2009
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2005
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2006
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2015
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 eleme

Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item is not in <p>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2017
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item is not in <p>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2018
Item is not in <div>
Item is not in <div>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 14712 words
Item 1 finished with 1367 elements
Item 5 finished with 111 elements
Item 7 finished with 2636 elements
All Data with 13501 words
Item 1 finished with 1395 elements
Item 5 finished with 181 elements
Item 7 finished with 2675 elements
All Data with 13646 words
Item 1 finished with 14 elements
Item 5 finished with 194 elements
Item 7 finished with 2898 elements
All Data with 4733 words
Item 1 finished with 653 ele

Item 1 finished with 567 elements
Item 5 finished with 107 elements
Item 7 finished with 1059 elements
All Data with 6068 words
Item 1 finished with 579 elements
Item 5 finished with 106 elements
Item 7 finished with 945 elements
All Data with 6170 words
Item 1 finished with 571 elements
Item 5 finished with 45 elements
Item 7 finished with 950 elements
All Data with 6172 words
Item 1 finished with 579 elements
Item 5 finished with 43 elements
Item 7 finished with 784 elements
All Data with 6001 words
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2018
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item 5 finished with 0 elements
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2019
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item 5 finished with 0 el

Item 1 finished with 207 elements
Item 7 finished with 1157 elements
All Data with 19148 words
Item 1 finished with 211 elements
Item 7 finished with 1164 elements
All Data with 15536 words
Item 1 finished with 215 elements
Item 7 finished with 1329 elements
All Data with 19905 words
Item 1 finished with 200 elements
Item 5 finished with 63 elements
Item 7 finished with 1179 elements
All Data with 6608 words
Item is not in <div>
Item is not in <p>
Item 1 finished with 0 elements
Item is not in <div>
Item is not in <p>
Item 5 finished with 0 elements
Item is not in <div>
Item is not in <p>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2004
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item is not in <div>
Item 7 finished with 0 elements
All Data with 0 words
No Data at 2005
Item is not in <div>
Item 1 finished with 0 elements
Item is not in <div>
Item is not in <div>
Item is not in <p>
Item 7 finished with 0 elements
All Data with 0 words
N

In [8]:
for tagget_year, company in Error:
    path    = './10-K Data/sec_edgar_filings/'+company
    dic     = {}

    for dirPath, dirNames, fileNames in os.walk(path):
        for f in fileNames:
            if bool(re.search(r"txt$",f)):
                year = re.findall(r"-\d\d-",f)[0]
                year = int('20'+ year[1]+year[2])-1
                dic[year] = os.path.join(dirPath, f)

    print('===========',tagget_year,''+company+' ===========')
    try:
        file = codecs.open(dic[tagget_year], "r", "utf-8")
        soup = BeautifulSoup(file.read(), "lxml")
        Data,Item1,Item5,Item7 = getContentByText(soup)
        if len(Data) != 0:
            with open(path+'/'+str(year)+'.pkl', 'wb') as f:
                pickle.dump(Data, f)
        else:
            print('No Data at',tagget_year)
    except:
        print('Error at',tagget_year)

Error at 2015
Error at 2016
Error at 2017
Error at 2018
Error at 2019
Item 1 with 0 words
Item 5 with 3700 words
Item 7 with 47854 words
All Data with 4230 words
Item 1 with 0 words
Item 5 with 90040 words
Item 7 with 192640 words
All Data with 22234 words
Item 1 with 0 words
Item 5 with 86543 words
Item 7 with 195770 words
All Data with 23051 words
Item 1 with 0 words
Item 5 with 69876 words
Item 7 with 169800 words
All Data with 18680 words
Item 1 with 0 words
Item 5 with 2206 words
Item 7 with 139620 words
All Data with 11035 words
Error at 2019
Item 1 with 0 words
Item 5 with 257342 words
Item 7 with 76311 words
All Data with 25881 words
Item 1 with 0 words
Item 5 with 944151 words
Item 7 with 72459 words
All Data with 82705 words
Item 1 with 0 words
Item 5 with 9050 words
Item 7 with 50499 words
All Data with 4531 words
Item 1 with 0 words
Item 5 with 8512 words
Item 7 with 50900 words
All Data with 4547 words
Item 1 with 0 words
Item 5 with 7789 words
Item 7 with 45170 words
All 