In [3]:
import re
from bs4 import BeautifulSoup as bs
from functools import reduce
from scrap_filings_items import find_filings_paths
import pandas as pd

In [4]:
file_path = "./data/10-K/19/0000001750/0001047469-19-004266/filing-details.html"

In [5]:
with open(file_path, "r") as f:
    soup = bs(f, "html.parser")

In [6]:
# def find_part_start_indice(part_index, doc):
#     if part_index == 1:
#         return find_pattern_indices("PART I[\s\n]", doc)[-1]
#     elif part_index == 2:
#         return find_pattern_indices("PART II[\s\n]", doc)[-1]
#     elif part_index == 3:
#         return find_pattern_indices("PART III[\s\n]", doc)[-1]
#     elif part_index == 4:
#         return find_pattern_indices("PART IV[\s\n]", doc)[-1]
#     else:
#         raise Exception(f"Part index should be in range 1 to 4, got {part_index}")

# def find_pattern_indices(pattern, doc):
#     results = re.finditer(pattern, text)
#     return [m.start(0) for m in results]

In [24]:
class TenKScraper:
    """
    Scraper to extract items from 10-K, here we use section name to represent item name
    """
    # mapping from parts index to section name
    __PARTS_MAPPING_SECTIONS__ = {
        1: ["1", "1A", "1B", "2", "3", "4"],
        2: ["5", "6", "7", "7A", "8", "9", "9A", "9B"],
        3: ["10", "11", "12", "13", "14"],
        4: ["15", "16"],
    }
    
    # mapping from section name to parts index
    __SECTION_MAPPING_PARTS__ = {s: k for (k, v) in __PARTS_MAPPING_SECTIONS__.items() for s in v}
    
    # all items in 10k
    __ALL_ITEMS__ = reduce(lambda x, y: x + y, __PARTS_MAPPING_SECTIONS__.values())
    
    # patterns for each section
    __SECTION_PATTENS__ = {
        "1": [
            """[^"“']item[\s]*1[\.\s]*Business""", 
            """[^"“']item[\s]*1[\.\s]*DESCRIPTION[\s]+OF[\s]+BUSINESS"""
             ],
        "1A": ["""[^"“']item[\s]*1A[\.\s]*Risk[\s-]+Factors"""],
        "1B": ["""[^"“']item[\s]*1B[\.\s]*Unresolved[\s]+Staff[\s]+Comments"""],
        "2":["""[^"“']item[\s]*2[\.\s]*Properties"""],
        "7": [
            """[^"“']ITEM[\s]*7[\.\s]*MANAGEMENT[’']S[\s]+DISCUSSION[\s]+AND"""
        ],
        "7A": [
            """[^"“']ITEM[\s]*7A[\.\s]*Quantitative[\s]+and[\s]+Qualitative[\s]+Disclosure[s\s]+about[\s]+Market[\s]+Risk""",
            """[^"“']ITEM[\s]*7A[\.\s]*Quantitative[\s]+and[\s]+Qualitative[\s]+Disclosures[\s]+about[\s]+Market[\s]+and[\s]+business[\s]+Risks"""
        ],
        "8": ["""[^"“']ITEM[\s]*8[\.\s]*Financial[\s]+Statements"""]
    }
    
    def __init__(self):
        self.doc = None
    
    # Helper function to check if operation can be performed
    def _safty_check(self):
        if self.doc is None:
            raise Exception("No document found, please attach it to scraper")
    
    # Locate positions of all found matches for given pattern
    @staticmethod
    def _find_pattern_indices(pattern, txt, ignore_case=False):
#         print(f"Searching pattern: {pattern}")
        if not ignore_case:
            results = re.finditer(pattern, txt)
        else:
            results = re.finditer(pattern, txt, re.IGNORECASE)
        return [m.start(0) for m in results]
    

    def find_next_section(self, current_section):
        if not current_section in self.__ALL_ITEMS__:
            raise Exception(f"Unknown item name, available items are {cls.__ALL_ITEMS__}")
        
        inc = 1
        
        while self.__ALL_ITEMS__.index(current_section) + inc < len(self.__ALL_ITEMS__):
            next_section = self.__ALL_ITEMS__[self.__ALL_ITEMS__.index(current_section) + inc]
            if re.search(f"item[\.\s]*{next_section}", self.doc, re.IGNORECASE):
                break
            else:
                inc += 1

        return self.__ALL_ITEMS__[self.__ALL_ITEMS__.index(current_section) + inc]
    
    
    def _find_section_start_indice(self, section, part_text):
        section_pattern = self.__SECTION_PATTENS__[section]
        
        section_start_indice = None
        
        for p in section_pattern:
            section_start_indice = self._find_pattern_indices(p, part_text, ignore_case=True)
            if len(section_start_indice) > 0:
                break
        
#         print(section_start_indice)
        if section_start_indice:
            return section_start_indice[-1]
        else:
            print(f"Can not found {section}.")
            return -1

    def preprocessing_doc(self, doc):
        self.doc = doc
        #.replace("\xa0", " ").replace("&nbsp;", " ").replace("\ufeff", " ")
    
    def scrape(self, doc, section):
        # replace some tedious characters
        self.preprocessing_doc(doc)
        
        # find next section using the hard coded mapping, so that we find the start index of current section and
        # next section, the desired content would locate in this range
        next_section = self.find_next_section(section)
        
        # find the indices of each section
        # 1. current section
        current_section_start_indice = self._find_section_start_indice(section, self.doc)
        
        # 2. next section
        next_section_start_indice = self._find_section_start_indice(next_section, self.doc)
        
        if current_section_start_indice == -1 or next_section_start_indice == -1:
            return None
        
        # extract out desired text
        start = current_section_start_indice 
        end = next_section_start_indice
        
        return self.doc[start:end]
        

In [25]:
paths = find_filings_paths(year="21")

In [26]:
scraper = TenKScraper()
target_items = ["1A", "7"]

results = {
    "path": [],
}

for item in target_items:
    results[item] = []
    
for idx, p in enumerate(paths[:10]):
    print(f"{p} -- {idx + 1} / {len(paths[:10])}")
    try:
        with open(p, "r") as f:
            text = bs(f, "html.parser").text
            for item in target_items:
                try:
                    res = scraper.scrape(text, item)
                    results[item].append(res)
                except Exception as e:
                    results[item].append(None)
            results["path"].append(p)
    except Exception as e:
        print(e)

df = pd.DataFrame(results)

df.to_csv("10K-1A_7(2021).csv")

./data/10-K/21/0001415684/0001493152-21-007088/filing-details.html -- 1 / 10
./data/10-K/21/0000882095/0000882095-21-000008/filing-details.html -- 2 / 10
./data/10-K/21/0000891482/0000891482-21-000020/filing-details.html -- 3 / 10
./data/10-K/21/0001526520/0001564590-21-006939/filing-details.html -- 4 / 10
./data/10-K/21/0000749647/0001493152-21-006382/filing-details.html -- 5 / 10
./data/10-K/21/0001803977/0001065949-21-000040/filing-details.html -- 6 / 10
./data/10-K/21/0001370450/0001558370-21-003104/filing-details.html -- 7 / 10
./data/10-K/21/0000910329/0001558370-21-001985/filing-details.html -- 8 / 10
./data/10-K/21/0001514416/0001514416-21-000092/filing-details.html -- 9 / 10
./data/10-K/21/0001607678/0001564590-21-006406/filing-details.html -- 10 / 10


In [14]:
df = pd.DataFrame(results)

df.to_csv("10K-1A_7(2020).csv")

In [29]:
scraper = TenKScraper()

with open("./data/10-K/21/0001514416/0001514416-21-000092/filing-details.html", "r") as f:
    text = bs(f, "html.parser").get_text()
    

In [30]:
scraper.scrape(text, "7")#.split()

"sItem 7. Management’s Discussion and Analysis of Financial Condition and Results of OperationsThe following discussion and analysis of our financial condition and results of operations should be read in conjunction with our consolidated financial statements and related notes that are included elsewhere in this Annual Report on Form 10-K. This discussion contains forward-looking statements based upon current plans, expectations and beliefs that involve risks and uncertainties. Our actual results may differ materially from those anticipated in these forward-looking statements as a result of various factors, including those set forth under “Risk Factors” in this Annual Report on Form 10-K. Our fiscal year ends on December 31.OverviewWe are a leading global enterprise cloud communications company. Our solutions include a broad range of software Application Programming Interfaces (“APIs”) for voice, messaging and emergency services. Our sophisticated and easy-to-use software APIs allow ent

In [586]:
re.search("""[^"“']ITEM[\s]*7A[\.\s]*Quantitative[\s]+and[\s]+Qualitative[\s]+Disclosure[s\s]+about[\s]+Market[\s]+Risk""", text, re.IGNORECASE).start()

3554

## Results

In [619]:
result_19_10k = pd.read_csv("10K-1A_7(2019).csv")

In [620]:
result_19_10k

Unnamed: 0.1,Unnamed: 0,path,1A,7
0,0,./data/10-K/19/0000882095/0000882095-19-000006...,3Item 1A. RISK FACTORSIn evaluating our busine...,8ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS...
1,1,./data/10-K/19/0000891482/0000891482-19-000006...,0Item 1A. Risk Factors. An investment in our ...,Item 7. Management’s Discussion and Analysis ...
2,2,./data/10-K/19/0001526520/0001564590-19-003889...,Item 1A. Risk Factors You should consider c...,Item 7.Management’s Discussion and Analysis o...
3,3,./data/10-K/19/0001390844/0001390844-19-000004...,sITEM 1A. RISK FACTORS Any of the following...,sITEM 7.MANAGEMENT'S DISCUSSION AND ANALYSIS O...
4,4,./data/10-K/19/0000356037/0000356037-19-000049...,Item 1A. Risk Factors If any of the ris...,Item 7. Management’s Discussion and Analy...
...,...,...,...,...
3176,3176,./data/10-K/19/0001602143/0001493152-19-006430...,Item 1A. Risk Factors Not applicable to sma...,Item 7. Management’s Discussion and Analysis ...
3177,3177,./data/10-K/19/0000047111/0000047111-19-000010...,5Item 1A.RISK FACTORSCautionary Note Regarding...,6Item 7.MANAGEMENT’S DISCUSSION AND ANALYSIS O...
3178,3178,./data/10-K/19/0001689066/0001477932-19-001148...,ITEM 1A. RISK FACTORS We are a smaller repo...,ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...
3179,3179,./data/10-K/19/0000746210/0000746210-19-000009...,,Item 7. Management’s Discussion and Analysis ...


In [636]:
len(result_19_10k[result_19_10k["1A"].isna() | result_19_10k["7"].isna()])/len(result_19_10k)

0.13171958503615216

In [637]:
result_20_10k = pd.read_csv("10K-1A_7(2020).csv")
len(result_20_10k[result_20_10k["1A"].isna() | result_20_10k["7"].isna()])/len(result_20_10k)

0.13699936427209156

In [642]:
result_21_10k = pd.read_csv("10K-1A_7(2021).csv")
len(result_21_10k[result_21_10k["1A"].isna() | result_21_10k["7"].isna()])/len(result_21_10k)

0.1495124593716143

In [639]:
# the results of scraping which use html tags to find items
result_htmltag = pd.read_csv("19-10k-item1A-item7.csv")
len(result_htmltag[result_htmltag["Item 1A to Item 1B"].isna() | result_htmltag["Item 7 to Item 7A"].isna()])/len(result_htmltag)

0.39