## Research Project 2
---
```text
- Source: SEC
- Goal: Extract information from HTML tables
- Techniques: HTML parsing, NER, DataFrames
- Tools: BeautifulSoup, pandas, spaCy
- Lines of code: ~100
```

In [2]:
# Standard library
import datetime

# Third-party
import re
import spacy
import requests
import pandas as pd
from lxml import html, etree
from bs4 import BeautifulSoup, Tag

NLP = spacy.load('en_core_web_sm')

# Two-character marker (backslash + "n") that stands in for line breaks already
# present in the serialized markup, keeping them distinguishable from the real
# newlines substituted for <br> tags.
_SOURCE_NEWLINE = '\\n'


def _cell_span(cell, attr):
    """Return ``cell``'s ``rowspan``/``colspan`` as an integer of at least 1.

    A missing, empty, non-numeric or non-positive attribute counts as 1 so a
    malformed cell still occupies its own grid slot.
    """
    try:
        return max(int(cell.get(attr) or 1), 1)
    except (TypeError, ValueError):
        return 1


def parse(table):
    """Convert an lxml table element into a list of rows of cell text.

    The element is serialized, re-parsed with BeautifulSoup and walked one
    ``<tr>`` at a time. A cell carrying ``rowspan``/``colspan`` has its text
    copied into every grid slot it covers. Rows without any text are dropped
    and the first remaining row is treated as the header.

    Text clean-up:
      * non-breaking spaces become plain spaces and every cell is stripped;
      * header cells have whitespace runs collapsed and source line breaks
        replaced by a space;
      * cells of the other rows have source line breaks replaced by ``', '``,
        while ``<br>`` tags remain as newline characters.

    Args:
        table: lxml element to serialize with ``etree.tostring``.

    Returns:
        A list of rows, each a list whose items are a non-empty string or
        ``None`` for an empty cell. Rows may differ in length. An empty list
        is returned when no row contains text.
    """
    # Serialize straight to text. Going through ``str()`` of the bytes result
    # yields a bytes *repr*, which escapes quotes and backslashes in the data.
    markup = etree.tostring(table, encoding='unicode')
    markup = markup.replace('\n', _SOURCE_NEWLINE)
    markup = re.sub(r'<br/?>', '\n', markup)
    soup = BeautifulSoup(markup, 'html.parser')

    grid = []
    # Every <tr> advances the grid by exactly one row, whatever its rowspans.
    for row_ind, row in enumerate(soup.find_all('tr')):
        col_ind = 0
        for cell in row.children:
            if cell.name not in ('td', 'th'):
                continue
            row_span = _cell_span(cell, 'rowspan')
            col_span = _cell_span(cell, 'colspan')
            # Skip slots already claimed by a rowspan from an earlier row.
            while (row_ind < len(grid)
                   and col_ind < len(grid[row_ind])
                   and grid[row_ind][col_ind] is not None):
                col_ind += 1
            text = str(cell.get_text())
            for ii in range(row_ind, row_ind + row_span):
                while ii >= len(grid):
                    grid.append([])
                for jj in range(col_ind, col_ind + col_span):
                    while jj >= len(grid[ii]):
                        grid[ii].append(None)
                    if grid[ii][jj] is None:
                        grid[ii][jj] = text
            col_ind += col_span

    # Slots that were padded but never filled are still None here; treat them
    # as empty text so the string clean-up below cannot fail on them.
    grid = [[(text or '').replace('\xa0', ' ').strip() for text in row]
            for row in grid]
    grid = [row for row in grid if any(row)]
    if not grid:
        return []

    grid[0] = [' '.join(text.split()).replace(_SOURCE_NEWLINE, ' ').strip()
               for text in grid[0]]
    grid[1:] = [[text.replace(_SOURCE_NEWLINE, ', ').strip() for text in row]
                for row in grid[1:]]
    return [[text if text else None for text in row] for row in grid]

def get_tables_from_url(url, timeout=30, headers=None):
    """Download ``url`` and return every ``<table>`` element of the page.

    Args:
        url: Address of the HTML document to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.
        headers: Optional mapping of extra HTTP request headers.
            NOTE(review): sec.gov is reported to reject clients that do not
            send a descriptive ``User-Agent`` -- confirm and pass one here.

    Returns:
        The list of lxml elements matched by the XPath ``//table``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    res = requests.get(url, timeout=timeout, headers=headers)
    # Fail loudly instead of parsing an error page as if it were the filing.
    res.raise_for_status()
    tree = html.fromstring(res.content)
    return tree.xpath('//table')

def parse_comp_tables(tables):
    """Build DataFrames from the tables whose first header cell starts with "name".

    Each table is run through ``parse``. A table is kept when the first cell
    of its first row begins with "name" (case-insensitive) and it has at least
    one body row. For the tables that are kept:

      * rows are padded with ``None`` to a common width so that ragged rows do
        not make the DataFrame constructor fail;
      * a first body row that merely repeats the header's first cell (ignoring
        commas) is dropped;
      * columns that are entirely empty are removed;
      * the first remaining column is renamed to ``'Name'``.

    Args:
        tables: Iterable of table elements accepted by ``parse``.

    Returns:
        A list of ``pandas.DataFrame`` objects, one per table that was kept.
    """
    dfs = []
    for table in tables:
        parsed = parse(table)
        # A usable table needs a header row plus at least one body row.
        if not parsed or len(parsed) < 2:
            continue
        first_header = parsed[0][0] if parsed[0] else None
        if not first_header or \
                not re.match(r'name', first_header, flags=re.IGNORECASE):
            continue

        width = max(len(row) for row in parsed)
        padded = [list(row) + [None] * (width - len(row)) for row in parsed]
        df = pd.DataFrame(padded[1:], columns=padded[0])

        # Positional access: ``.ix`` no longer exists in current pandas, and
        # the first cell may be None rather than a string.
        first_cell = df.iloc[0, 0]
        if isinstance(first_cell, str) and \
                first_cell.replace(',', '') == df.columns[0]:
            df = df.iloc[1:]
        if df.empty:
            continue

        df = df.dropna(axis=1, how='all')
        df.columns = ['Name'] + df.columns.values.tolist()[1:]
        dfs.append(df)
    return dfs

def clean_table_names(dfs):
    """Reduce each frame's ``Name`` column to a person name and fill the gaps.

    Every distinct non-missing name has its digits removed and is passed to
    the module-level spaCy pipeline ``NLP``. The text of the first ``PERSON``
    entity replaces the name; when no such entity is found the original value
    is kept. Rows whose name is missing then inherit the name of the row
    above them. A missing name in the first row stays missing.

    Args:
        dfs: List of DataFrames that each have a ``'Name'`` column. The frames
            are modified in place.

    Returns:
        The same list object that was passed in.
    """
    mapping = {}
    for df in dfs:
        for name in df['Name'].values:
            # Skip blanks, and names already resolved, to avoid re-running NER.
            if pd.isna(name) or name in mapping:
                continue
            digitless = re.sub(r'\d+', '', name)
            people = [ent.text for ent in NLP(digitless).ents
                      if ent.label_ == 'PERSON']
            mapping[name] = people[0] if people else name

    for df in dfs:
        filled = []
        previous = None
        for name in df['Name'].values:
            current = None if pd.isna(name) else mapping[name]
            if current is None:
                # Carry the preceding row's name down into the blank row.
                current = previous
            filled.append(current)
            previous = current
        # Assign the whole column at once; writing through ``df.iloc[i][col]``
        # targets a temporary row object and is not reliably stored.
        df['Name'] = filled

    return dfs

# Filing documents to process, given as paths relative to the EDGAR archive
# root that is prepended in the loop below.
urls = [
    '789019/000119312517310951/d461626ddef14a.htm',
    #'320193/000119312516422528/d79474ddef14a.htm'
]

# For each filing: download its tables, keep the ones whose header starts with
# "name", then normalize their Name column.
# NOTE: `tables` and `dfs` are rebound on every iteration, so after the loop
# they hold the results for the last URL only.
for url in urls:
    full_url = 'https://www.sec.gov/Archives/edgar/data/%s' % url 
    tables = get_tables_from_url(full_url)
    dfs = parse_comp_tables(tables)
    dfs = clean_table_names(dfs)