In [2]:
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import re
import pandas as pd
import polars as pl
import time
import os
from db_conn.connection import *
import numpy as np

In [None]:
%%time
# Parse every saved subchapter page and collect one single-row DataFrame per
# <p> element. The row's third column is named after the paragraph's first CSS
# class and holds the paragraph Tag itself, so the concatenated frame ends up
# with one column per paragraph class.
rows = []
df = pd.DataFrame()

for file_name in os.listdir('./html_source/subchapters'):
    with open(f'./html_source/subchapters/{file_name}', 'rb') as f:
        soup = BeautifulSoup(f, 'lxml')

    # A page without a <title> yields None. Checking explicitly (instead of a
    # bare `except:`) keeps real errors and Ctrl-C from being swallowed.
    title_tag = soup.find('title')
    title = title_tag.text if title_tag is not None else None

    for p in soup.find_all('p'):
        # find_all only returns Tag objects, so only the class list needs checking.
        if p.get('class'):
            clas = p['class'][0]
            html = p
        else:
            # Paragraphs without a class are still recorded, under a column
            # whose label is None.
            clas = None
            html = None

        row = pd.DataFrame({
            'file_name': [file_name],
            'title': [title],
            clas: [html]
            })
        rows.append(row)

def batch_concat(dfs, batch_size=10000):
    """Concatenate many DataFrames in two stages to keep each concat small.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to stack row-wise. Any iterable is accepted; it is materialised
        into a list once.
    batch_size : int, default 10000
        Number of frames combined per intermediate concat. Must be >= 1.

    Returns
    -------
    pandas.DataFrame
        All input rows with a fresh 0..n-1 index. An empty input returns an
        empty DataFrame instead of raising.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    frames = list(dfs)
    if not frames:
        # pd.concat refuses an empty list; an empty frame is the natural result.
        return pd.DataFrame()

    batches = [
        pd.concat(frames[start:start + batch_size], ignore_index=True)
        for start in range(0, len(frames), batch_size)
    ]
    return pd.concat(batches, ignore_index=True)

# Stack the per-paragraph single-row frames collected in `rows` into one DataFrame.
df = batch_concat(rows)

In [3]:
%%time
# The frame has to be converted to str just for writing to Postgres; the
# write that populates 'raw_fam_html' is kept here, commented out, for reference:
# df.astype(str).replace('nan', None).to_sql('raw_fam_html', engine(), if_exists='replace', index=False)
# Load the stored table back into `df`.
# NOTE(review): engine() is presumably supplied by the star import from db_conn.connection; confirm.
df = pd.read_sql('select * from raw_fam_html', engine())

CPU times: user 1.53 s, sys: 441 ms, total: 1.97 s
Wall time: 2.13 s


In [4]:
# Keep only columns that have at least 10 non-null values.
df = df.dropna(axis='columns', thresh=10)

# Remove the remaining columns that are not needed.
unneeded_columns = [
    'FAMTable10pt',
    'FAMTable10pt3ptBeforeAfter',
    'MsoFooter',
    'MsoNormal',
    '8ptspacer',
    'FAMBullet',
    'FAMTableBullet',
    'HeaderFooterClassificationIndicator',
]
df = df.drop(columns=unneeded_columns)

In [5]:
# Coalesce the body-text style columns into one: for each row, take the first
# non-null value in the priority order listed here.
body_sources = [
    'FAMBodyTextabc',
    'FAMBodyText123',
    'FAMBodyTexta',
    'FAMBodyTextA0',
    'FAMBodyText',
    'FAMBodyText1230',
    'FAMBodyTableText',
    'FAMBodyTexti',
    'FAMBodyBlockquote',
    'TableText11FL',
]

coalesced = df[body_sources[0]]
for source in body_sources[1:]:
    coalesced = coalesced.fillna(df[source])

df['body_text'] = coalesced

In [7]:
# Coalesce the heading style columns into one: for each row, take the first
# non-null value in the priority order listed here.
header_sources = [
    'FAMHeading20SubchapterNumber',
    'FAMHeading20CxSpFirst',
    'FAMHeading20CxSpLast',
    # 'FAMCTLineCentered' is left out here (original note: fillna with the ctline flush below)
    'FAMHeading18',
    # 'FAMCTLineFlush' is left out here: it needs to be a separate column
    'FAMHeading16',
    'FAMHeading14',
    'FAMHeading22',
    'FAMHeading20',
    'FAMHeading20SubchapterNumberCxSpFirst',
    'FAMHeading20SubchapterNumberCxSpLast',
    'FAMHeadingExhibit',
]

coalesced = df[header_sources[0]]
for source in header_sources[1:]:
    coalesced = coalesced.fillna(df[source])

df['header'] = coalesced

In [8]:
# I think it's important to parse the HTML before forward-filling

# Extract the 'a' tag text and remaining 'p' tag text separately
def split_p_and_a(html_str):
    """Split the first <p> of an HTML fragment into anchor text and other text.

    Parameters
    ----------
    html_str : str or None
        Markup expected to contain a <p> element. ``None``, a float (such as
        the NaN pandas uses for missing cells) and empty input are treated as
        "no markup".

    Returns
    -------
    tuple
        ``(a_text, p_text)``. ``a_text`` is the stripped text of the first <a>
        inside the first <p>, or ``None`` when there is no anchor. ``p_text``
        is the stripped text of that <p> once the anchor is removed. Both are
        ``None`` when the input is missing or contains no <p>.
    """
    # Missing cells arrive as None or float NaN; neither can be parsed as markup.
    if html_str is None or isinstance(html_str, float) or not html_str:
        return None, None

    # Name the parser explicitly (the same one used to load the source pages)
    # rather than asking for the generic 'html' feature.
    soup = BeautifulSoup(html_str, 'lxml')
    p = soup.find('p')
    if p is None:
        return None, None

    anchor = p.a
    a_text = None
    if anchor is not None:
        a_text = anchor.get_text(strip=True)
        anchor.extract()  # Remove the 'a' tag so only the rest of 'p' remains

    p_text = p.get_text(strip=True)  # Text of 'p' after the anchor is removed
    return a_text, p_text

def _split_html_column(column):
    """Run split_p_and_a over a Series and return (a_texts, p_texts) lists.

    Null cells are passed through as (None, None) rather than being cast to
    the strings 'None'/'nan' and parsed as if they were markup. The results
    are plain lists, which avoids building one pd.Series per row.
    """
    a_texts, p_texts = [], []
    for value, is_missing in zip(column, column.isna()):
        a_text, p_text = (None, None) if is_missing else split_p_and_a(str(value))
        a_texts.append(a_text)
        p_texts.append(p_text)
    return a_texts, p_texts


# Lists are assigned positionally, so the new columns line up with df row by row.
df['header_a_text'], df['header_p_text'] = _split_html_column(df['header'])
df['body_a_text'], df['body_p_text'] = _split_html_column(df['body_text'])

In [None]:
# Everything is cast to str for the Postgres write; the 'nan'/'None' strings
# produced by that cast are turned back into real nulls. The dict form of
# replace() is used because replace(<scalar>, None) is interpreted differently
# across pandas versions (older releases pad-fill instead of inserting None).
df.astype(str).replace({'nan': None, 'None': None}).to_sql('raw_fam_parsed', engine(), if_exists='replace', index=False)

In [None]:
# Forward-fill the heading-style columns so each row carries the most recent
# heading seen above it.
# TODO: still need to figure out how to not overfill (e.g. a lower-level heading
# that keeps being carried after a new higher-level heading starts).
ffill_columns = [
    'FAMHeading20SubchapterNumber',
    'FAMHeading20CxSpFirst',
    'FAMHeading20CxSpLast',
    'FAMCTLineCentered',
    'FAMHeading18',
    'FAMCTLineFlush',
    'FAMHeading16',
    'FAMHeading14',
    'FAMHeading22',
    'FAMHeading20',
    'FAMHeading20SubchapterNumberCxSpFirst',
    'FAMHeading20SubchapterNumberCxSpLast',
    'FAMBullet',
    'FAMHeadingExhibit',
]

# Only fill columns that still exist: 'FAMBullet' is dropped in an earlier cell
# and dropna(thresh=10) may have removed others, so indexing them directly
# would raise KeyError.
present_columns = [name for name in ffill_columns if name in df.columns]

if present_columns:
    # Fill within each source file so the last heading of one file is never
    # carried into the first rows of the next file.
    df[present_columns] = df.groupby('file_name', sort=False)[present_columns].ffill()