In [15]:
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import re
import chardet
import pandas as pd
import time
import os
from db_conn.connection import *

In [16]:
async def download_toc_html():

    with open('./html_source/home.html', 'r') as f:
        soup = BeautifulSoup(f)

    hrefs = []
    for x in soup.find_all('a'):
        if re.search(r'details/\d\dfa[mh]', x.get('href'), re.IGNORECASE):
            hrefs.append(x.get('href'))

    hrefs = set(hrefs)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()
        for x in hrefs:
            await page.goto(f'https://fam.state.gov{x}')
            time.sleep(1) # i think this is necessary for all the html to load... although I thought using await was supposed to prevent that....
            source = await page.content()
            soup = BeautifulSoup(source)
            with open(f'./html_source/toc/{x.split('/')[-1]}.html', 'w') as f:
                f.write(soup.prettify())



def get_subchapter_hrefs():
    hrefs = []
    for x in os.listdir('./html_source/toc/'):
        with open(f'./html_source/toc/{x}', 'r') as f:
            soup = BeautifulSoup(f)
            
        for a in soup.find_all('a'):
            link = a.get('href')

            if re.search(r'\d+FAM/\d+FAM\d+.html', link, re.IGNORECASE) or re.search(r'/FAM/\d+FAH\d+/\d+FAH\d+.html', link, re.IGNORECASE):
                hrefs.append('https://fam.state.gov'+link)
    
    df = pd.DataFrame({'url': hrefs}).sort_values(by='url').reset_index(drop=True)
    df.to_csv('urls.csv')



async def download_subchapter_html():
    df = pd.read_csv('urls.csv')
    urls = df.url.to_list()
    total = len(urls)
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        for i, url in enumerate(urls):
            await page.goto(url)
            source = await page.content()
            soup = BeautifulSoup(source)
            with open(f'./html_source/subchapters/{url.split('/')[-1]}', 'w') as f:
                print(f'downloading ({i} / {total}) {url}')
                f.write(soup.prettify())


In [None]:
rows = []
files = [x for x in os.listdir('./html_source/subchapters') if x != '.DS_Store']

for x in files[:10]:
    # print(x)
    with open(f'./html_source/subchapters/{x}', 'rb') as f:
        soup = BeautifulSoup(f)

    _title = soup.find('title').text
    _22 = None
    _20_1 = None
    _20 = None
    _18 = None
    _16 = None
    _14 = None
    _body = None

    for y in soup.find_all('p'):
        
        if y.get('class')[0] == 'FAMHeading22':
            _22 = y.text
            _20_1 = None
            _20 = None
            _18 = None
            _16 = None
            _14 = None
            _body = None

        if y.get('class')[0] == 'FAMHeading20SubchapterNumber':
            _20_1 = y.text
            _20 = None
            _18 = None
            _16 = None
            _14 = None
            _body = None

        if 'FAMHeading20' in y.get('class')[0]: #and 'Subchapter' not in y.get('class')[0]:
            _20 = y.text
            _18 = None
            _16 = None
            _14 = None
            _body = None

        if y.get('class')[0] == 'FAMHeading18':
            _18 = y.text
            _16 = None
            _14 = None
            _body = None

        if y.get('class')[0] == 'FAMHeading16':
            _16 = y.text
            _14 = None
            _body = None

        if y.get('class')[0] == 'FAMHeading14':
            _14 = y.text
            _body = None


        if 'BodyText' in y.get('class')[0]:
            _body = y.text

        row = pd.DataFrame({
                'file_name':        [x],
                'title':            [_title],
                'heading_22':       [_22],
                'heading_20_scn':   [_20_1],
                'heading_20':       [_20],
                'heading_18':       [_18],
                'heading_16':       [_16],
                'heading_14':       [_14],
                'body_text':        [_body]
            })

        rows.append(row)


In [13]:
df = pd.concat(rows, ignore_index=True)

In [14]:
df

Unnamed: 0,file_name,title,heading_22,heading_20_scn,heading_20,heading_18,heading_16,heading_14,body_text
0,04FAM0830.html,\n 4 FAM 830 EMERGENCY EVACUATION FISCAL POL...,,,,,,,
1,04FAM0830.html,\n 4 FAM 830 EMERGENCY EVACUATION FISCAL POL...,,\n\n 4 FAM 830\n \n,\n\n 4 FAM 830\n \n,,,,
2,04FAM0830.html,\n 4 FAM 830 EMERGENCY EVACUATION FISCAL POL...,,\n\n 4 FAM 830\n \n,\n EMERGENCY EVACUATION\n,,,,
3,04FAM0830.html,\n 4 FAM 830 EMERGENCY EVACUATION FISCAL POL...,,\n\n 4 FAM 830\n \n,\n FISCAL POLICY\n,,,,
4,04FAM0830.html,\n 4 FAM 830 EMERGENCY EVACUATION FISCAL POL...,,\n\n 4 FAM 830\n \n,\n FISCAL POLICY\n,,,,
...,...,...,...,...,...,...,...,...,...
878,14FAH050120.html,\n 14 FAH-5 H-120 DPO MAILROOM FACILITIES\n,,\n\n 14 FAH-5 H-120\n \n,\n dpo Mailroom facilitieS\n,\n\n 14 FAH-5 H-122\n \n THROUGH H-1...,,,
879,14FAH050120.html,\n 14 FAH-5 H-120 DPO MAILROOM FACILITIES\n,,\n\n 14 FAH-5 H-120\n \n,\n dpo Mailroom facilitieS\n,\n\n 14 FAH-5 H-122\n \n THROUGH H-1...,,,
880,14FAH050120.html,\n 14 FAH-5 H-120 DPO MAILROOM FACILITIES\n,,\n\n 14 FAH-5 H-120\n \n,\n dpo Mailroom facilitieS\n,\n\n 14 FAH-5 H-122\n \n THROUGH H-1...,,,
881,14FAH050120.html,\n 14 FAH-5 H-120 DPO MAILROOM FACILITIES\n,,\n\n 14 FAH-5 H-120\n \n,\n dpo Mailroom facilitieS\n,\n\n 14 FAH-5 H-122\n \n THROUGH H-1...,,,


In [5]:
df['header'] = df['heading_14'].fillna(df['heading_16']).fillna(df['heading_18']).fillna(df['heading_20'])

In [6]:
df

Unnamed: 0,file_name,heading_22,heading_20_scn,heading_20,heading_18,heading_16,heading_14,body_text,header
0,04FAM0830.html,,,,,,,,
1,04FAM0830.html,,\n\n 4 FAM 830\n \n,\n\n 4 FAM 830\n \n,,,,,\n\n 4 FAM 830\n \n
2,04FAM0830.html,,\n\n 4 FAM 830\n \n,\n EMERGENCY EVACUATION\n,,,,,\n EMERGENCY EVACUATION\n
3,04FAM0830.html,,\n\n 4 FAM 830\n \n,\n FISCAL POLICY\n,,,,,\n FISCAL POLICY\n
4,04FAM0830.html,,\n\n 4 FAM 830\n \n,\n FISCAL POLICY\n,,,,,\n FISCAL POLICY\n
...,...,...,...,...,...,...,...,...,...
249828,03FAH012110.html,\n\n 3 FAH-1 H-2100\n \n\n EMPLOYMEN...,\n\n 3 FAH-1 H-2110\n \n,\n Drug-Free Workplace\n,\n\n 3 FAH-1 H-2112\n \n THROUGH H-2...,,,\n\n XV. LIST OF POSITIONS DESIGNATED FO...,\n\n 3 FAH-1 H-2112\n \n THROUGH H-2...
249829,03FAH012110.html,\n\n 3 FAH-1 H-2100\n \n\n EMPLOYMEN...,\n\n 3 FAH-1 H-2110\n \n,\n Drug-Free Workplace\n,\n\n 3 FAH-1 H-2112\n \n THROUGH H-2...,,,\n\n XV. LIST OF POSITIONS DESIGNATED FO...,\n\n 3 FAH-1 H-2112\n \n THROUGH H-2...
249830,03FAH012110.html,\n\n 3 FAH-1 H-2100\n \n\n EMPLOYMEN...,\n\n 3 FAH-1 H-2110\n \n,\n Drug-Free Workplace\n,\n\n 3 FAH-1 H-2112\n \n THROUGH H-2...,,,\n\n XV. LIST OF POSITIONS DESIGNATED FO...,\n\n 3 FAH-1 H-2112\n \n THROUGH H-2...
249831,03FAH012110.html,\n\n 3 FAH-1 H-2100\n \n\n EMPLOYMEN...,\n\n 3 FAH-1 H-2110\n \n,\n Drug-Free Workplace\n,\n\n 3 FAH-1 H-2112\n \n THROUGH H-2...,,,\n A complete list of all TDP positions by ...,\n\n 3 FAH-1 H-2112\n \n THROUGH H-2...


In [None]:
df = df.replace(r'\n', '', regex=True)

In [None]:
df.to_sql('raw_fam_parsed', engine(), if_exists='replace', index=False)