In [None]:
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import re
import chardet
import pandas as pd
import time
import os
from datetime import datetime
from db_conn.connection import *


async def download_toc_html():
    '''
    Download the table-of-contents page for each volume linked from the saved home page.

    Reads ./html_source/home.html, collects every distinct <a href> whose value
    matches ``details/<two digits>fa[mh]`` (case-insensitive), loads each one
    under https://fam.state.gov in a visible Chromium window and writes the
    prettified page source to ./html_source/toc/<last path segment>.html.
    The output directory is created if it does not exist.
    '''
    toc_pattern = re.compile(r'details/\d\dfa[mh]', re.IGNORECASE)
    with open('./html_source/home.html', 'r') as f:
        soup = BeautifulSoup(f)

    # <a> tags without an href yield None from .get(); re.search(None) would
    # raise TypeError, so those anchors are skipped explicitly.
    hrefs = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href and toc_pattern.search(href):
            hrefs.add(href)

    os.makedirs('./html_source/toc', exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            # sorted() only makes the download order reproducible between runs.
            for href in sorted(hrefs):
                await page.goto(f'https://fam.state.gov{href}')
                # goto() resolves once the page has loaded, but the original
                # author observed that a short extra pause was needed before
                # the HTML was complete. Keep the 1 s pause, but await it so
                # the event loop is not blocked the way time.sleep() does.
                await page.wait_for_timeout(1000)
                source = await page.content()
                soup = BeautifulSoup(source)
                name = href.split('/')[-1]
                with open(f'./html_source/toc/{name}.html', 'w') as f:
                    f.write(soup.prettify())
        finally:
            await browser.close()


def get_subchapter_hrefs():
    '''
    Open each saved table-of-contents html file and capture every href that
    points to a subchapter page.

    A link is kept when it matches ``<n>FAM/<n>FAM<n>.html`` or
    ``/FAM/<n>FAH<n>/<n>FAH<n>.html`` (case-insensitive). Each kept link is
    prefixed with https://fam.state.gov, de-duplicated, sorted, and written to
    urls.csv under a single ``url`` column (the default integer index is
    written as well).
    '''
    toc_dir = './html_source/toc/'
    # Both link shapes in one pattern; the dot before "html" is escaped so it
    # only matches a literal ".".
    subchapter_pattern = re.compile(
        r'\d+FAM/\d+FAM\d+\.html|/FAM/\d+FAH\d+/\d+FAH\d+\.html',
        re.IGNORECASE,
    )

    # A set, because the same subchapter can be linked from several TOC pages
    # and each duplicate row would trigger a redundant download later.
    hrefs = set()
    for name in os.listdir(toc_dir):
        # Only the .html files are TOC pages; skip directories and stray
        # files that may also live in the folder.
        if not name.endswith('.html'):
            continue
        with open(os.path.join(toc_dir, name), 'r') as f:
            soup = BeautifulSoup(f)
        for a in soup.find_all('a'):
            link = a.get('href')
            # Anchors without an href give None, which re cannot search.
            if link and subchapter_pattern.search(link):
                hrefs.add('https://fam.state.gov' + link)

    df = pd.DataFrame({'url': sorted(hrefs)})
    df.to_csv('urls.csv')


async def download_subchapter_html():
    '''
    Download every url listed in urls.csv into a folder named after today's date.

    Pages are saved as prettified html under
    ./html_source/subchapters/<MMDDYYYY>/<last path segment of the url>.
    A url whose file is already present in today's folder is skipped, so an
    interrupted run can be resumed by calling the function again.
    '''
    today = datetime.now().strftime('%m%d%Y')
    out_dir = f'./html_source/subchapters/{today}'
    # Create the folder for *today* (the date used for every later path), and
    # only report "already exists" when that is actually the case.
    if os.path.isdir(out_dir):
        print('folder for today already exists')
    else:
        os.makedirs(out_dir)

    df = pd.read_csv('urls.csv')
    urls = df.url.to_list()
    total = len(urls)

    # Files are stored under the url's last path segment, so that segment (not
    # the full url) is what must be compared against the directory listing.
    # Listing once and tracking additions avoids re-scanning the folder per url.
    already_saved = set(os.listdir(out_dir))

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            for i, url in enumerate(urls):
                filename = url.split('/')[-1]
                if filename in already_saved:
                    continue
                print(f'downloading ({i} / {total}) {url}')
                await page.goto(url)
                source = await page.content()
                soup = BeautifulSoup(source)
                with open(os.path.join(out_dir, filename), 'w') as f:
                    f.write(soup.prettify())
                already_saved.add(filename)
        finally:
            await browser.close()


In [7]:
# Step 1: save the table-of-contents pages linked from ./html_source/home.html.
# NOTE: a bare top-level `await` only works in a notebook/IPython session.
await download_toc_html()

In [9]:
# Step 2: scan the saved TOC pages in ./html_source/toc/ and write the subchapter urls to urls.csv.
get_subchapter_hrefs()

In [31]:
# Step 3: fetch each url from urls.csv into ./html_source/subchapters/<MMDDYYYY>/.
# NOTE: a bare top-level `await` only works in a notebook/IPython session.
await download_subchapter_html()

folder for today already exists
downloading (0 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0010.html
downloading (1 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0020.html
downloading (2 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0030.html
downloading (3 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0040.html
downloading (4 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0050.html
downloading (5 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0110.html
downloading (6 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0120.html
downloading (7 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0130.html
downloading (8 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0140.html
downloading (9 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0150.html
downloading (10 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0160.html
downloading (11 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0170.html
downloading (12 / 1501) https://fam.state.gov/FAM/01FAM/01FAM0210.html
downloading (13 / 1501) https://fam.state.gov/FAM/01FAM