# Appropriations Bill Scraper
This notebook downloads the 2022 edition of the US Code from govinfo.gov as HTML files (one file per title), then extracts the raw text from those HTML files.


In [None]:
# Import Libraries
from bs4 import BeautifulSoup
import os
import requests

In [None]:
# Download a file at the given url and save to output path
def download_file(url, output_path, timeout=60):
    """Download ``url`` and write the response body to ``output_path``.

    Args:
        url: Address to fetch with an HTTP GET request.
        output_path: Destination file path. Its parent directory is created
            when it does not exist yet.
        timeout: Seconds to wait on the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    A failed download (network error or a status code other than 200) is
    reported with ``print`` and nothing is written to disk.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report network problems the same way as HTTP errors so that one
        # bad URL does not abort a whole batch of downloads.
        print(f"Failed to download {url}: {error}")
        return

    # Anything other than HTTP 200 is treated as a failed download
    if response.status_code != 200:
        print(f"Failed to download {url}")
        return

    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when output_path actually contains one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write the raw bytes so the file is saved exactly as served
    with open(output_path, 'wb') as file:
        file.write(response.content)

# Download the HTML files
def download_html_files(titles=range(1, 42), output_dir='html_files'):
    """Download the US Code 2022 HTML file for each requested title.

    Args:
        titles: Iterable of title numbers to fetch. Defaults to 1 through 41.
        output_dir: Directory the ``.htm`` files are saved into. It is
            created when it does not exist yet.

    Each file is fetched with ``download_file`` and stored as
    ``<output_dir>/USCODE-2022-title<N>.htm``.
    """
    # NOTE(review): the default range stops at title 41, while the US Code
    # has higher-numbered titles as well -- confirm the cut-off is intended.

    # Base URL
    base_url = "https://www.govinfo.gov/content/pkg/"

    # Create a directory to store the HTML files
    os.makedirs(output_dir, exist_ok=True)

    for title in titles:
        # The package name appears both in the URL path and in the file name
        package = f"USCODE-2022-title{title}"
        url = f"{base_url}{package}/html/{package}.htm"
        output_path = f"{output_dir}/{package}.htm"

        print(f"Downloading {url}...")
        download_file(url, output_path)

# Function to extract text
def extract_text_from_html(file_path):
    """Return the visible text of the HTML file at ``file_path``.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``html.parser``.
    ``<script>`` and ``<style>`` elements are dropped before the text is
    collected. Every line is stripped, split on double spaces, and the
    non-empty fragments are joined with newlines.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        document = BeautifulSoup(handle.read(), 'html.parser')

    # Script and style bodies are not readable content, so remove them
    for unwanted in document.find_all(["script", "style"]):
        unwanted.decompose()

    fragments = []
    for raw_line in document.get_text().splitlines():
        # A double space separates phrases that share a single line
        for phrase in raw_line.strip().split("  "):
            cleaned = phrase.strip()
            # Blank fragments are dropped
            if cleaned:
                fragments.append(cleaned)

    return '\n'.join(fragments)

# Extract text from the HTML files
def extract_text_from_html_files(titles=range(1, 42), html_dir='html_files',
                                 text_dir='text_files'):
    """Convert the downloaded title HTML files into plain-text files.

    Args:
        titles: Iterable of title numbers to process. Defaults to 1 through 41.
        html_dir: Directory containing the ``USCODE-2022-title<N>.htm`` files.
        text_dir: Directory the ``.txt`` files are written into. It is
            created when it does not exist yet.

    A title whose HTML file is missing is reported with ``print`` and
    skipped, so the remaining titles are still processed.
    """
    # Create a directory to store the text files
    os.makedirs(text_dir, exist_ok=True)

    for title in titles:
        package = f'USCODE-2022-title{title}'
        file_path = f'{html_dir}/{package}.htm'

        try:
            text = extract_text_from_html(file_path)
        except FileNotFoundError:
            # A failed download leaves no file behind; skip that title
            # instead of aborting the whole extraction run.
            print(f"Skipping {file_path}: file not found")
            continue

        with open(f'{text_dir}/{package}.txt', 'w', encoding='utf-8') as file:
            file.write(text)

In [None]:
# Download the HTML files into the html_files/ directory (sends HTTP requests)
download_html_files()

In [None]:
# Extract the raw text from the HTML files in html_files/ and write it to text_files/
extract_text_from_html_files()