In [1]:
import cloudscraper
from bs4 import BeautifulSoup
import PyPDF2
import re
import os
import json
import torch
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reading the target url to find the pdf report and Downloading the target report once found 
target_url = "https://www.congress.gov/crs-appropriations-status-table/2026"
# Looking for the number "119-217" to match the report, number allows for partial matching, avoiding common errors with spaces and formating changes
target_report = "119-217" 
pdf_report = "appropriations-status-table.pdf"

def download_and_save():
    # Initializing Cloudscraper to bypass WAF (Web Application Firewall)
    scraper = cloudscraper.create_scraper() 
    print(f"1. Accessing {target_url}...")
    
    try:
        # Getting to the main page
        response = scraper.get(target_url)
        if response.status_code != 200:
            print(f" Failed to connect. Status code: {response.status_code}")
            return False
            
        print("   Connected to website.")

        # Parsing the HTML to find the PDF link by using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        found_link = None
        
        # Looking for a link containing "119-161"
        for link in soup.find_all('a', href=True):
            if target_report in link.get_text():
                found_link = link['href']
                print(f"   Found Link: {link.get_text().strip()}")
                break
        
        if not found_link:
            print(f"  Error: Could not find a link with text '{target_report}'") 
            return False

        # Fix link if it starts with "/", prepend the base domain to make it a valid URL
        if found_link.startswith("/"):
            found_link = "https://www.congress.gov" + found_link

        # Downloading the actual file
        print(f"2. Downloading PDF from: {found_link}")
        pdf_data = scraper.get(found_link)
        
        # Saving to disk
        with open(pdf_report, 'wb') as f:
            f.write(pdf_data.content)
            
        print(f"   SUCCESS: File saved as '{pdf_report}'")
        return True

    except Exception as e:
        print(f" Crash: {e}")
        return False

if __name__ == "__main__":
    download_and_save()

1. Accessing https://www.congress.gov/crs-appropriations-status-table/2026...
   Connected to website.
   Found Link: H. Rept. 119-217
2. Downloading PDF from: https://www.congress.gov/119/crpt/hrpt217/CRPT-119hrpt217.pdf
   SUCCESS: File saved as 'appropriations-status-table.pdf'


In [3]:
#This step Parse the downloaded PDF and retrieve all instances of the phrase “International Broadcasting Operations” or close variants (e.g., different capitalization, minor word-order changes).
# It also outputs each instance by page numners.

pdf_reportfile = "appropriations-status-table.pdf"
search_phrase = "International Broadcasting Operations"

def clean_text_for_search(text):
    """
    Standardizes text so we can find phrases split by lines.
    """
    if not text: return ""
    # Fixing hyphenation (Inter-\nnational -> International)
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)
    # Fixing newlines (word\nword -> word word)
    text = text.replace('\n', ' ')
    # Removing extra spaces in between
    text = re.sub(r'\s+', ' ', text)
    return text

def verify_matches():
    print(f"1. Loading {pdf_reportfile}...")
    if not os.path.exists(pdf_reportfile):
        print(" Error: File missing.")
        return
    # Compiling regex with IGNORECASE so we match "International" or "international"
    reader = PyPDF2.PdfReader(pdf_reportfile)
    search_pattern = re.compile(re.escape(search_phrase), re.IGNORECASE)
    
    match_count = 0
    print(f"2. Scanning {len(reader.pages)} pages for '{search_phrase}'...")

    for i, page in enumerate(reader.pages):
        raw_text = page.extract_text()
        if not raw_text: continue

        # cleaning the text first before searching
        cleaned_text = clean_text_for_search(raw_text)
        
        # counting matches on this page
        matches = list(search_pattern.finditer(cleaned_text))
        if matches:
            match_count += len(matches)
            print(f"   Found {len(matches)} match(es) on Page {i+1}")

    print(f"\n3. diagnostics complete. Total Matches Found: {match_count}")
    # expected matches is 6 from the manual check performed

if __name__ == "__main__":
    verify_matches()

1. Loading appropriations-status-table.pdf...
2. Scanning 164 pages for 'International Broadcasting Operations'...
   Found 1 match(es) on Page 2
   Found 2 match(es) on Page 28
   Found 1 match(es) on Page 95
   Found 1 match(es) on Page 134
   Found 1 match(es) on Page 141

3. diagnostics complete. Total Matches Found: 6


In [4]:
#This step is for Extract 150 words before and 150 words after the match.
    #Capture the page number on which the instance appears.
    #Identify and extract any numbers within that surrounding text and store them separately (as numerical data, not strings).
    # saving them in json format named as "broadcasting_budget_data.json"

pdf_reportfile= "appropriations_report.pdf"
search_phrase = "International Broadcasting Operations"
json_output = "broadcasting_budget_data.json"

# Number Extraction
def extract_numbers(text):
    """
    Parses unstructured text to identify financial figures and years.
    Returns a list of floats for numerical analysis.
    """
    # \$?          -> Optional dollar sign
    # \d{1,3}      -> Leading digits (1-3 digits)
    # (?:,\d{3})* -> Optional thousands groups (e.g., ,000)
    # (?:\.\d+)?   -> Optional decimal cents (e.g., .50)

    pattern = r'\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?'
    matches = re.findall(pattern, text)
    valid_numbers = []
    for m in matches:
        clean = m.replace('$', '').replace(',', '')
        try:
            valid_numbers.append(float(clean))
        except ValueError:
            continue
    return valid_numbers

# Running the search
def run_pipeline():
    print(f"1. Processing {pdf_reportfile}...")
    reader = PyPDF2.PdfReader(pdf_reportfile)
    results = []
    
    search_pattern = re.compile(re.escape(search_phrase), re.IGNORECASE)

    for i, page in enumerate(reader.pages):
        raw_text = page.extract_text()
        if not raw_text: continue

        # Cleaning text from previous cell
        cleaned_text = clean_text_for_search(raw_text)
        
        # Finding & Extracting
        for match in search_pattern.finditer(cleaned_text):
            
            # Contexting Slicing (150 words)
            all_words = cleaned_text.split()
            match_start_char = match.start()
            
            # Estimating word index from char index
            words_before_list = cleaned_text[:match_start_char].split()
            current_word_idx = len(words_before_list)
            
            # Calculating window
            start = max(0, current_word_idx - 150)
            phrase_len = len(search_phrase.split())
            end = min(len(all_words), current_word_idx + phrase_len + 150)
            
            # Building context string
            words_before = all_words[start : current_word_idx]
            words_after = all_words[current_word_idx + phrase_len : end]
            full_context = " ".join(words_before) + f" [[{match.group()}]] " + " ".join(words_after)
            
            # Extracting Numbers & Save
            entry = {
                "page_number": i + 1,
                "found_phrase": match.group(),
                "extracted_numbers": extract_numbers(full_context),
                "context_text": full_context
            }
            results.append(entry)

    # saving output
    with open(json_output, "w", encoding='utf-8') as f:
        json.dump(results, f, indent=4)
        
    print(f"2. Done! Found {len(results)} items.")
    print(f"   Results saved to: {json_output}")

if __name__ == "__main__":
    run_pipeline()

1. Processing appropriations_report.pdf...
2. Done! Found 6 items.
   Results saved to: broadcasting_budget_data.json


In [5]:
#This is the bonus step to summarize the context around each instance of the phrase "International Broadcasting Operations", specifically whether and how the found numbers affect the budget of that program.
json_output = "broadcasting_budget_data.json"
json_output_aisummary = "broadcasting_budget_with_summary.json"

# Removing all the dots in between
def remove_dot_leaders(text):
    """
    Removes the long rows of dots found in budget tables.
    Example: "Agency .................... $100" -> "Agency $100"
    """
    if not text: return ""
    # Regex: Find 3 or more dots in a row and replace with a single space
    clean_text = re.sub(r'\.{3,}', ' ', text)
    # Cleaning up multiple spaces that might result
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text.strip()

def run_ai_analysis():
    print(f"1. Loading data from {json_output}...")
    
    if not os.path.exists(json_output):
        print(" Error: Input JSON missing.")
        return

    with open(json_output, "r", encoding="utf-8") as f:
        data = json.load(f)

    print(f"   Loaded {len(data)} records.")

    #Loading AI Model
    print("2. Loading AI Brain (this takes a moment)...")
    try:
        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    except Exception as e:
        print(f" Error loading model: {e}")
        return

    print("3. Cleaning text and generating summaries...")
    
    for i, entry in enumerate(data):
        raw_context = entry['context_text']
        
        # Cleanning Steps
        # Removing the [[ ]] markers
        text_no_markers = raw_context.replace("[[", "").replace("]]", "")
        # Removing the ugly dots (................)
        clean_input = remove_dot_leaders(text_no_markers)
        
        # updating the context in the data so final JSON looks pretty
        entry['context_text_clean'] = clean_input 
        
        # Truncating to 1024 chars for the AI
        ai_input = clean_input[:1024]
        
        try:
            # Generating Summary
            summary_output = summarizer(ai_input, max_length=60, min_length=20, do_sample=False)
            summary_text = summary_output[0]['summary_text']
            
            # Post-process summary (sometimes the AI repeats the dots if they were sneaky)
            entry['ai_summary'] = remove_dot_leaders(summary_text)
            
            print(f"   Summarized Record {i+1} (Page {entry['page_number']})")
            
        except Exception as e:
            print(f"Could not summarize Record {i+1}: {e}")
            entry['ai_summary'] = "Error generating summary."

    #saving final json report with ai summary
    with open(json_output_aisummary, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
        
    print(f"\n Final report saved to: {json_output_aisummary}")

if __name__ == "__main__":
    run_ai_analysis()

1. Loading data from broadcasting_budget_data.json...
   Loaded 6 records.
2. Loading AI Brain (this takes a moment)...


Device set to use cpu


3. Cleaning text and generating summaries...
   Summarized Record 1 (Page 2)
   Summarized Record 2 (Page 28)
   Summarized Record 3 (Page 28)
   Summarized Record 4 (Page 95)
   Summarized Record 5 (Page 134)
   Summarized Record 6 (Page 141)

 Final report saved to: broadcasting_budget_with_summary.json
