Author: Sebastian Orozco

In [3]:
"""
Step 1: Load in API keys
"""

import os

# Key count that names the preferred key file; in this cell it only feeds the file name.
NUM_API_KEYS_TO_LOAD = 1000
# Preferred key file, named after the requested key count.
API_KEYS_FILE_PATH = f"data/keys/keys_{NUM_API_KEYS_TO_LOAD}.txt"
# Generic key file consulted when the count-specific file is absent.
FALLBACK_API_KEYS_FILE_PATH = "data/keys/keys.txt"

# Remains an empty list when nothing can be read, so the name is always defined.
loaded_api_keys = []

# Choose the preferred file when it exists; otherwise announce the switch to the fallback.
if os.path.exists(API_KEYS_FILE_PATH):
    actual_path_used = API_KEYS_FILE_PATH
else:
    print(f"File {API_KEYS_FILE_PATH} not found, trying fallback {FALLBACK_API_KEYS_FILE_PATH}")
    actual_path_used = FALLBACK_API_KEYS_FILE_PATH

try:
    with open(actual_path_used, "r") as key_file:
        # Strip every line first, then keep only those that still contain text,
        # so blank lines are never counted as keys.
        stripped_lines = (raw_line.strip() for raw_line in key_file)
        loaded_api_keys = [candidate for candidate in stripped_lines if candidate]
    print(f"Successfully loaded {len(loaded_api_keys)} API keys from {actual_path_used}.")
except FileNotFoundError:
    # The key list stays empty here; cells that need keys may fail later.
    print(f"Error: API key file not found at {actual_path_used}. Please ensure the file exists")
except Exception as load_error:
    print(f"An error occurred while loading API keys: {load_error}")

# Sanity check: report how many keys ended up available.
print(f"Total API keys available: {len(loaded_api_keys)}")

File data/keys/keys_1000.txt not found, trying fallback data/keys/keys.txt
Successfully loaded 217 API keys from data/keys/keys.txt.
Total API keys available: 217


In [4]:
"""
Step 2: Gather headers for all dockets

This section is commented out because we only need to pull all docket headers once
"""

# from comments_downloader import CommentsDownloader

# # Init downloader with a specific API key (replace with a valid key if uncommenting)
# downloader = CommentsDownloader(api_key="[YOUR API KEY]")

# # Define parameters for filtering dockets, e.g., by agency ID
# params = {'filter[agencyId]': 'EPA'}

# # Gather headers for dockets matching the parameters and save to a CSV file
# # This is typically a one-time operation or run infrequently.
# downloader.gather_headers(
#     data_type="dockets", 
#     params=params, 
#     csv_filename="data/headers/EPA_All_Headers_Up_To_2025-02-11.csv"
# )
# print("Header gathering complete.")

'\nStep 2: Gather headers for all dockets\n\nThis section is commented out because we only need to pull all docket headers once\n'

In [5]:
"""
Step 3: Load docket IDs from a CSV file

Reads in all dockets so that we can later fetch comments for each one
"""

from comments_downloader import CommentsDownloader 

# CSV of docket headers that this cell reads the docket IDs from.
HEADERS_CSV_PATH = "data/headers/EPA_All_Headers_Up_To_2025-02-11.csv"

# Only the downloader's local CSV-parsing method is called here, so an empty API key is passed.
downloader_utility = CommentsDownloader(api_key="")

# Default to an empty list so the name exists even when loading fails.
docket_ids = []
try:
    docket_ids = downloader_utility.get_ids_from_csv(
        HEADERS_CSV_PATH,
        data_type="dockets",
    )
    print(f"Successfully loaded {len(docket_ids)} docket IDs from {HEADERS_CSV_PATH}.")
except FileNotFoundError:
    print(f"Error: headers CSV file not found at {HEADERS_CSV_PATH}.")
except Exception as load_error:
    print(f"An error occurred while loading docket IDs: {load_error}")

# Sanity check: report the total number of docket IDs now in memory.
print("Loading in ", len(docket_ids), " many dockets.")

Successfully loaded 20682 docket IDs from data/headers/EPA_All_Headers_Up_To_2025-02-11.csv.
Loading in  20682  many dockets.


In [None]:
"""
Step 4: Concurrently pull all comments for every docket

Here, we wrap the concurrent downloader in a loop to wait for the API rate limit to reset before fetching comments again
"""

import time
import glob
import shutil
from concurrent_downloader import run_concurrent_downloading_main


chunk_size = 200
chunks = []
for i in range(0, len(docket_ids), chunk_size):
    # print("chunk = ", docket_ids[i:i+chunk_size])
    chunks += [docket_ids[i:i+chunk_size]]

# Let limits reset before use
# time.sleep(60*60)


# each api call to this wrapper
# it starts pulling all the data from the start

# for each docket -> many documents -> many comments


for i, chunk in enumerate(chunks):
    print(f"\n=== Processing chunk {i+1}/{len(chunks)} ({len(chunk)} dockets) ===")

    await run_concurrent_downloading_main(all_docket_ids=chunk, all_api_keys=loaded_api_keys)

    # Cleanup files generated in error by comment_downloader script
    for file in glob.glob("document_headers_*.csv"):
        try:
            os.remove(file)
        except OSError as e:
            print(f"Error deleting {file}: {e}")

    # # Cleanup anything left in temp
    try:
        shutil.rmtree("data/temp_dbs")
    except FileNotFoundError:
        print("data/temp_dbs not found, skipping.")
    except Exception as e:
        print(f"Error deleting data/temp_dbs: {e}")

    # Wait for API rate limits to reset
    print(f"Waiting for 1 hour before next chunk to avoid rate limits...")
    time.sleep(60*60)




=== Processing chunk 1/101 (200 dockets) ===
--- Starting Concurrent Docket Processing ---
Configured to use up to 200 concurrent workers.
Divided 200 dockets into 200 chunks of up to 1 dockets each.

Launching 200 processing tasks...
2025-06-20 23:01:30: Getting documents associated with docket EPA-R06-OAR-2011-0033...

2025-06-20 23:01:30: Getting documents associated with docket EPA-R06-OAR-2014-0700...



2025-06-20 23:01:30: Getting documents associated with docket EPA-R05-OAR-2011-0888...

2025-06-20 23:01:30: Getting documents associated with docket EPA-R05-OAR-2014-0662...

2025-06-20 23:01:30: Getting documents associated with docket EPA-R08-OAR-2013-0801...

2025-06-20 23:01:30: Getting documents associated with docket EPA-R01-OAR-2014-0275...

2025-06-20 23:01:30: Getting documents associated with docket EPA-R05-OAR-2014-0123...

2025-06-20 23:01:30: Getting documents associated with docket EPA-R08-OAR-2014-0811...

2025-06-20 23:01:30: Getting documents associated with docket EPA-R09-OAR-2015-0087...

2025-06-20 23:01:30: Getting documents associated with docket EPA-HQ-OPP-2014-0632...

2025-06-20 23:01:30: Getting documents associated with docket EPA-HQ-OPP-2014-0394...

2025-06-20 23:01:30: Getting documents associated with docket EPA-R09-OAR-2014-0586...

Found 3 documents...Found 4 documents...
Found 8 documents...
Found 2 documents...
Found 3 documents...

Found 9 documents

IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out


******************************
2025-06-21 01:50:42: Getting comments for document EPA-HQ-OPP-2010-0261-0001...

2025-06-21 01:50:42: Getting objectId for document EPA-HQ-OPP-2010-0261-0001...2025-06-21 01:50:42: Getting objectId for document EPA-HQ-OPP-2010-0057-0020...2025-06-21 01:50:42: Writing 5 records to document_headers_014910.csv...DONE retrieving all 0 comments from 16 document(s) for docket EPA-HQ-OPP-2010-0059----------------

2025-06-21 01:50:42: Getting documents associated with docket EPA-HQ-OPP-2010-0703...
2025-06-21 01:50:42: Writing 1 records to document_headers_014910.csv...
BANANADone

2025-06-21 01:50:43: Removing any duplicates in the CSV...Found 0 comments...

Done
2025-06-21 01:50:43: Finished: approximately 0 comments collected2025-06-21 01:50:43: Removing any duplicates in the CSV...
2025-06-21 01:50:43: Done. Removed 0 duplicate rows from document_headers_014910.csv.


Done getting all 0 comments for document EPA-HQ-OPP-2010-0527-0001----------------

2025-06

In [None]:
"""
Placeholder for sequential docket processing (deprecated)
"""

# This is the old sequential way of gathering comments

# # Example (ensure 'downloader' and 'docket_ids' are defined from previous cells):
# # Also, ensure the downloader is initialized with a valid API key.

# SINGLE_API_KEY = loaded_api_keys[0] if loaded_api_keys else "YOUR_FALLBACK_API_KEY"
# # downloader_sequential = CommentsDownloader(api_key=SINGLE_API_KEY)

# i = 0
# for docket_id in docket_ids[:100]: # Example: processing a slice of dockets
#     print(f"Sequentially processing docket: {docket_id}...")
#     try:
#         downloader_sequential = CommentsDownloader(api_key=loaded_api_keys[i])
#         i+=1
#         downloader_sequential.gather_comments_by_docket(
#             docket_id, 
#             db_filename="data/sequential/EPA_Comments_Sequential.db", # Use a different DB name for sequential
#             csv_filename=None # Optional CSV output
#         )
#         print(f"Successfully processed docket {docket_id} sequentially.")
#     except Exception as e:
#         print(f"Error processing docket {docket_id} sequentially: {e}")

# print("Sequential docket processing (if uncommented) would be complete.")

Sequentially processing docket: EPA-R06-OAR-2005-TX-0034...
2025-06-12 00:24:23: Getting documents associated with docket EPA-R06-OAR-2005-TX-0034...

Found 7 documents...
2025-06-12 00:24:24: Inserting 7 records into database...
2025-06-12 00:24:24: Writing 7 records to document_headers_002423.csv...Done
2025-06-12 00:24:24: Removing any duplicates in the CSV...
2025-06-12 00:24:24: Done. Removed 0 duplicate rows from document_headers_002423.csv.
2025-06-12 00:24:24: Finished: approximately 7 documents collected

Done----------------

******************************
2025-06-12 00:24:24: Getting comments for document EPA-R06-OAR-2005-TX-0034-0003...

2025-06-12 00:24:24: Getting objectId for document EPA-R06-OAR-2005-TX-0034-0003...Got it (09000064800a7431)
2025-06-12 00:24:25: Getting comment headers associated with document EPA-R06-OAR-2005-TX-0034-0003...

Found 0 comments...
2025-06-12 00:24:26: Finished: approximately 0 comments collected

Done getting all 0 comments for document E

In [None]:
# The general workflow after gathering comment metadata (which may include PDF URLs) is:
# 1. Identify comments with PDF attachments from the gathered data (e.g., from CSVs in 'documents' directory).
# 2. For each PDF:
#    a. Download the PDF file (if `download_pdfs=True` in PDFTextExtractor).
#    b. Extract text content from the PDF.
# 3. Store the extracted text, associating it with the original comment.
#    (e.g., in new CSV files, text files, or by updating the database).

In [None]:
"""
Placeholder for preparing list of CSV files containing comment data for PDF text extraction (deprecated)
"""

# This is the old way of collecting comments, from when they were spread across multiple CSVs. We can now collect them directly by visiting the attachment links for each comment in the db.

import os

# Configuration: folder that holds the CSV files with comment metadata.
COMMENTS_CSV_DIR = "documents"

# Names (not full paths) of the CSV files found directly inside COMMENTS_CSV_DIR.
csv_files_for_extraction = []
if not os.path.isdir(COMMENTS_CSV_DIR):
    print(f"Directory '{COMMENTS_CSV_DIR}' not found. No CSV files to process for PDF extraction.")
else:
    # Keep only the directory entries whose name ends in ".csv".
    for entry_name in os.listdir(COMMENTS_CSV_DIR):
        if entry_name.endswith(".csv"):
            csv_files_for_extraction.append(entry_name)
    print(f"Found {len(csv_files_for_extraction)} CSV files in '{COMMENTS_CSV_DIR}' for PDF text extraction.")
    # Show a short preview so the reader can confirm the right files were picked up.
    if len(csv_files_for_extraction) > 0:
        print(f"First few files: {csv_files_for_extraction[:5]}")

# The list 'csv_files_for_extraction' will be used in the next cell.

In [None]:
"""
Extract text from PDFs linked in comment metadata CSVs.
"""
from text_extractor import PDFTextExtractor # Ensure PDFTextExtractor is importable
import os

# TODO: Adapt this method to read directly from the EPA_Comments_Combined database instead of from CSV

# Configuration
# Root folder under which one sub-folder per input CSV is created.
BASE_OUTPUT_DIR_FOR_EXTRACTIONS = "documents_extracted_texts"
# Forwarded to PDFTextExtractor as its `download_pdfs` argument.
DOWNLOAD_PDFS_OPTION = False

print(f"\n--- Starting PDF Text Extraction Process ---")
if csv_files_for_extraction:
    os.makedirs(BASE_OUTPUT_DIR_FOR_EXTRACTIONS, exist_ok=True)
    total_files = len(csv_files_for_extraction)

    for position, comments_csv_filename in enumerate(csv_files_for_extraction, start=1):
        print(f"\nProcessing file {position}/{total_files}: {comments_csv_filename}")

        # Derive the output base name from the input CSV name,
        # e.g. "comments_on_EPA-XYZ.csv" becomes "EPA-XYZ".
        base_name = comments_csv_filename.replace("comments_on_", "").replace(".csv", "")

        # Each input CSV gets its own output folder, e.g. "documents_extracted_texts/EPA-XYZ/".
        specific_docket_output_dir = os.path.join(BASE_OUTPUT_DIR_FOR_EXTRACTIONS, base_name)
        os.makedirs(specific_docket_output_dir, exist_ok=True)

        # Bare file names only; the output directory is passed to the extractor separately.
        output_csv_name = f"{base_name}_extracted_texts.csv"
        output_txt_name = f"{base_name}_extracted_texts.txt"

        full_input_csv_path = os.path.join(COMMENTS_CSV_DIR, comments_csv_filename)

        # A failure on one file is reported and the loop moves on to the next file.
        try:
            extractor = PDFTextExtractor(
                csv_file=full_input_csv_path,
                download_pdfs=DOWNLOAD_PDFS_OPTION,
                output_csv=output_csv_name,
                output_txt=output_txt_name,
                output_dir=specific_docket_output_dir,
            )
            # presumably performs the extraction and writes the outputs — confirm in text_extractor
            extractor.process_csv()
            print(f"Finished processing PDFs for {comments_csv_filename}.")
        except Exception as extraction_error:
            print(f"Error processing {comments_csv_filename} with PDFTextExtractor: {extraction_error}")
else:
    print("No CSV files listed for extraction. Skipping.")

print("\n--- PDF Text Extraction Process Finished ---")