In [4]:
import sys
import os
from urllib.parse import urljoin
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

# S3-hosted HTML listing page for the bucket. The URL ends with the query
# value `prefix=pdf/`, so callers append a "YYYY/MM/DD/" path directly to it
# to address the listing for a single day.
base_url = 'https://com-courtlistener-storage.s3-us-west-2.amazonaws.com/list.html?prefix=pdf/'

In [None]:
# Exploratory probe of the HTML listing page: for each month, request day pages
# in order until one answers with HTTP 200, print the hrefs found on that page,
# then move on to the next month.
#
# NOTE(review): list.html looks like a client-side (JavaScript) index page, in
# which case the static HTML contains no per-file anchors and `links` will not
# list the PDFs -- confirm before relying on this cell.
for year in range(2020, 2025):
    for month in range(1, 13):
        for day in range(1, 32):
            date_str = f"{year}/{month:02d}/{day:02d}"
            date_url = base_url + date_str + '/'

            try:
                response = requests.get(date_url, timeout=5)
            except requests.RequestException:
                # Network-level failure: show which URL could not be fetched.
                print(date_url)
                continue

            if response.status_code != 200:
                print('not processing')
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            # Previously a stray `break` sat above this line, so the links were
            # never extracted; keep only non-empty href attributes.
            links = [link.get('href') for link in soup.find_all('a') if link.get('href')]
            print(links)
            # One successful page per month is enough for this probe.
            break

In [1]:
import calendar
import json
import os
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

import requests

def get_s3_listing(bucket_name, prefix=""):
    """
    Get file listing from S3 bucket using the S3 API

    Sends unauthenticated ``list-type=2`` (ListObjectsV2) requests to the
    bucket endpoint and keeps requesting pages, passing the returned
    continuation token, until the service reports the listing is complete.
    The XML is parsed with ElementTree so escaped characters in keys
    (e.g. ``&amp;``) are decoded correctly.

    Args:
        bucket_name: Bucket name; used as the host prefix of the request URL.
        prefix: Key prefix to list. Sent as a query parameter, so it is
            URL-encoded by ``requests``.

    Returns:
        A list of object keys (str) under ``prefix``, excluding keys that end
        with "/". Returns an empty list if any page answers with a non-200
        status or if any error occurs; a message is printed in both cases, and
        a partially fetched listing is discarded rather than returned.
    """
    s3_url = f"https://{bucket_name}.s3.amazonaws.com/"
    files = []
    continuation_token = None

    try:
        while True:
            params = {"list-type": "2", "prefix": prefix}
            if continuation_token:
                params["continuation-token"] = continuation_token

            response = requests.get(s3_url, params=params, timeout=10)
            if response.status_code != 200:
                print(f"Failed to get listing for {prefix}: {response.status_code}")
                return []

            root = ET.fromstring(response.content)

            # Reset per page so a truncated page without a token ends the loop
            # instead of re-sending a stale token forever.
            truncated = False
            continuation_token = None

            for element in root.iter():
                # ElementTree reports namespaced tags as "{uri}Name"; compare
                # on the local name so the code does not depend on the xmlns.
                tag = element.tag.rsplit("}", 1)[-1]
                text = element.text or ""
                if tag == "Key":
                    if text and not text.endswith("/"):  # Skip directories
                        files.append(text)
                elif tag == "IsTruncated":
                    truncated = text.strip().lower() == "true"
                elif tag == "NextContinuationToken":
                    continuation_token = text.strip()

            if not (truncated and continuation_token):
                return files
    except Exception as e:
        # Best-effort contract kept from the original: report and return [].
        print(f"Error accessing S3 API for {prefix}: {str(e)}")
        return []

def download_pdf(url, filepath):
    """Download a PDF file from a URL and save it to the specified path

    The body is streamed into a temporary ``<filepath>.part`` file and moved
    into place only after the whole transfer succeeded, so a failed download
    never leaves a truncated file at ``filepath``.

    Args:
        url: Address of the file to fetch.
        filepath: Destination path. Missing parent directories are created; a
            bare filename (no directory part) is written to the current
            working directory.

    Returns:
        True if the file was downloaded and saved, False on any error (the
        error is printed rather than raised).
    """
    try:
        # os.makedirs("") raises, so only create a directory when there is one.
        target_dir = os.path.dirname(filepath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        partial_path = f"{filepath}.part"
        try:
            # The context manager releases the connection even if streaming
            # fails part-way through.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip empty keep-alive chunks
                            f.write(chunk)
            os.replace(partial_path, filepath)
        finally:
            # After a successful replace the .part file is gone; after a
            # failure this removes the incomplete data.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        print(f"Downloaded: {filepath}")
        return True
    except Exception as e:
        print(f"Failed to download {url}: {str(e)}")
        return False

def download_with_retry(url, filepath, max_retries=3, retry_delay=1):
    """Download a file with retry logic

    Calls ``download_pdf`` up to ``max_retries`` times in total and stops at
    the first success. A retry message is printed and the delay is applied
    only when another attempt will actually follow, so the final failure
    returns immediately.

    Args:
        url: Address of the file to fetch.
        filepath: Destination path handed to ``download_pdf``.
        max_retries: Maximum number of download attempts (not additional
            retries); with a value of 0 or less no attempt is made.
        retry_delay: Seconds to wait between attempts.

    Returns:
        True if any attempt succeeded, otherwise False.
    """
    for attempt in range(max_retries):
        if download_pdf(url, filepath):
            return True
        if attempt + 1 < max_retries:
            # max_retries counts attempts, so at most max_retries - 1 retries happen.
            print(f"Retry {attempt + 1}/{max_retries - 1} for {os.path.basename(filepath)}")
            time.sleep(retry_delay)  # Wait before retrying
    return False

def main(years=range(2020, 2021), months=range(1, 2),
         output_dir="court_listener_pdfs", max_workers=5, skip_existing=True):
    """Download every PDF listed under ``pdf/YYYY/MM/DD/`` for a range of dates.

    For each calendar day in the requested years and months, the bucket is
    listed with ``get_s3_listing`` and every returned key is handed to
    ``download_with_retry`` on a thread pool. Files are saved flat in
    ``output_dir`` as ``YYYY-MM-DD_<original name>``.

    Args:
        years: Iterable of years to crawl. The default covers 2020 only.
        months: Iterable of month numbers (1-12). The default covers January only.
        output_dir: Directory that receives the downloaded files; created if missing.
        max_workers: Number of concurrent download threads.
        skip_existing: When True, a non-empty file already present at the
            destination is treated as downloaded and is not fetched again,
            which makes re-runs resume instead of starting over.

    Returns:
        None. Progress, per-file failures and a final summary are printed.
    """
    bucket_name = "com-courtlistener-storage"

    # Create a directory for downloads
    os.makedirs(output_dir, exist_ok=True)

    # Filenames queued during this run; protects against duplicate keys.
    queued_files = set()
    succeeded = 0
    failed_urls = []

    # Use ThreadPoolExecutor for parallel downloads
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {}

        for year in years:
            for month in months:
                # monthrange() yields the real length of the month, so
                # impossible dates such as 02/30 are never requested.
                days_in_month = calendar.monthrange(year, month)[1]
                for day in range(1, days_in_month + 1):
                    # Format the date components with leading zeros
                    date_str = f"{year}/{month:02d}/{day:02d}"
                    prefix = f"pdf/{date_str}/"

                    print(f"Checking {date_str}...")

                    files = get_s3_listing(bucket_name, prefix)
                    if not files:
                        continue  # Skip if no files found

                    print(f"Found {len(files)} files in {date_str}")

                    for file_key in files:
                        filename = os.path.basename(file_key)
                        file_url = f"https://{bucket_name}.s3.amazonaws.com/{file_key}"
                        safe_filename = f"{year}-{month:02d}-{day:02d}_{filename}".replace("/", "_").replace("\\", "_")
                        filepath = os.path.join(output_dir, safe_filename)

                        # Skip duplicates within this run
                        if safe_filename in queued_files:
                            continue
                        # Skip files left by an earlier run
                        if (skip_existing and os.path.isfile(filepath)
                                and os.path.getsize(filepath) > 0):
                            continue

                        # Submit download task to thread pool
                        future = executor.submit(download_with_retry, file_url, filepath)
                        future_to_url[future] = file_url
                        queued_files.add(safe_filename)

        # Wait for all downloads to complete. download_with_retry signals
        # failure through its return value, so that must be inspected too;
        # an exception is only the unexpected case.
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                ok = future.result()
            except Exception as e:
                print(f"Download failed for {url}: {str(e)}")
                ok = False
            else:
                if not ok:
                    print(f"Download failed for {url}: retries exhausted")

            if ok:
                succeeded += 1
            else:
                failed_urls.append(url)

    print(f"Finished: {succeeded} downloaded, {len(failed_urls)} failed")
# Entry-point guard: start the crawl only when this code runs as the main
# program (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Checking 2020/01/01...
Checking 2020/01/02...
Found 263 files in 2020/01/02
Checking 2020/01/03...
Found 304 files in 2020/01/03
Checking 2020/01/04...
Downloaded: court_listener_pdfs/2020-01-02_alyssa_mascorro_v._peter_schulz_geico_insurance_company_cullen_walsh_and.pdf
Downloaded: court_listener_pdfs/2020-01-02_apex_laboratories_international_inc_v._city_of_detroit.pdf
Downloaded: court_listener_pdfs/2020-01-02_adrian_booker_and_nicole_smith_v._anissa_mahmoudi.pdf
Downloaded: court_listener_pdfs/2020-01-02_adoption_of_c.m.b._appeal_of_d.b..pdf
Downloaded: court_listener_pdfs/2020-01-02_am._multi-cinema_inc._v._city_of_aurora.pdf
Checking 2020/01/05...
Downloaded: court_listener_pdfs/2020-01-02_brenda_ford_white_v._o_l_matthews_md.pdf
Downloaded: court_listener_pdfs/2020-01-02_brenda_white_v._southeast_michigan_surgical_hospital.pdf
Downloaded: court_listener_pdfs/2020-01-02_blanche_hudson_v._john_c_kleuessendorf.pdf
Downloaded: court_listener_pdfs/2020-01-02_bank_of_new_york_mellon_v