In [1]:
import argparse
from datetime import datetime
from typing import Optional

from google.cloud import storage
from tqdm import tqdm  # For progress tracking

def list_json_files(
    bucket_name: str,
    max_files: Optional[int] = None,
    project: str = 'creature-vision',
) -> list[str]:
    """List the names of all ``.json`` objects in a GCS bucket.

    Blobs are streamed lazily from the bucket listing and a progress bar is
    advanced once per blob examined (JSON or not).

    Args:
        bucket_name: Name of the bucket to scan.
        max_files: Stop as soon as this many JSON names have been collected.
            ``None`` or ``0`` means "no limit".
        project: Project id handed to ``storage.Client``.

    Returns:
        Blob names whose lowercase form ends with ``.json``, in listing order.
        If the user interrupts the scan (Ctrl-C / kernel interrupt), the names
        gathered so far are returned instead of raising.
    """
    client = storage.Client(project=project)
    bucket = client.bucket(bucket_name)
    json_files: list[str] = []

    # Ask for 1000 results per page to keep the number of list requests low.
    blobs = bucket.list_blobs(page_size=1000)

    with tqdm(desc="Processing blobs", unit="file") as pbar:
        try:
            for blob in blobs:
                # Count every blob we look at, including the one that
                # satisfies the limit, so the bar matches the work done.
                pbar.update(1)

                if not blob.name.lower().endswith('.json'):
                    continue
                json_files.append(blob.name)

                # The listing is consumed lazily, so leaving the loop is
                # enough to stop iterating; no iterator internals need
                # to be touched.
                if max_files and len(json_files) >= max_files:
                    break
        except KeyboardInterrupt:
            # Deliberate best-effort: keep whatever was collected so far.
            print("\nListing interrupted by user")

    return json_files


In [2]:
# Driver cell: list every JSON object in the training bucket, save the names
# to a timestamped text file (one name per line), and report the elapsed time.
start_time = datetime.now()
print(f"Starting file list generation at {start_time}")

try:
    json_list = list_json_files("creature-vision-training-set")

    # Timestamp the output name so repeated runs do not overwrite each other.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"gcs_json_files_{timestamp}.txt"

    # Write as UTF-8 explicitly: object names may contain non-ASCII
    # characters, and the platform default encoding is not guaranteed to be
    # able to represent them.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(json_list))

    print(f"\nSuccessfully listed {len(json_list)} Json files")
    print(f"Output saved to {output_file}")
    print(f"Total execution time: {datetime.now() - start_time}")

except Exception as e:
    # Best-effort cell: report the failure rather than raising, so the
    # message appears in the cell output.
    print(f"Error listing files: {str(e)}")

Starting file list generation at 2025-03-08 12:54:09.316178


Processing blobs: 28997file [00:04, 6109.29file/s]


Successfully listed 14474 Json files
Output saved to gcs_json_files_20250308_125414.txt
Total execution time: 0:00:05.088461



