# Debug S3 Download Issues

Diagnose why S3 downloads are hanging: list the bucket first, then download a single file, then a small batch.

In [None]:
# Configuration
# Every setting can be overridden with an environment variable of the same
# name, so real credentials do not have to be written into the notebook.
# The fallback values are the notebook's original defaults.
import os

S3_ENDPOINT = os.environ.get("S3_ENDPOINT", "https://minio-api-minio.apps.meshtest.llnl.gov")
S3_BUCKET = os.environ.get("S3_BUCKET", "kb-documents")
# Key prefix used when listing; it is also stripped from each key to build
# the local file path.
S3_PREFIX = os.environ.get("S3_PREFIX", "data/")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "minioadmin")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "minioadmin")

# Local directory under which downloaded objects are written.
DOWNLOAD_PATH = os.environ.get("DOWNLOAD_PATH", "/tmp/documents")

## Step 1: Just LIST files (don't download)

In [None]:
import boto3
import warnings

# verify=False below triggers an "Unverified HTTPS request" warning on every
# call. Silence only that warning so unrelated warnings stay visible while
# debugging (a blanket 'ignore' would hide them all).
warnings.filterwarnings('ignore', message='Unverified HTTPS request')

print(f"Listing files in: {S3_ENDPOINT}/{S3_BUCKET}/{S3_PREFIX}")

# NOTE: TLS certificate verification is disabled for this client.
s3_client = boto3.client(
    's3',
    endpoint_url=S3_ENDPOINT,
    aws_access_key_id=S3_ACCESS_KEY,
    aws_secret_access_key=S3_SECRET_KEY,
    verify=False
)

# List files: walk every result page of list_objects_v2 under the prefix.
paginator = s3_client.get_paginator('list_objects_v2')
all_files = []

for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=S3_PREFIX):
    # Pages without a 'Contents' key contribute nothing.
    for obj in page.get('Contents', []):
        # Skip keys ending in '/' (presumably directory placeholders).
        if obj['Key'].endswith('/'):
            continue
        all_files.append({
            'key': obj['Key'],
            'size': obj['Size'],
            'size_mb': obj['Size'] / (1024 * 1024)
        })

print(f"\nFound {len(all_files)} files")
print(f"Total size: {sum(f['size'] for f in all_files) / (1024*1024):.2f} MB")

# Show largest files
print("\nLargest files:")
sorted_files = sorted(all_files, key=lambda x: x['size'], reverse=True)
for f in sorted_files[:10]:
    print(f"  {f['size_mb']:.2f} MB - {f['key']}")

# Show first few files (in listing order)
print("\nFirst 10 files:")
for f in all_files[:10]:
    print(f"  {f['size_mb']:.4f} MB - {f['key']}")

## Step 2: Download ONE file with timeout

In [None]:
from pathlib import Path
import time
from botocore.config import Config

# Configure with explicit timeouts so a stalled connection fails instead of
# appearing to hang.
config = Config(
    connect_timeout=10,
    read_timeout=30,
    retries={'max_attempts': 3}
)

# Rebuild the client with the timeout configuration (certificate
# verification stays disabled, as in the listing step).
s3_client = boto3.client(
    's3',
    endpoint_url=S3_ENDPOINT,
    aws_access_key_id=S3_ACCESS_KEY,
    aws_secret_access_key=S3_SECRET_KEY,
    verify=False,
    config=config
)

# Relies on `all_files` built by the listing cell; does nothing if it is empty.
if all_files:
    test_file = all_files[0]
    print(f"Testing download: {test_file['key']}")
    print(f"Size: {test_file['size_mb']:.4f} MB")

    # Calculate local path: drop the listing prefix from the key.
    relative_path = test_file['key'][len(S3_PREFIX):] if S3_PREFIX else test_file['key']
    # A leading '/' (e.g. when S3_PREFIX has no trailing slash) would make
    # os.path.join discard DOWNLOAD_PATH and write to the filesystem root.
    local_file = os.path.join(DOWNLOAD_PATH, relative_path.lstrip('/'))

    # Create directory
    Path(local_file).parent.mkdir(parents=True, exist_ok=True)

    # perf_counter is monotonic, so the elapsed time cannot go negative if
    # the wall clock is adjusted mid-download.
    start = time.perf_counter()
    try:
        s3_client.download_file(S3_BUCKET, test_file['key'], local_file)
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
    else:
        # Reporting lives outside the try body so a reporting problem is
        # never mislabeled as a download error.
        elapsed = time.perf_counter() - start

        print(f"SUCCESS: Downloaded in {elapsed:.2f} seconds")
        if elapsed > 0:
            print(f"Speed: {test_file['size_mb']/elapsed:.2f} MB/s")
        else:
            print("Speed: n/a (elapsed time too small to measure)")
        print(f"Local file: {local_file}")
        print(f"Exists: {os.path.exists(local_file)}")
        print(f"Size: {os.path.getsize(local_file)} bytes")

## Step 3: Download first 10 files with progress

In [None]:
from pathlib import Path
import time

# Download first 10 files.
# Relies on `all_files` and `s3_client` defined by the earlier cells.
test_batch = all_files[:10]
print(f"Downloading {len(test_batch)} files...\n")

successful = 0
failed = 0

for i, file_info in enumerate(test_batch, 1):
    s3_key = file_info['key']

    # Calculate local path: drop the listing prefix from the key.
    relative_path = s3_key[len(S3_PREFIX):] if S3_PREFIX else s3_key
    # A leading '/' (e.g. when S3_PREFIX has no trailing slash) would make
    # os.path.join discard DOWNLOAD_PATH and write to the filesystem root.
    local_file = os.path.join(DOWNLOAD_PATH, relative_path.lstrip('/'))

    # Create directory
    Path(local_file).parent.mkdir(parents=True, exist_ok=True)

    print(f"[{i}/{len(test_batch)}] {s3_key} ({file_info['size_mb']:.4f} MB)")

    # perf_counter is monotonic, so elapsed is never negative.
    start = time.perf_counter()
    try:
        s3_client.download_file(S3_BUCKET, s3_key, local_file)
    except Exception as e:
        # Keep going: one failing file should not stop the batch.
        print(f"  FAILED: {e}")
        failed += 1
    else:
        # Reporting is outside the try body so that a reporting problem
        # (e.g. division by a zero elapsed time) cannot turn a successful
        # download into a counted failure.
        elapsed = time.perf_counter() - start
        if elapsed > 0:
            print(f"  SUCCESS in {elapsed:.2f}s ({file_info['size_mb']/elapsed:.2f} MB/s)")
        else:
            print(f"  SUCCESS in {elapsed:.2f}s")
        successful += 1

    print()  # Blank line

print(f"\nResults: {successful} successful, {failed} failed")

## Alternative: Use shell commands (might be faster)

In [None]:
# If boto3 is too slow, try using s3cmd or aws cli via subprocess.
# First check if they're available (`which` prints the path of a command it finds on PATH):

!which s3cmd
!which aws

## Check what's already downloaded

In [None]:
import subprocess

if os.path.exists(DOWNLOAD_PATH):
    # Count files already downloaded
    result = subprocess.run(
        f'find {DOWNLOAD_PATH} -type f | wc -l',
        shell=True,
        capture_output=True,
        text=True
    )
    
    print(f"Files already in {DOWNLOAD_PATH}: {result.stdout.strip()}")
    
    # Show directory structure
    print(f"\nDirectory structure:")
    !ls -lahR {DOWNLOAD_PATH} | head -50
else:
    print(f"{DOWNLOAD_PATH} does not exist")