# Model Download and Restore from S3 Notebook

This notebook provides functionality to:
1. List and discover model archives in S3 bucket
2. Download zip files from AWS S3 bucket
3. Restore the original directory structure
4. Verify file integrity and completeness
5. Provide progress tracking and error handling

## Prerequisites
- AWS credentials configured (access key, secret key)
- boto3 and tqdm installed (`pip install boto3 tqdm`)
- Sufficient disk space for downloading and extracting files
- Original upload manifest files (recommended for verification)


## 1. Configuration Section

**⚠️ Security Note**: Never commit AWS credentials to version control. Use environment variables or AWS credentials file.


In [None]:
# Configuration - UPDATE THESE VALUES
import os
from datetime import datetime

# AWS Configuration
# Each setting is read from the environment when the variable is set, so real
# credentials never have to be written into (and committed with) the notebook.
# The placeholder literals are only a fallback: either export the environment
# variables or replace the placeholders before running any S3 cell.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", "your-access-key-here")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "your-secret-key-here")
AWS_REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME", "your-bucket-name")

# Project Configuration
# PROJECT_ROOT is machine-specific; every other directory is derived from it.
PROJECT_ROOT = "/Users/shubhangmalviya/Documents/Projects/Walsh College/HistoPathologyResearch"
ARTIFACTS_DIR = os.path.join(PROJECT_ROOT, "artifacts")
DOWNLOAD_DIR = os.path.join(PROJECT_ROOT, "downloads")
RESTORE_DIR = os.path.join(PROJECT_ROOT, "restored_artifacts")

# S3 Configuration - you can specify a specific date or leave empty to list all
TARGET_DATE = ""  # Format: "2024-01-15" or leave empty to show all dates
S3_KEY_PREFIX = f"histopathology-research/{TARGET_DATE}" if TARGET_DATE else "histopathology-research"

# Download Options
OVERWRITE_EXISTING = False  # Set to True to overwrite existing files
VERIFY_DOWNLOADS = True     # Set to True to verify file integrity after download

print(f"✅ Configuration loaded")
print(f"📁 Project root: {PROJECT_ROOT}")
print(f"💾 Download directory: {DOWNLOAD_DIR}")
print(f"📂 Restore directory: {RESTORE_DIR}")
print(f"🪣 S3 Bucket: {S3_BUCKET_NAME}")
print(f"🔍 S3 Search prefix: {S3_KEY_PREFIX}")
print(f"🔄 Overwrite existing: {OVERWRITE_EXISTING}")
print(f"✅ Verify downloads: {VERIFY_DOWNLOADS}")


## 2. Import Required Libraries


In [None]:
import os
import zipfile
import boto3
import json
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import hashlib
from tqdm.notebook import tqdm
import warnings
import shutil
from botocore.exceptions import NoCredentialsError, ClientError

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("📚 All libraries imported successfully")


## 3. Utility Functions


In [None]:
def format_bytes(bytes_size: int) -> str:
    """Render a byte count as a string with two decimals and a unit suffix.

    The value is divided by 1024 until it drops below 1024 or the units
    B, KB, MB, GB, TB are exhausted; anything larger is reported in PB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = bytes_size
    position = 0
    while position < len(units):
        if value < 1024.0:
            return f"{value:.2f} {units[position]}"
        value = value / 1024.0
        position += 1
    # Still >= 1024 TB after the last division: express the remainder in PB.
    return f"{value:.2f} PB"

def calculate_file_hash(filepath: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``filepath``.

    The file is read in 4096-byte blocks so it is never loaded into memory
    as a whole.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(4096)
            if block == b"":
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def create_directories():
    """Ensure the download and restore directories exist, then report both paths."""
    # exist_ok=True makes repeated runs of the notebook harmless.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    os.makedirs(RESTORE_DIR, exist_ok=True)
    print(f"📁 Download directory ready: {DOWNLOAD_DIR}")
    print(f"📁 Restore directory ready: {RESTORE_DIR}")

def parse_s3_key(s3_key: str) -> Dict:
    """Split an S3 key on '/' and label its leading segments.

    Returns a dict with the full key, the last segment as ``filename`` and
    segments 0-3 as ``project``, ``date``, ``category`` and
    ``research_question``; a missing segment is reported as ''.
    """
    segments = s3_key.split('/')

    def segment_at(position):
        # Keys shorter than the expected layout yield empty fields.
        if position < len(segments):
            return segments[position]
        return ''

    return {
        'full_key': s3_key,
        'filename': segments[-1],
        'project': segment_at(0),
        'date': segment_at(1),
        'category': segment_at(2),
        'research_question': segment_at(3),
    }

def format_timestamp(timestamp_str: str) -> str:
    """Convert a ``YYYYMMDD_HHMMSS`` token into ``YYYY-MM-DD HH:MM:SS``.

    Args:
        timestamp_str: Timestamp token taken from a filename.

    Returns:
        The reformatted timestamp when the input is exactly eight digits, an
        underscore and six digits. Any other input (including non-string
        values) is returned unchanged, so callers can always display the
        result.
    """
    if not isinstance(timestamp_str, str):
        return timestamp_str

    date_part, separator, time_part = timestamp_str.partition('_')
    well_formed = (
        separator == '_'
        and len(date_part) == 8 and date_part.isdecimal()
        and len(time_part) == 6 and time_part.isdecimal()
    )
    if not well_formed:
        # Slicing a malformed token would produce output such as "ab-- cd::",
        # so hand the original text back instead.
        return timestamp_str

    formatted_date = f"{date_part[:4]}-{date_part[4:6]}-{date_part[6:8]}"
    formatted_time = f"{time_part[:2]}:{time_part[2:4]}:{time_part[4:6]}"
    return f"{formatted_date} {formatted_time}"

print("🔧 Utility functions defined")


## 4. AWS S3 Functions


In [None]:
def initialize_s3_client():
    """Build a boto3 S3 client from the configured credentials and probe it.

    The probe lists the account's buckets and reports whether
    ``S3_BUCKET_NAME`` is among them. The client is returned in both cases;
    ``None`` is returned only when creating or probing the client raises.
    """
    try:
        client = boto3.client(
            's3',
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            region_name=AWS_REGION
        )

        # Listing buckets doubles as a connectivity/credentials check.
        listing = client.list_buckets()
        visible_buckets = []
        for entry in listing['Buckets']:
            visible_buckets.append(entry['Name'])

        if S3_BUCKET_NAME not in visible_buckets:
            print(f"⚠️  Warning: Bucket '{S3_BUCKET_NAME}' not found in your account")
            print(f"Available buckets: {visible_buckets}")
        else:
            print(f"✅ S3 client initialized successfully")
            print(f"🪣 Target bucket '{S3_BUCKET_NAME}' found")

    except Exception as exc:
        print(f"❌ Failed to initialize S3 client: {exc}")
        return None

    return client

def list_s3_objects(s3_client, prefix: str = "") -> List[Dict]:
    """Collect a description of every object under ``prefix`` in the bucket.

    Pages through ``list_objects_v2`` and returns one dict per object with
    its key, size, last-modified value, human-readable size and parsed key.
    Returns an empty list when listing fails for any reason.
    """
    try:
        pages = s3_client.get_paginator('list_objects_v2').paginate(
            Bucket=S3_BUCKET_NAME,
            Prefix=prefix
        )

        # Pages without a 'Contents' entry contribute nothing.
        return [
            {
                'key': entry['Key'],
                'size': entry['Size'],
                'last_modified': entry['LastModified'],
                'formatted_size': format_bytes(entry['Size']),
                'parsed': parse_s3_key(entry['Key'])
            }
            for page in pages
            for entry in page.get('Contents', [])
        ]

    except Exception as exc:
        print(f"❌ Failed to list S3 objects: {exc}")
        return []

def download_file_from_s3(s3_client, s3_key: str, local_path: str) -> Dict:
    """Download one object from ``S3_BUCKET_NAME`` to ``local_path``.

    Args:
        s3_client: boto3 S3 client used for ``head_object`` and ``download_file``.
        s3_key: Key of the object to download.
        local_path: Destination file path; its parent directory is created.

    Returns:
        dict with ``s3_key``, ``local_path``, ``bucket``, ``success``,
        ``download_time`` (seconds, 0 when an existing file was kept),
        ``file_size`` (remote size in bytes) and ``error`` (message or None).
        Exceptions are not propagated; they are reported through ``error``.
    """
    import threading  # local import: only needed for the progress lock

    download_info = {
        's3_key': s3_key,
        'local_path': local_path,
        'bucket': S3_BUCKET_NAME,
        'success': False,
        'download_time': None,
        'file_size': 0,
        'error': None
    }

    try:
        # Get object info first so progress can be shown relative to the total
        response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=s3_key)
        file_size = response['ContentLength']
        download_info['file_size'] = file_size

        print(f"⬇️  Downloading: {os.path.basename(s3_key)} ({format_bytes(file_size)})")

        # Keep an existing local copy unless overwriting was requested
        if os.path.exists(local_path) and not OVERWRITE_EXISTING:
            print(f"⚠️  File already exists, skipping: {local_path}")
            download_info['success'] = True
            download_info['download_time'] = 0
            return download_info

        # Create the parent directory; a bare filename has no directory part
        # and os.makedirs('') would raise.
        target_dir = os.path.dirname(local_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        start_time = datetime.now()

        # boto3 passes the byte count of each transferred chunk (not a running
        # total) and may invoke the callback from several transfer threads, so
        # the total is accumulated here under a lock.
        transferred = 0
        progress_lock = threading.Lock()

        def progress_callback(bytes_amount):
            nonlocal transferred
            with progress_lock:
                transferred += bytes_amount
                percentage = (transferred / file_size) * 100 if file_size > 0 else 0
                print(f"\r   Progress: {percentage:.1f}% ({format_bytes(transferred)}/{format_bytes(file_size)})", end="")

        s3_client.download_file(
            S3_BUCKET_NAME,
            s3_key,
            local_path,
            Callback=progress_callback
        )

        end_time = datetime.now()
        download_info['download_time'] = (end_time - start_time).total_seconds()
        download_info['success'] = True

        print(f"\n✅ Download completed in {download_info['download_time']:.2f} seconds")
        print(f"   💾 Local path: {local_path}")

    except Exception as e:
        download_info['error'] = str(e)
        print(f"\n❌ Download failed: {e}")

    return download_info

print("☁️  S3 functions ready")


## 5. Archive Extraction Functions


In [None]:
def extract_zip_archive(zip_path: str, extract_to: str, preserve_structure: bool = True) -> Dict:
    """Extract every member of a zip archive into ``extract_to``.

    Args:
        zip_path: Path of the zip file to read.
        extract_to: Destination directory; created if it does not exist.
        preserve_structure: Kept for interface compatibility; currently unused.
            Members are always extracted under the paths stored in the archive.

    Returns:
        dict with ``zip_path``, ``extract_to``, ``extracted_at`` (ISO time),
        ``files_extracted`` and ``total_size`` (regular files only),
        ``files_list`` (name/size/compressed size per file), ``failed_files``
        (names of members that could not be extracted), ``success`` (True only
        when the archive opened and no member failed) and ``error``.
    """
    extract_info = {
        'zip_path': zip_path,
        'extract_to': extract_to,
        'extracted_at': datetime.now().isoformat(),
        'files_extracted': 0,
        'total_size': 0,
        'success': False,
        'files_list': [],
        'failed_files': [],
        'error': None
    }

    try:
        print(f"📦 Extracting archive: {os.path.basename(zip_path)}")
        print(f"   📁 Destination: {extract_to}")

        with zipfile.ZipFile(zip_path, 'r') as zipf:
            members = zipf.infolist()
            total_files = len(members)

            print(f"   📄 Files in archive: {total_files}")

            # Create destination directory
            os.makedirs(extract_to, exist_ok=True)

            # Extract with progress bar
            with tqdm(total=total_files, desc="Extracting files") as pbar:
                for member in members:
                    try:
                        zipf.extract(member, extract_to)
                    except Exception as e:
                        # Keep going so one bad member does not hide the rest,
                        # but remember it so the result is not reported as clean.
                        extract_info['failed_files'].append(member.filename)
                        print(f"⚠️  Warning: Could not extract {member.filename}: {e}")
                    else:
                        # Directory entries are extracted but are not files,
                        # so they are left out of the file statistics.
                        if not member.is_dir():
                            extract_info['files_extracted'] += 1
                            extract_info['total_size'] += member.file_size
                            extract_info['files_list'].append({
                                'filename': member.filename,
                                'size': member.file_size,
                                'compressed_size': member.compress_size
                            })

                    pbar.update(1)

        failed_count = len(extract_info['failed_files'])
        if failed_count:
            # A partially extracted archive must not be treated as a good restore source.
            extract_info['error'] = f"{failed_count} archive member(s) could not be extracted"
            print(f"❌ Extraction incomplete: {extract_info['error']}")
        else:
            extract_info['success'] = True
            print(f"✅ Extraction completed successfully!")
        print(f"   📄 Files extracted: {extract_info['files_extracted']}")
        print(f"   📏 Total size: {format_bytes(extract_info['total_size'])}")

    except Exception as e:
        extract_info['error'] = str(e)
        print(f"❌ Extraction failed: {e}")

    return extract_info

def restore_to_original_structure(extracted_path: str, target_artifacts_dir: str, research_question: str) -> Dict:
    """Copy an extracted tree into ``<target_artifacts_dir>/<research_question>``.

    Relative paths below ``extracted_path`` are kept. A file that already
    exists at the destination is skipped unless ``OVERWRITE_EXISTING`` is set.
    Returns a dict with the source/target paths, the research question, the
    number of files copied (``files_moved``), ``success`` and ``error``.
    """
    restore_info = {
        'source_path': extracted_path,
        'target_path': target_artifacts_dir,
        'research_question': research_question,
        'files_moved': 0,
        'success': False,
        'error': None
    }

    try:
        print(f"🔄 Restoring {research_question.upper()} to original structure...")

        # Destination root for this research question
        target_rq_dir = os.path.join(target_artifacts_dir, research_question)
        os.makedirs(target_rq_dir, exist_ok=True)

        # A missing source simply results in nothing being copied.
        tree = os.walk(extracted_path) if os.path.exists(extracted_path) else ()

        for current_dir, _subdirs, filenames in tree:
            for name in filenames:
                origin = os.path.join(current_dir, name)
                destination = os.path.join(
                    target_rq_dir,
                    os.path.relpath(origin, extracted_path)
                )

                # The parent directory is created even when the file is skipped.
                os.makedirs(os.path.dirname(destination), exist_ok=True)

                if os.path.exists(destination) and not OVERWRITE_EXISTING:
                    print(f"⚠️  Skipping existing file: {destination}")
                    continue

                shutil.copy2(origin, destination)
                restore_info['files_moved'] += 1

        restore_info['success'] = True
        print(f"✅ Restoration completed!")
        print(f"   📄 Files restored: {restore_info['files_moved']}")
        print(f"   📁 Target directory: {target_rq_dir}")

    except Exception as exc:
        restore_info['error'] = str(exc)
        print(f"❌ Restoration failed: {exc}")

    return restore_info

print("📦 Archive extraction functions ready")


## 6. Initialize and Discover S3 Content


In [None]:
def _timestamp_from_filename(filename):
    """Return the trailing timestamp token of an archive or manifest filename.

    Filenames are expected to end in ``..._YYYYMMDD_HHMMSS.<ext>``. The
    timestamp itself contains an underscore, so the last TWO '_'-separated
    tokens are joined when both are numeric; otherwise the last token alone
    is returned.
    """
    tokens = os.path.splitext(filename)[0].split('_')
    if len(tokens) >= 2 and tokens[-2].isdigit() and tokens[-1].isdigit():
        return '_'.join(tokens[-2:])
    return tokens[-1]


# Create necessary directories
create_directories()

# Initialize S3 client
print("\n🔐 Initializing AWS S3 connection...")
s3_client = initialize_s3_client()

if s3_client is None:
    print("\n❌ Cannot proceed with S3 operations. Please check your AWS credentials.")
    print("\n🔧 Troubleshooting steps:")
    print("   1. Verify AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY")
    print("   2. Check if the specified region is correct")
    print("   3. Ensure your AWS account has S3 permissions")
    print("   4. Verify the bucket name exists and you have access")
else:
    print("\n🔍 Discovering available files in S3...")

    # List all objects with the prefix
    s3_objects = list_s3_objects(s3_client, S3_KEY_PREFIX)

    if not s3_objects:
        print(f"❌ No objects found with prefix: {S3_KEY_PREFIX}")
        print("💡 Try adjusting the TARGET_DATE in configuration or check if files were uploaded")
    else:
        print(f"\n📊 DISCOVERY RESULTS:")
        print(f"   📄 Total objects found: {len(s3_objects)}")

        # Categorize objects: archives are 'artifacts_*.zip', manifests are '*manifest*.json'
        archives = [obj for obj in s3_objects if obj['key'].endswith('.zip') and 'artifacts_' in obj['key']]
        manifests = [obj for obj in s3_objects if obj['key'].endswith('.json') and 'manifest' in obj['key']]

        print(f"   📦 Archive files: {len(archives)}")
        print(f"   📄 Manifest files: {len(manifests)}")

        # Display available archives by date and research question
        if archives:
            print(f"\n📦 AVAILABLE ARCHIVES:")
            for archive in archives:
                parsed = archive['parsed']
                # Pass the full YYYYMMDD_HHMMSS token so the date is shown too
                formatted_time = format_timestamp(_timestamp_from_filename(parsed['filename']))

                print(f"   🗂️  {parsed['research_question'].upper()}: {archive['formatted_size']} - {formatted_time}")
                print(f"      📍 S3 Key: {archive['key']}")
                print(f"      📅 Last Modified: {archive['last_modified'].strftime('%Y-%m-%d %H:%M:%S')}")
                print()

        # Display available manifests
        if manifests:
            print(f"📄 AVAILABLE MANIFESTS:")
            for manifest in manifests:
                parsed = manifest['parsed']
                formatted_time = format_timestamp(_timestamp_from_filename(parsed['filename']))

                print(f"   📋 {manifest['formatted_size']} - {formatted_time}")
                print(f"      📍 S3 Key: {manifest['key']}")
                print()


## 7. Download Archives from S3


In [None]:
# Download every discovered archive (and manifest) into DOWNLOAD_DIR and
# summarize the outcome. Requires the discovery cell to have been run.
if s3_client and 'archives' in locals() and archives:
    print("⬇️  Starting download process...")

    download_session_id = datetime.now().strftime('%Y%m%d_%H%M%S')
    download_results = []

    print(f"📅 Download session ID: {download_session_id}\n")

    for archive in archives:
        print(f"\n📦 Processing {archive['parsed']['research_question'].upper()} archive...")

        # Archives are stored flat in DOWNLOAD_DIR under their S3 filename
        local_filename = os.path.basename(archive['key'])
        local_path = os.path.join(DOWNLOAD_DIR, local_filename)

        # Download archive
        download_result = download_file_from_s3(s3_client, archive['key'], local_path)
        download_results.append(download_result)

        # Verify download if enabled. A failed check marks the download as
        # failed so that a truncated or missing file is never extracted later
        # (download_result is the same dict already stored in download_results).
        if VERIFY_DOWNLOADS and download_result['success']:
            print(f"🔍 Verifying download...")
            if os.path.exists(local_path):
                local_size = os.path.getsize(local_path)
                if local_size == download_result['file_size']:
                    print(f"✅ Verification passed: File size matches ({format_bytes(local_size)})")
                else:
                    print(f"❌ Verification failed: Size mismatch (Expected: {format_bytes(download_result['file_size'])}, Got: {format_bytes(local_size)})")
                    download_result['success'] = False
                    download_result['error'] = (
                        f"Size mismatch: expected {download_result['file_size']} bytes, "
                        f"got {local_size} bytes"
                    )
            else:
                print(f"❌ Verification failed: Downloaded file not found")
                download_result['success'] = False
                download_result['error'] = "Downloaded file not found"

    # Download manifests if available
    if 'manifests' in locals() and manifests:
        print(f"\n📄 Downloading manifest files...")
        for manifest in manifests:
            local_filename = os.path.basename(manifest['key'])
            local_path = os.path.join(DOWNLOAD_DIR, local_filename)

            manifest_result = download_file_from_s3(s3_client, manifest['key'], local_path)
            download_results.append(manifest_result)

    # Create download summary
    successful_downloads = [r for r in download_results if r['success']]
    failed_downloads = [r for r in download_results if not r['success']]

    print(f"\n🎉 DOWNLOAD SUMMARY:")
    print(f"   ✅ Successful downloads: {len(successful_downloads)}")
    print(f"   ❌ Failed downloads: {len(failed_downloads)}")
    print(f"   📏 Total downloaded: {format_bytes(sum(r['file_size'] for r in successful_downloads))}")
    print(f"   ⏱️  Total time: {sum(r.get('download_time', 0) for r in successful_downloads):.2f} seconds")

    if failed_downloads:
        print(f"\n❌ FAILED DOWNLOADS:")
        for failed in failed_downloads:
            print(f"   📄 {os.path.basename(failed['s3_key'])}: {failed.get('error', 'Unknown error')}")

else:
    if not s3_client:
        print("❌ S3 client not initialized. Cannot download.")
    elif 'archives' not in locals() or not archives:
        print("❌ No archives found to download.")
    else:
        print("❌ Unknown error occurred.")


## 8. Extract and Restore Archives


In [None]:
# Extract each successfully downloaded archive into a temporary directory,
# copy its contents into RESTORE_DIR/<research question>, and optionally copy
# the restored files on into ARTIFACTS_DIR after asking the user.
if 'successful_downloads' in locals() and successful_downloads:
    print("📦 Starting extraction and restoration process...")

    extraction_results = []
    restoration_results = []

    # Filter for archive files only (not manifests)
    archive_downloads = [d for d in successful_downloads if d['s3_key'].endswith('.zip') and 'artifacts_' in d['s3_key']]

    for download in archive_downloads:
        zip_path = download['local_path']

        if not os.path.exists(zip_path):
            print(f"⚠️  Skipping missing file: {zip_path}")
            continue

        # Determine research question from filename
        filename = os.path.basename(zip_path)
        if 'artifacts_rq2_' in filename:
            research_question = 'rq2'
        elif 'artifacts_rq3_' in filename:
            research_question = 'rq3'
        else:
            print(f"⚠️  Cannot determine research question from filename: {filename}")
            continue

        print(f"\n🗂️  Processing {research_question.upper()} archive: {filename}")

        # Create temporary extraction directory
        temp_extract_dir = os.path.join(DOWNLOAD_DIR, f"temp_extract_{research_question}")

        # The directory name is shared by every archive of the same research
        # question, so remove leftovers from an earlier archive or interrupted
        # run; otherwise stale files would be restored along with this archive.
        shutil.rmtree(temp_extract_dir, ignore_errors=True)

        # Extract archive
        extract_info = extract_zip_archive(zip_path, temp_extract_dir)
        extraction_results.append(extract_info)

        if extract_info['success']:
            # Restore to original structure
            restore_info = restore_to_original_structure(
                temp_extract_dir,
                RESTORE_DIR,
                research_question
            )
            restoration_results.append(restore_info)
        else:
            print(f"❌ Skipping restoration due to extraction failure")

        # Clean up the temporary directory whether or not extraction succeeded,
        # so a failed extraction does not leave partial files behind.
        if os.path.isdir(temp_extract_dir):
            try:
                shutil.rmtree(temp_extract_dir)
                print(f"🧹 Cleaned up temporary directory: {temp_extract_dir}")
            except Exception as e:
                print(f"⚠️  Could not clean up temporary directory: {e}")

    # Summary of extraction and restoration
    successful_extractions = [r for r in extraction_results if r['success']]
    successful_restorations = [r for r in restoration_results if r['success']]

    print(f"\n🎉 EXTRACTION & RESTORATION SUMMARY:")
    print(f"   📦 Archives processed: {len(archive_downloads)}")
    print(f"   ✅ Successful extractions: {len(successful_extractions)}")
    print(f"   ✅ Successful restorations: {len(successful_restorations)}")

    if successful_restorations:
        total_files_restored = sum(r['files_moved'] for r in successful_restorations)
        print(f"   📄 Total files restored: {total_files_restored}")
        print(f"   📁 Restoration directory: {RESTORE_DIR}")

        print(f"\n📂 RESTORED STRUCTURE:")
        for restore in successful_restorations:
            rq_dir = os.path.join(RESTORE_DIR, restore['research_question'])
            if os.path.exists(rq_dir):
                subdirs = [d for d in os.listdir(rq_dir) if os.path.isdir(os.path.join(rq_dir, d))]
                print(f"   {restore['research_question'].upper()}: {restore['files_moved']} files")
                if subdirs:
                    print(f"      📂 Subdirectories: {', '.join(subdirs)}")

    # Ask before touching the original artifacts directory; anything other
    # than 'y' leaves the files in RESTORE_DIR only.
    copy_to_original = input(f"\n❓ Copy restored files to original artifacts directory ({ARTIFACTS_DIR})? [y/N]: ").lower().strip()

    if copy_to_original == 'y':
        print(f"\n🔄 Copying to original artifacts directory...")

        for restore in successful_restorations:
            source_dir = os.path.join(RESTORE_DIR, restore['research_question'])
            target_dir = os.path.join(ARTIFACTS_DIR, restore['research_question'])

            if os.path.exists(source_dir):
                try:
                    # Create target directory
                    os.makedirs(target_dir, exist_ok=True)

                    # Copy files, keeping their paths relative to source_dir
                    for root, dirs, files in os.walk(source_dir):
                        for file in files:
                            source_file = os.path.join(root, file)
                            rel_path = os.path.relpath(source_file, source_dir)
                            target_file = os.path.join(target_dir, rel_path)

                            # Create target directory if needed
                            os.makedirs(os.path.dirname(target_file), exist_ok=True)

                            if OVERWRITE_EXISTING or not os.path.exists(target_file):
                                shutil.copy2(source_file, target_file)
                                print(f"📄 Copied: {rel_path}")
                            else:
                                print(f"⚠️  Skipped existing: {rel_path}")

                    print(f"✅ {restore['research_question'].upper()} copied to {target_dir}")

                except Exception as e:
                    print(f"❌ Failed to copy {restore['research_question'].upper()}: {e}")

        print(f"\n✅ Copy operation completed!")
    else:
        print(f"\n💡 Files remain in restoration directory: {RESTORE_DIR}")

else:
    print("❌ No successful downloads found to extract and restore.")


## 9. Verification and Integrity Check


In [None]:
def verify_restored_structure(restore_dir: str) -> Dict:
    """Summarize what is present under ``restore_dir`` for 'rq2' and 'rq3'.

    For each research-question directory that exists, the file count, total
    size, immediate subdirectories and up to five sample files (with MD5 hash
    for files under 100 MB) are recorded. Returns a dict with the overall
    ``total_files``, ``total_size``, the per-question ``research_questions``
    mapping and the ``verification_time``.
    """
    hash_size_limit = 100*1024*1024  # larger files are listed without a hash
    verification_results = {
        'total_files': 0,
        'total_size': 0,
        'research_questions': {},
        'verification_time': datetime.now().isoformat()
    }

    print("🔍 Verifying restored structure...")

    if not os.path.exists(restore_dir):
        print(f"❌ Restore directory not found: {restore_dir}")
        return verification_results

    for rq in ('rq2', 'rq3'):
        rq_path = os.path.join(restore_dir, rq)

        if not os.path.exists(rq_path):
            verification_results['research_questions'][rq] = {'exists': False}
            print(f"❌ {rq.upper()}: Directory not found")
            continue

        rq_info = {
            'exists': True,
            'file_count': 0,
            'total_size': 0,
            'subdirectories': [],
            'sample_files': []
        }

        for current_dir, _subdirs, filenames in os.walk(rq_path):
            rq_info['file_count'] += len(filenames)

            for name in filenames:
                file_path = os.path.join(current_dir, name)
                file_size = os.path.getsize(file_path)
                rq_info['total_size'] += file_size
                verification_results['total_size'] += file_size

                # Only the first five files encountered are kept as samples
                if len(rq_info['sample_files']) >= 5:
                    continue

                if file_size < hash_size_limit:
                    sample_hash = calculate_file_hash(file_path)
                else:
                    sample_hash = 'skipped_large_file'

                rq_info['sample_files'].append({
                    'path': os.path.relpath(file_path, rq_path),
                    'size': file_size,
                    'hash': sample_hash
                })

        # Immediate child directories only
        if os.path.exists(rq_path):
            children = []
            for entry in os.listdir(rq_path):
                if os.path.isdir(os.path.join(rq_path, entry)):
                    children.append(entry)
            rq_info['subdirectories'] = children

        verification_results['research_questions'][rq] = rq_info
        verification_results['total_files'] += rq_info['file_count']

        print(f"✅ {rq.upper()}: {rq_info['file_count']} files, {format_bytes(rq_info['total_size'])}")
        if rq_info['subdirectories']:
            print(f"   📂 Subdirectories: {', '.join(rq_info['subdirectories'])}")

    print(f"\n📊 VERIFICATION SUMMARY:")
    print(f"   📄 Total files: {verification_results['total_files']}")
    print(f"   📏 Total size: {format_bytes(verification_results['total_size'])}")

    return verification_results

# Verify the restored tree and persist the result, but only when the
# extract/restore cell produced at least one successful restoration.
if 'successful_restorations' not in locals() or not successful_restorations:
    print("⚠️  No restored files to verify.")
else:
    verification = verify_restored_structure(RESTORE_DIR)

    # The report is written next to the downloads, stamped with the current time
    verification_report_path = os.path.join(
        DOWNLOAD_DIR,
        f"verification_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    )

    with open(verification_report_path, 'w') as report_file:
        json.dump(verification, report_file, indent=2)

    print(f"\n📄 Verification report saved: {verification_report_path}")


## 10. Cleanup Options


In [None]:
# Optional removal of downloaded zip archives once they have been extracted
CLEANUP_DOWNLOADS = False  # Set to True to delete downloaded zip files after successful extraction

if 'successful_downloads' not in locals():
    print("⚠️  No downloads to clean up.")
elif not CLEANUP_DOWNLOADS:
    print("📁 Downloaded files retained in download directory.")
    print(f"   📂 Location: {DOWNLOAD_DIR}")

    # Report what is being kept
    zip_files = []
    for entry in os.listdir(DOWNLOAD_DIR):
        if entry.endswith('.zip'):
            zip_files.append(entry)
    json_files = []
    for entry in os.listdir(DOWNLOAD_DIR):
        if entry.endswith('.json'):
            json_files.append(entry)

    if zip_files:
        print(f"   📦 Zip files: {len(zip_files)}")
    if json_files:
        print(f"   📄 JSON files: {len(json_files)}")
else:
    print("🧹 Cleaning up downloaded files...")

    if 'successful_extractions' not in locals():
        print("⚠️  No successful extractions found. Keeping all downloaded files.")
    else:
        # Only archives whose extraction succeeded are candidates for deletion
        extracted_files = [result['zip_path'] for result in successful_extractions if result['success']]

        cleaned_count = 0
        for zip_path in extracted_files:
            if not os.path.exists(zip_path):
                continue
            try:
                os.remove(zip_path)
                print(f"🗑️  Deleted: {os.path.basename(zip_path)}")
                cleaned_count += 1
            except Exception as exc:
                print(f"⚠️  Could not delete {zip_path}: {exc}")

        print(f"\n✅ Cleanup completed. Removed {cleaned_count} zip files.")
        print(f"💾 Verification reports and manifests retained.")


## 11. Final Summary


In [None]:
# Final report for the session. Every section is printed only when the cell
# that produces its data has been run.
print("\n" + "="*70)
print("🎉 MODEL DOWNLOAD AND RESTORATION COMPLETED")
print("="*70)

session_id = datetime.now().strftime('%Y%m%d_%H%M%S')
print(f"\n📅 Session: {session_id}")
print(f"🪣 S3 Bucket: {S3_BUCKET_NAME}")
print(f"🔍 Search Prefix: {S3_KEY_PREFIX}")

# Track the real outcome so the closing banner does not claim success when a
# step failed or nothing was restored.
failed_operations = 0
restored_any = False

# Summary of operations
if 's3_objects' in locals():
    print(f"\n🔍 DISCOVERY:")
    print(f"   📄 Objects found: {len(s3_objects)}")
    if 'archives' in locals():
        print(f"   📦 Archives: {len(archives)}")
    if 'manifests' in locals():
        print(f"   📋 Manifests: {len(manifests)}")

if 'download_results' in locals():
    successful_downloads = [r for r in download_results if r['success']]
    failed_operations += len(download_results) - len(successful_downloads)
    print(f"\n⬇️  DOWNLOADS:")
    print(f"   ✅ Successful: {len(successful_downloads)}")
    print(f"   📏 Total size: {format_bytes(sum(r['file_size'] for r in successful_downloads))}")

if 'extraction_results' in locals():
    successful_extractions = [r for r in extraction_results if r['success']]
    failed_operations += len(extraction_results) - len(successful_extractions)
    print(f"\n📦 EXTRACTIONS:")
    print(f"   ✅ Successful: {len(successful_extractions)}")
    if successful_extractions:
        total_extracted_files = sum(r['files_extracted'] for r in successful_extractions)
        print(f"   📄 Files extracted: {total_extracted_files}")

if 'restoration_results' in locals():
    successful_restorations = [r for r in restoration_results if r['success']]
    failed_operations += len(restoration_results) - len(successful_restorations)
    restored_any = bool(successful_restorations)
    print(f"\n🔄 RESTORATIONS:")
    print(f"   ✅ Successful: {len(successful_restorations)}")
    if successful_restorations:
        total_restored_files = sum(r['files_moved'] for r in successful_restorations)
        print(f"   📄 Files restored: {total_restored_files}")

# Directory locations
print(f"\n📁 DIRECTORIES:")
print(f"   💾 Downloads: {DOWNLOAD_DIR}")
print(f"   📂 Restored: {RESTORE_DIR}")
print(f"   🏠 Original: {ARTIFACTS_DIR}")

# Next steps
print(f"\n💡 NEXT STEPS:")
print(f"   1. Verify your models are working correctly")
print(f"   2. Check file integrity using verification reports")
print(f"   3. Remove download files if no longer needed")
print(f"   4. Update any paths in your code if necessary")

# Important files to keep
print(f"\n📄 IMPORTANT FILES:")
if 'verification_report_path' in locals():
    print(f"   🔍 Verification report: {verification_report_path}")

manifest_files = []
if 'download_results' in locals():
    manifest_files = [r['local_path'] for r in download_results if r['s3_key'].endswith('.json') and r['success']]

if manifest_files:
    print(f"   📋 Manifest files:")
    for manifest_file in manifest_files:
        print(f"      - {manifest_file}")

# Closing banner reflects what actually happened in this session
print("\n" + "="*70)
if restored_any and failed_operations == 0:
    print("✅ All operations completed successfully!")
    print("🔒 Your model artifacts have been restored from S3 backup")
elif restored_any:
    print(f"⚠️  Completed with {failed_operations} failed operation(s) - review the output above")
    print("🔒 Successfully processed artifacts have been restored from S3 backup")
else:
    print("❌ No artifacts were restored - review the output above")
print("="*70)
