# Model Backup and S3 Upload Notebook

This notebook provides functionality to:
1. Zip model artifacts while preserving directory structure
2. Upload the zip archives to an AWS S3 bucket
3. Maintain the same folder structure in S3
4. Provide progress tracking and error handling

## Prerequisites
- AWS credentials configured (access key, secret key)
- boto3 and tqdm installed (`pip install boto3 tqdm`)
- Sufficient disk space for creating zip files


## 1. Configuration Section

**⚠️ Security Note**: Never commit AWS credentials to version control. Use environment variables or the AWS credentials file instead of hard-coding real keys in the cell below.


In [None]:
# Configuration - UPDATE THESE VALUES
import os
from datetime import datetime

# AWS Configuration
# The four values below are placeholders; they must be replaced (or the
# environment-variable variant further down enabled) before the S3 cells run.
# The first three are handed to boto3.client(), and S3_BUCKET_NAME is the
# upload target.
AWS_ACCESS_KEY_ID = "your-access-key-here"  # Replace with your access key
AWS_SECRET_ACCESS_KEY = "your-secret-key-here"  # Replace with your secret key
AWS_REGION = "us-east-1"  # Replace with your preferred region
S3_BUCKET_NAME = "your-bucket-name"  # Replace with your S3 bucket name

# Alternatively, use environment variables (recommended for security)
# AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
# AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
# AWS_REGION = os.getenv('AWS_DEFAULT_REGION', 'us-east-1')
# S3_BUCKET_NAME = os.getenv('S3_BUCKET_NAME')

# Project Configuration
# PROJECT_ROOT is an absolute, machine-specific path; adjust it when running
# the notebook elsewhere. Artifacts are read from <root>/artifacts and the
# zip archives and upload manifest are written to <root>/backups.
PROJECT_ROOT = "/Users/shubhangmalviya/Documents/Projects/Walsh College/HistoPathologyResearch"
ARTIFACTS_DIR = os.path.join(PROJECT_ROOT, "artifacts")
BACKUP_DIR = os.path.join(PROJECT_ROOT, "backups")

# S3 Key Prefix (folder structure in S3)
# The date is taken from datetime.now() once, when this cell runs, so every
# upload in the session shares the same YYYY-MM-DD prefix.
S3_KEY_PREFIX = f"histopathology-research/{datetime.now().strftime('%Y-%m-%d')}"

# Echo the non-secret settings so the user can verify them before continuing.
print(f"✅ Configuration loaded")
print(f"📁 Artifacts directory: {ARTIFACTS_DIR}")
print(f"💾 Backup directory: {BACKUP_DIR}")
print(f"🪣 S3 Bucket: {S3_BUCKET_NAME}")
print(f"🔑 S3 Key Prefix: {S3_KEY_PREFIX}")


## 2. Import Required Libraries


In [None]:
import os
import zipfile
import boto3
import json
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import hashlib
from tqdm.notebook import tqdm
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("📚 All libraries imported successfully")


## 3. Utility Functions


In [None]:
def get_directory_size(directory: str) -> int:
    """Return the combined size, in bytes, of every file under *directory*.

    The tree is walked recursively with ``os.walk``. Entries for which
    ``os.path.exists`` is false (for example dangling symlinks) are skipped.
    """
    candidate_paths = (
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    )
    return sum(
        os.path.getsize(path)
        for path in candidate_paths
        if os.path.exists(path)
    )

def format_bytes(bytes_size: int) -> str:
    """Render a byte count as a two-decimal string with a binary unit.

    The value is divided by 1024 until it drops below 1024, stepping through
    B, KB, MB, GB and TB; anything still larger is reported in PB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = bytes_size
    index = 0
    while index < len(units):
        if value < 1024.0:
            return f"{value:.2f} {units[index]}"
        value = value / 1024.0
        index += 1
    # Ran out of named units: whatever is left is expressed in petabytes.
    return f"{value:.2f} PB"

def calculate_file_hash(filepath: str) -> str:
    """Return the hexadecimal MD5 digest of the file at *filepath*.

    The file is read in 4096-byte blocks so large files are never loaded
    into memory in one piece.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def create_backup_directory():
    """Ensure ``BACKUP_DIR`` exists (creating parents as needed) and report it."""
    target = BACKUP_DIR
    # exist_ok makes this safe to call again when the folder is already there.
    os.makedirs(target, exist_ok=True)
    print(f"📁 Backup directory ready: {target}")

print("🔧 Utility functions defined")


## 4. Directory Analysis


In [None]:
def analyze_artifacts_directory():
    """Summarise the size and file count of each research-question folder.

    Looks for the ``rq2`` and ``rq3`` sub-folders of ``ARTIFACTS_DIR``, prints
    a short report for each one that exists, then prints the combined totals.

    Returns:
        A dict with ``total_size`` (bytes), ``file_count`` and ``directories``
        (per-folder ``path``, ``size``, ``file_count`` and ``formatted_size``).
        When ``ARTIFACTS_DIR`` does not exist, the zeroed dict is returned
        right after the error message.
    """
    print("🔍 Analyzing artifacts directory...\n")

    analysis = {'total_size': 0, 'directories': {}, 'file_count': 0}

    if not os.path.exists(ARTIFACTS_DIR):
        print(f"❌ Artifacts directory not found: {ARTIFACTS_DIR}")
        return analysis

    for rq_dir in ('rq2', 'rq3'):
        rq_path = os.path.join(ARTIFACTS_DIR, rq_dir)
        if not os.path.exists(rq_path):
            # Missing research-question folders are silently left out.
            continue

        size = get_directory_size(rq_path)
        file_count = 0
        for _root, _dirs, names in os.walk(rq_path):
            file_count += len(names)
        readable_size = format_bytes(size)

        analysis['directories'][rq_dir] = {
            'path': rq_path,
            'size': size,
            'file_count': file_count,
            'formatted_size': readable_size,
        }
        analysis['total_size'] += size
        analysis['file_count'] += file_count

        print(f"📊 {rq_dir.upper()}:")
        print(f"   📁 Path: {rq_path}")
        print(f"   📏 Size: {readable_size}")
        print(f"   📄 Files: {file_count}")

        # Only the immediate child directories are listed, not the full tree.
        subdirs = []
        for entry in os.listdir(rq_path):
            if os.path.isdir(os.path.join(rq_path, entry)):
                subdirs.append(entry)
        if subdirs:
            print(f"   📂 Subdirectories: {', '.join(subdirs)}")
        print()

    print(f"📈 TOTAL ANALYSIS:")
    print(f"   📏 Total size: {format_bytes(analysis['total_size'])}")
    print(f"   📄 Total files: {analysis['file_count']}")

    return analysis

# Run analysis
artifacts_analysis = analyze_artifacts_directory()


## 5. Zip Creation Functions


In [None]:
def create_zip_archive(source_dir: str, zip_path: str, exclude_patterns: Optional[List[str]] = None) -> Dict:
    """Create a zip archive while preserving directory structure.

    Args:
        source_dir: Directory whose contents are archived. Paths inside the
            archive are relative to this directory.
        zip_path: Destination path of the zip file (written with DEFLATE,
            compression level 6).
        exclude_patterns: Substrings; any file or directory whose *name*
            contains one of them is left out (excluded directories are not
            descended into). Defaults to ``.DS_Store``, ``__pycache__`` and
            ``.git``.

    Returns:
        A dict describing the archive: ``source_dir``, ``zip_path``,
        ``created_at`` (ISO timestamp), ``files_added``, ``original_size`` and
        ``compressed_size`` (bytes), ``compression_ratio`` (percent saved) and
        ``files_list`` (one ``path``/``size``/``hash`` entry per archived
        file, the hash coming from ``calculate_file_hash``).

    Files that cannot be read or written are reported with a warning and
    skipped; they do not abort the archive.
    """
    if exclude_patterns is None:
        exclude_patterns = ['.DS_Store', '__pycache__', '.git']

    def _is_excluded(name: str) -> bool:
        # Substring match against a single path component.
        return any(pattern in name for pattern in exclude_patterns)

    zip_info = {
        'source_dir': source_dir,
        'zip_path': zip_path,
        'created_at': datetime.now().isoformat(),
        'files_added': 0,
        'original_size': 0,
        'compressed_size': 0,
        'compression_ratio': 0.0,
        'files_list': []
    }

    print(f"🗜️  Creating zip archive...")
    print(f"   📁 Source: {source_dir}")
    print(f"   📦 Destination: {zip_path}")

    # Collect the files to archive up front, with exclusions already applied,
    # so the progress-bar total matches the work actually done. The walk also
    # finishes before the archive file is opened, so a not-yet-existing
    # zip_path located under source_dir is not picked up.
    files_to_add = []
    for root, dirs, files in os.walk(source_dir):
        # Prune in place so os.walk does not descend into excluded directories.
        dirs[:] = [d for d in dirs if not _is_excluded(d)]
        files_to_add.extend(
            os.path.join(root, file) for file in files if not _is_excluded(file)
        )

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED, compresslevel=6) as zipf:
        with tqdm(total=len(files_to_add), desc="Adding files") as pbar:
            for file_path in files_to_add:
                # Relative path keeps the directory structure inside the zip.
                arcname = os.path.relpath(file_path, source_dir)

                try:
                    # Gather size and hash before writing, so a failure here
                    # cannot leave a file in the archive that is missing from
                    # the statistics.
                    file_size = os.path.getsize(file_path)
                    file_hash = calculate_file_hash(file_path)
                    zipf.write(file_path, arcname)
                except Exception as e:
                    # Deliberately best-effort: one bad file must not abort
                    # the whole backup.
                    print(f"⚠️  Warning: Could not add {file_path}: {e}")
                else:
                    zip_info['files_added'] += 1
                    zip_info['original_size'] += file_size
                    zip_info['files_list'].append({
                        'path': arcname,
                        'size': file_size,
                        'hash': file_hash
                    })

                pbar.update(1)

    # Calculate final statistics
    if os.path.exists(zip_path):
        zip_info['compressed_size'] = os.path.getsize(zip_path)
        if zip_info['original_size'] > 0:
            zip_info['compression_ratio'] = (
                1 - zip_info['compressed_size'] / zip_info['original_size']
            ) * 100

    print(f"✅ Zip archive created successfully!")
    print(f"   📄 Files added: {zip_info['files_added']}")
    print(f"   📏 Original size: {format_bytes(zip_info['original_size'])}")
    print(f"   🗜️  Compressed size: {format_bytes(zip_info['compressed_size'])}")
    print(f"   📊 Compression ratio: {zip_info['compression_ratio']:.1f}%")

    return zip_info

print("🗜️  Zip creation functions ready")


## 6. AWS S3 Upload Functions


In [None]:
def initialize_s3_client():
    """Build a boto3 S3 client from the notebook's AWS settings and probe it.

    The connection is exercised with ``list_buckets``; the result is used to
    report whether ``S3_BUCKET_NAME`` is among the account's buckets. A
    missing bucket only produces a warning; the client is still returned.

    Returns:
        The S3 client, or ``None`` if creating or probing it raised.
    """
    try:
        client = boto3.client(
            's3',
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            region_name=AWS_REGION
        )

        # Listing buckets doubles as a credentials/connectivity check.
        listing = client.list_buckets()
        bucket_names = []
        for bucket in listing['Buckets']:
            bucket_names.append(bucket['Name'])

        if S3_BUCKET_NAME not in bucket_names:
            print(f"⚠️  Warning: Bucket '{S3_BUCKET_NAME}' not found in your account")
            print(f"Available buckets: {bucket_names}")
        else:
            print(f"✅ S3 client initialized successfully")
            print(f"🪣 Target bucket '{S3_BUCKET_NAME}' found")

        return client

    except Exception as e:
        print(f"❌ Failed to initialize S3 client: {e}")
        return None

def upload_file_to_s3(s3_client, local_path: str, s3_key: str) -> Dict:
    """Upload one local file to ``S3_BUCKET_NAME`` and report the outcome.

    Args:
        s3_client: Client object exposing ``upload_file(path, bucket, key)``.
        local_path: Path of the file to send.
        s3_key: Object key to store the file under.

    Returns:
        A dict with ``local_path``, ``s3_key``, ``bucket``, ``success``,
        ``upload_time`` (seconds, ``None`` if the upload did not finish),
        ``file_size`` (bytes) and ``error`` (message string or ``None``).
        Failures are captured in the dict rather than raised.
    """
    outcome = {
        'local_path': local_path,
        's3_key': s3_key,
        'bucket': S3_BUCKET_NAME,
        'success': False,
        'upload_time': None,
        'file_size': 0,
        'error': None
    }

    try:
        size_in_bytes = os.path.getsize(local_path)
        outcome['file_size'] = size_in_bytes

        print(f"⬆️  Uploading: {os.path.basename(local_path)} ({format_bytes(size_in_bytes)})")

        # Wall-clock timing around the transfer only.
        started_at = datetime.now()
        s3_client.upload_file(local_path, S3_BUCKET_NAME, s3_key)
        elapsed = datetime.now() - started_at

        outcome['upload_time'] = elapsed.total_seconds()
        outcome['success'] = True

        print(f"✅ Upload completed in {outcome['upload_time']:.2f} seconds")
        print(f"   🔗 S3 Location: s3://{S3_BUCKET_NAME}/{s3_key}")

    except Exception as e:
        outcome['error'] = str(e)
        print(f"❌ Upload failed: {e}")

    return outcome

print("☁️  S3 upload functions ready")


## 7. Main Execution: Create Zip Archives


In [None]:
# Make sure the destination folder for the archives is present
create_backup_directory()

# One timestamp identifies every file produced by this backup session
backup_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Archive statistics returned by create_zip_archive, keyed by folder name
zip_archives = {}

print(f"🚀 Starting backup process at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"📅 Backup session ID: {backup_timestamp}\n")

# Build one archive per research-question folder
for rq in ['rq2', 'rq3']:
    rq_dir = os.path.join(ARTIFACTS_DIR, rq)

    # Nothing to archive when the folder is missing or has no entries
    if not os.path.exists(rq_dir) or not os.listdir(rq_dir):
        print(f"⚠️  Skipping {rq.upper()}: Directory empty or doesn't exist")
        continue

    print(f"\n📦 Processing {rq.upper()}...")

    # The archive name carries both the folder and the session timestamp
    zip_filename = f"artifacts_{rq}_{backup_timestamp}.zip"
    zip_path = os.path.join(BACKUP_DIR, zip_filename)

    zip_info = create_zip_archive(rq_dir, zip_path)
    zip_archives[rq] = zip_info

    print(f"✅ {rq.upper()} archive created: {zip_filename}")

print(f"\n🎉 Zip creation completed!")
print(f"📁 All archives saved to: {BACKUP_DIR}")

# Totals across every archive created above
total_original_size = sum(entry['original_size'] for entry in zip_archives.values())
total_compressed_size = sum(entry['compressed_size'] for entry in zip_archives.values())
total_files = sum(entry['files_added'] for entry in zip_archives.values())

print(f"\n📊 BACKUP SUMMARY:")
print(f"   📦 Archives created: {len(zip_archives)}")
print(f"   📄 Total files: {total_files}")
print(f"   📏 Original size: {format_bytes(total_original_size)}")
print(f"   🗜️  Compressed size: {format_bytes(total_compressed_size)}")
# The ratio is only meaningful when something was actually archived
if total_original_size > 0:
    overall_compression = (1 - total_compressed_size / total_original_size) * 100
    print(f"   📊 Overall compression: {overall_compression:.1f}%")


## 8. Initialize S3 Connection


In [None]:
# Create the S3 client; initialize_s3_client returns None when it fails
print("🔐 Initializing AWS S3 connection...")
s3_client = initialize_s3_client()

if s3_client is not None:
    print("\n🎯 Ready to upload to S3!")
else:
    # Without a client the upload cell has nothing to work with, so point
    # the user at the usual causes.
    print("\n".join([
        "\n❌ Cannot proceed with S3 upload. Please check your AWS credentials.",
        "\n🔧 Troubleshooting steps:",
        "   1. Verify AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY",
        "   2. Check if the specified region is correct",
        "   3. Ensure your AWS account has S3 permissions",
        "   4. Verify the bucket name exists and you have access",
    ]))


## 9. Upload to S3


In [None]:
# Upload every archive created earlier, then write and upload a JSON manifest
# describing the session. Runs only when both an S3 client and at least one
# archive are available; otherwise it explains what is missing.
if s3_client and zip_archives:
    print(f"☁️  Starting S3 upload process...")
    print(f"🪣 Target bucket: {S3_BUCKET_NAME}")
    print(f"📂 S3 prefix: {S3_KEY_PREFIX}\n")

    # One result dict per archive, as returned by upload_file_to_s3
    upload_results = []

    for rq, zip_info in zip_archives.items():
        print(f"\n⬆️  Uploading {rq.upper()} archive...")

        # Key layout: <prefix>/artifacts/<rq>/<zip file name>
        zip_filename = os.path.basename(zip_info['zip_path'])
        s3_key = f"{S3_KEY_PREFIX}/artifacts/{rq}/{zip_filename}"

        # Upload file
        upload_result = upload_file_to_s3(s3_client, zip_info['zip_path'], s3_key)
        upload_results.append(upload_result)

    # Create and save upload manifest
    manifest = {
        'upload_session': {
            'timestamp': datetime.now().isoformat(),
            'project': 'HistoPathology Research',
            's3_bucket': S3_BUCKET_NAME,
            's3_prefix': S3_KEY_PREFIX
        },
        'summary': {
            'total_files': len(upload_results),
            'successful_uploads': sum(1 for r in upload_results if r['success']),
            'failed_uploads': sum(1 for r in upload_results if not r['success']),
            # NOTE(review): this sums the size of every attempted upload,
            # including failed ones, yet is printed below as "Total uploaded".
            'total_size': sum(r['file_size'] for r in upload_results),
            # A failed upload keeps 'upload_time' as None (the key is present,
            # so .get()'s default never applies); count it as 0 seconds
            # instead of letting sum() raise TypeError.
            'total_upload_time': sum(r['upload_time'] or 0 for r in upload_results)
        },
        'uploads': upload_results
    }

    manifest_path = os.path.join(BACKUP_DIR, f"upload_manifest_{backup_timestamp}.json")

    with open(manifest_path, 'w') as f:
        json.dump(manifest, f, indent=2)

    # Upload manifest to S3
    manifest_s3_key = f"{S3_KEY_PREFIX}/manifests/upload_manifest_{backup_timestamp}.json"
    manifest_upload = upload_file_to_s3(s3_client, manifest_path, manifest_s3_key)

    print(f"\n🎉 S3 Upload Process Completed!")
    print(f"\n📊 UPLOAD SUMMARY:")
    print(f"   ✅ Successful uploads: {manifest['summary']['successful_uploads']}")
    print(f"   ❌ Failed uploads: {manifest['summary']['failed_uploads']}")
    print(f"   📏 Total uploaded: {format_bytes(manifest['summary']['total_size'])}")
    print(f"   ⏱️  Total time: {manifest['summary']['total_upload_time']:.2f} seconds")
    print(f"   📄 Manifest saved: {manifest_path}")

    # Display S3 locations (successful uploads only)
    print(f"\n🔗 S3 LOCATIONS:")
    for result in upload_results:
        if result['success']:
            print(f"   📦 s3://{result['bucket']}/{result['s3_key']}")

    if manifest_upload['success']:
        print(f"   📄 s3://{S3_BUCKET_NAME}/{manifest_s3_key}")

else:
    if not s3_client:
        print("❌ S3 client not initialized. Cannot upload.")
    if not zip_archives:
        print("❌ No zip archives created. Nothing to upload.")


## 10. Final Summary


In [None]:
# Closing banner and recap of where this session's outputs live
print("\n" + "=" * 60)
print("🎉 MODEL BACKUP AND S3 UPLOAD COMPLETED")
print("=" * 60)

print("\n".join([
    f"\n📅 Session: {backup_timestamp}",
    f"📁 Local backups: {BACKUP_DIR}",
    f"☁️  S3 bucket: {S3_BUCKET_NAME}",
    f"📂 S3 prefix: {S3_KEY_PREFIX}",
]))

# The locals() checks keep this cell usable when an earlier cell was skipped
# and the variable was therefore never defined.
if 'zip_archives' in locals() and zip_archives:
    print(f"\n📦 ARCHIVES CREATED:")
    for rq, info in zip_archives.items():
        print(f"   {rq.upper()}: {os.path.basename(info['zip_path'])} ({format_bytes(info['compressed_size'])})")

if 'upload_results' in locals():
    successful_uploads = len([r for r in upload_results if r['success']])
    print(f"\n☁️  UPLOADS: {successful_uploads}/{len(upload_results)} successful")

print(f"\n💡 TIP: Keep the manifest file for future reference!")
print(f"📄 Manifest location: {BACKUP_DIR}/upload_manifest_{backup_timestamp}.json")
print("=" * 60)
