In [0]:
%run ../get_user

In [0]:
# Resolve who is running the notebook and derive the short username
# that suffixes the per-user volume and table names further down.
user_email = spark.sql("SELECT current_user()").first()[0]
username = get_username_from_email(user_email)
print(username)

In [0]:
# Names shared by the rest of the notebook.
# NOTE(review): dataset_bucket_name is not referenced in the cells visible here —
# presumably consumed elsewhere; confirm before removing.
dataset_bucket_name = "revodata-databricks-geospatial"
# Catalog and schema that qualify the per-user volume and table names built below.
catalog_name = "geospatial"
schema_name = "zoetermeer"

In [0]:
# Create the per-user volume monumenten_<username>; IF NOT EXISTS makes re-running this cell harmless.
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.monumenten_{username}")

In [0]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, BooleanType
import requests
import os
from urllib.parse import urlparse
import logging

# -------------------------
# Configuration
# -------------------------

# Fully qualified name of the table the download step reads (pic_url and fid columns).
SOURCE_TABLE = f"{catalog_name}.{schema_name}.monumenten_{username}"
# Base path to the Unity Catalog volume; downloaded images are written beneath it.
# Note: the table and the volume share the same monumenten_<username> name.
VOLUME_PATH = f"/Volumes/{catalog_name}/{schema_name}/monumenten_{username}"


In [0]:
import hashlib
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

import requests
import urllib3
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted for
# every request made with certificate verification turned off.
# NOTE(review): verification is disabled for all downloads in this notebook —
# confirm that is acceptable for the image host.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def download_image(pic_url, fid):
    """Download one image and store it at ``VOLUME_PATH/<fid>/<filename>``.

    The file name is the last path segment of ``pic_url``. When the URL has no
    usable segment (empty, or without a ``.``), a name derived from a SHA-256
    digest of the URL is used instead, so the same URL maps to the same file
    on every run and the "already exists" check keeps working after a restart.

    Args:
        pic_url: URL of the image to fetch.
        fid: Identifier used as the sub-folder name. ``0`` is accepted; only
            ``None`` and the empty string count as missing.

    Returns:
        A status string: ``"Missing URL or FID"``, ``"Already exists: <name>"``,
        ``"Downloaded: <name>"`` or ``"Error: <message>"``. Any ``Exception``
        raised along the way is reported through the ``Error`` string instead
        of being raised.
    """
    try:
        # Compare against None / "" explicitly: a plain truthiness test would
        # also reject a legitimate fid of 0.
        if not pic_url or fid is None or fid == "":
            return "Missing URL or FID"

        # Get filename from URL
        filename = os.path.basename(urlparse(pic_url).path)
        if not filename or '.' not in filename:
            # The built-in hash() of a str is salted per interpreter process,
            # so it cannot give a stable name; use a digest of the URL instead.
            digest = hashlib.sha256(pic_url.encode("utf-8")).hexdigest()[:12]
            filename = f"image_{digest}.jpg"

        # Create folder for this fid
        folder_path = os.path.join(VOLUME_PATH, str(fid))
        os.makedirs(folder_path, exist_ok=True)

        # Full file path
        file_path = os.path.join(folder_path, filename)

        # Skip if already exists
        if os.path.exists(file_path):
            return f"Already exists: {filename}"

        # NOTE(review): verify=False turns off TLS certificate validation for
        # this request; kept as in the original, confirm it is still required.
        headers = {'User-Agent': 'Mozilla/5.0 (compatible)'}
        response = requests.get(pic_url, headers=headers, timeout=30, verify=False)
        response.raise_for_status()

        # Save file
        with open(file_path, 'wb') as f:
            f.write(response.content)

        return f"Downloaded: {filename}"

    except Exception as e:
        return f"Error: {str(e)}"

# Main execution: load the source table and report its shape
print("Reading table...")
df = spark.table(SOURCE_TABLE)

print(f"Table columns: {df.columns}")
print(f"Total rows: {df.count()}")

# Preview the two columns the download step relies on
print("\nSample data:")
df.select("pic_url", "fid").show(5, truncate=False)

# Bring every row that has both a picture URL and a fid onto the driver
records = (
    df.select("pic_url", "fid")
    .where(col("pic_url").isNotNull() & col("fid").isNotNull())
    .collect()
)

print(f"\nFound {len(records)} records to process")

# Make sure the volume root is present before anything is written into it
os.makedirs(VOLUME_PATH, exist_ok=True)

# One shared HTTP session so connections are reused across downloads.
# Certificate verification is switched off for every request made through it.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (compatible)'
session.verify = False

def download_single_image(args):
    """Download a single image - designed for parallel execution.

    The image is fetched through the module-level ``session`` and written to
    ``VOLUME_PATH/<fid>/<filename>``. The file name is the last path segment of
    the URL; when that segment is empty or has no ``.``, a name derived from a
    SHA-256 digest of the URL is used so that the same URL maps to the same
    file on every run.

    Args:
        args: A ``(row, index, total)`` tuple. ``row`` must expose ``pic_url``
            and ``fid`` attributes, ``index`` is echoed back in the result and
            ``total`` is accepted but not used here.

    Returns:
        ``(index, status)`` where ``status`` is ``"Missing URL or FID"``,
        ``"Already exists: <name>"``, ``"Downloaded: <name>"`` or
        ``"Error: <message>"``. Any ``Exception`` raised while processing the
        row is reported through the ``Error`` status instead of being raised.
    """
    row, index, _total = args
    pic_url, fid = row.pic_url, row.fid

    try:
        # Compare against None / "" explicitly: a plain truthiness test would
        # also reject a legitimate fid of 0.
        if not pic_url or fid is None or fid == "":
            return index, "Missing URL or FID"

        # Get filename from URL
        filename = os.path.basename(urlparse(pic_url).path)
        if not filename or '.' not in filename:
            # The built-in hash() of a str is salted per interpreter process,
            # so it cannot give a stable name; use a digest of the URL instead.
            digest = hashlib.sha256(pic_url.encode("utf-8")).hexdigest()[:12]
            filename = f"image_{digest}.jpg"

        # Create folder for this fid
        folder_path = os.path.join(VOLUME_PATH, str(fid))
        os.makedirs(folder_path, exist_ok=True)

        # Full file path
        file_path = os.path.join(folder_path, filename)

        # Skip if already exists
        if os.path.exists(file_path):
            return index, f"Already exists: {filename}"

        # Download image using the shared session (headers and TLS settings
        # are whatever was configured on it).
        response = session.get(pic_url, timeout=20)
        response.raise_for_status()

        # Save file
        with open(file_path, 'wb') as f:
            f.write(response.content)

        return index, f"Downloaded: {filename}"

    except Exception as e:
        return index, f"Error: {str(e)}"

# Process images with parallel downloads
print("Starting parallel downloads...")
start_time = time.time()

success_count = 0
error_count = 0
total_records = len(records)

# At most 10 concurrent downloads. ThreadPoolExecutor rejects max_workers=0,
# so keep at least one worker even when there is nothing to download.
max_workers = max(1, min(10, total_records))

try:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all download tasks, remembering which row/position each future belongs to
        future_to_args = {
            executor.submit(download_single_image, (row, i, total_records)): (row, i)
            for i, row in enumerate(records, 1)
        }

        # Report each download as it finishes (completion order, not submission order)
        for future in as_completed(future_to_args):
            row, i = future_to_args[future]
            try:
                _, result = future.result()

                if result.startswith(("Downloaded", "Already exists")):
                    success_count += 1
                    print(f"✓ {i}/{total_records}: FID={row.fid} - {result}")
                else:
                    error_count += 1
                    print(f"✗ {i}/{total_records}: FID={row.fid} - {result}")

            except Exception as e:
                error_count += 1
                print(f"✗ {i}/{total_records}: FID={row.fid} - Unexpected error: {e}")
finally:
    # Close the session even if the loop above is interrupted
    session.close()

end_time = time.time()
duration = end_time - start_time

print(f"\nCompleted in {duration:.1f} seconds!")
print(f"Success: {success_count}")
print(f"Errors: {error_count}")
# An average is only meaningful (and only computable) when something was processed
if total_records:
    print(f"Average time per image: {duration/total_records:.2f} seconds")
print(f"Files saved to: {VOLUME_PATH}")