In [None]:
from PIL import Image
from io import BytesIO
import boto3
import os

# Set up S3 client
s3 = boto3.client('s3')
bucket_name = 'csml-data-bucket'
prefix = 'preprocessed/preprocessed/unprocessed/'

def list_image_files_in_s3(prefix):
    paginator = s3.get_paginator('list_objects_v2')
    valid_extensions = ['.jpg', '.jpeg', '.png']
    files = []
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    total_files = 0
    for page in page_iterator:
        page_files = [content['Key'] for content in page.get('Contents', []) if any(content['Key'].lower().endswith(ext) for ext in valid_extensions)]
        files.extend(page_files)
        total_files += len(page_files)
    print(f"Total number of image files found: {total_files}")
    return files

def resize_and_pad_image(image, target_size=224):
    original_width, original_height = image.size
    aspect_ratio = original_width / original_height
    if original_width > original_height:
        new_width = target_size
        new_height = int(target_size / aspect_ratio)
    else:
        new_height = target_size
        new_width = int(target_size * aspect_ratio)
    img_resized = image.resize((new_width, new_height), Image.LANCZOS)  # Fixed here
    new_img = Image.new("RGB", (target_size, target_size), (255, 255, 255))
    paste_x = (target_size - new_width) // 2
    paste_y = (target_size - new_height) // 2
    new_img.paste(img_resized, (paste_x, paste_y))
    return new_img

def resize_pad_and_upload_image(key, target_size=224):
    try:
        print(f"Processing file: {key}")
        # Download the image from S3
        obj = s3.get_object(Bucket=bucket_name, Key=key)
        img = Image.open(BytesIO(obj['Body'].read())).convert('RGB')
        # Resize and pad the image
        img_resized_padded = resize_and_pad_image(img, target_size)
        # Save the processed image to a buffer
        buffer = BytesIO()
        img_resized_padded.save(buffer, format='JPEG')
        # Upload the processed image to a new location in S3
        new_folder_name = 'preprocessed_new_without'
        destination_key = f'{prefix}preprocessed/{new_folder_name}/{os.path.basename(key)}'
        print(f"Uploading to: {destination_key}")
        s3.put_object(Bucket=bucket_name, Key=destination_key, Body=buffer.getvalue())
        print(f"Successfully processed and uploaded: {key}")
    except Exception as e:
        print(f"Error processing file {key}: {e}")

# List valid image files and process them
files = list_image_files_in_s3(prefix)
for file in files:
    resize_pad_and_upload_image(file, target_size=224)


Total number of image files found: 65
Processing file: preprocessed/preprocessed/unprocessed/2023_0524_163435_001.JPG
Uploading to: preprocessed/preprocessed/unprocessed/preprocessed/preprocessed_new_without/2023_0524_163435_001.JPG
Successfully processed and uploaded: preprocessed/preprocessed/unprocessed/2023_0524_163435_001.JPG
Processing file: preprocessed/preprocessed/unprocessed/2023_0524_163517_001.JPG
Uploading to: preprocessed/preprocessed/unprocessed/preprocessed/preprocessed_new_without/2023_0524_163517_001.JPG
Successfully processed and uploaded: preprocessed/preprocessed/unprocessed/2023_0524_163517_001.JPG
Processing file: preprocessed/preprocessed/unprocessed/2023_0524_163534_001.JPG
Uploading to: preprocessed/preprocessed/unprocessed/preprocessed/preprocessed_new_without/2023_0524_163534_001.JPG
Successfully processed and uploaded: preprocessed/preprocessed/unprocessed/2023_0524_163534_001.JPG
Processing file: preprocessed/preprocessed/unprocessed/2023_0524_163609_001.J

Uploading to: preprocessed/preprocessed/unprocessed/preprocessed/preprocessed_new_without/DSCF0033.JPG
Successfully processed and uploaded: preprocessed/preprocessed/unprocessed/DSCF0033.JPG
Processing file: preprocessed/preprocessed/unprocessed/DSCF0034.JPG
Uploading to: preprocessed/preprocessed/unprocessed/preprocessed/preprocessed_new_without/DSCF0034.JPG
Successfully processed and uploaded: preprocessed/preprocessed/unprocessed/DSCF0034.JPG
Processing file: preprocessed/preprocessed/unprocessed/DSCF0035.JPG
Uploading to: preprocessed/preprocessed/unprocessed/preprocessed/preprocessed_new_without/DSCF0035.JPG
Successfully processed and uploaded: preprocessed/preprocessed/unprocessed/DSCF0035.JPG
Processing file: preprocessed/preprocessed/unprocessed/DSCF0036.JPG
Uploading to: preprocessed/preprocessed/unprocessed/preprocessed/preprocessed_new_without/DSCF0036.JPG
Successfully processed and uploaded: preprocessed/preprocessed/unprocessed/DSCF0036.JPG
Processing file: preprocessed/pr

In [2]:
from PIL import Image
from io import BytesIO
import boto3
import os

# Set up S3 client
s3 = boto3.client('s3')
bucket_name = 'csml-data-bucket'
prefix = 'For Matt with birds/For Matt with birds/'

def list_image_files_in_s3(prefix):
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    # Filter for image file extensions
    valid_extensions = ['.jpg', '.jpeg', '.png']
    files = [content['Key'] for content in response.get('Contents', []) if any(content['Key'].lower().endswith(ext) for ext in valid_extensions)]
    print(f"Number of image files found: {len(files)}")
    return files


files = list_image_files_in_s3(prefix)

Number of image files found: 1000
