# FETCH TRAINING DATA

This notebook prepares image data for training. It downloads a zip file from an Amazon S3 bucket, extracts its contents, creates the required directories, splits the images into training, validation, and test sets (moving or copying them into per-category folders), and uploads the zipped result back to S3. The cells below use this to build a balanced per-category sample with a 60/20/20 split. The generated data is structured and ready for consumption by our training process.

In [24]:
# Importing libraries:
import os
import shutil
import zipfile
import boto3
import random
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [21]:
def download_zip_from_s3(s3_client, bucket_name, zip_file_key, zip_file_path):
    """Copy a zip archive from an S3 bucket to the local filesystem.

    Args:
        s3_client: Object exposing ``download_file(bucket, key, filename)``;
            the notebook passes a ``boto3`` S3 client.
        bucket_name: Name of the bucket that holds the archive.
        zip_file_key: Object key of the archive inside the bucket.
        zip_file_path: Local path the archive is written to.
    """
    remote_object = (bucket_name, zip_file_key)
    s3_client.download_file(*remote_object, zip_file_path)

def extract_zip_file(zip_file_path, extract_directory):
    """Unpack every member of a zip archive into ``extract_directory``.

    Args:
        zip_file_path: Path of the zip archive to read.
        extract_directory: Directory the archive members are written under.
    """
    archive = zipfile.ZipFile(zip_file_path, mode='r')
    try:
        archive.extractall(path=extract_directory)
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()

def create_directories(directories):
    """Ensure that each path in ``directories`` exists as a directory.

    Missing parent directories are created as needed and paths that
    already exist are left untouched.

    Args:
        directories: Iterable of directory paths to create.
    """
    for target in directories:
        os.makedirs(target, exist_ok=True)

def split_and_move_images(category_path, train_dir, val_dir, test_dir, category, sample_fraction):
    """Sample one category's images and move them into train/val/test folders.

    Only regular files directly inside ``category_path`` are treated as
    images, so stray sub-directories (for example notebook checkpoint
    folders) are never sampled or moved. The sample is split 60% / 20% / 20%
    into training, validation and test sets with a fixed ``random_state`` and
    each image is moved to ``<split_dir>/<category>/``.

    Args:
        category_path: Directory holding the images of one category.
        train_dir: Root directory of the training set.
        val_dir: Root directory of the validation set.
        test_dir: Root directory of the test set.
        category: Category name, used as the sub-directory name in each split.
        sample_fraction: Fraction of the category's images to sample.

    Raises:
        ValueError: From ``train_test_split`` when the sample is non-empty but
            too small for every split to receive at least one image.
    """
    # Keep regular files only, matching how count_images_in_categories counts.
    images = [
        name for name in os.listdir(category_path)
        if os.path.isfile(os.path.join(category_path, name))
    ]
    sampled_images = sample_images(images, sample_fraction)
    if not sampled_images:
        # Nothing was sampled, so there is nothing to split or move.
        return

    # 60% train, then the remaining 40% is halved into validation and test.
    train_images, temp_images = train_test_split(sampled_images, test_size=0.4, random_state=23)
    val_images, test_images = train_test_split(temp_images, test_size=0.5, random_state=23)

    splits = (
        (train_dir, train_images),
        (val_dir, val_images),
        (test_dir, test_images),
    )

    # Create all destination folders before moving anything.
    for split_dir, _ in splits:
        os.makedirs(os.path.join(split_dir, category), exist_ok=True)

    for split_dir, split_images in splits:
        destination = os.path.join(split_dir, category)
        for image in split_images:
            shutil.move(os.path.join(category_path, image), os.path.join(destination, image))

def sample_images(images, sample_fraction):
    """Return a random subset of ``images`` without replacement.

    Args:
        images: Sequence of items to draw from.
        sample_fraction: Fraction of ``images`` to keep; the resulting size
            is truncated towards zero.

    Returns:
        A new list with ``int(len(images) * sample_fraction)`` items.
    """
    how_many = int(sample_fraction * len(images))
    return random.sample(images, k=how_many)

def zip_directory(directory_path, zip_file_path):
    """Zip the contents of a directory.

    Every file below ``directory_path`` is stored under its path relative to
    ``directory_path``. If ``zip_file_path`` itself lies inside
    ``directory_path`` the archive being written is skipped, so it never ends
    up containing a partial copy of itself.

    NOTE: a later definition named ``zip_directory`` in this file replaces
    this one once both have been executed.

    Args:
        directory_path: Directory whose files are archived.
        zip_file_path: Path of the zip archive to create (overwritten if it
            already exists).
    """
    archive_abspath = os.path.abspath(zip_file_path)
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                file_path = os.path.join(root, file)
                # The archive already exists on disk while we walk the tree;
                # do not add it to itself.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                arcname = os.path.relpath(file_path, directory_path)
                zipf.write(file_path, arcname)

def upload_zip_to_s3(s3_client, bucket_name, zip_file_path, zip_file_key):
    """Send a local zip archive to an S3 bucket.

    Args:
        s3_client: Object exposing ``upload_file(filename, bucket, key)``;
            the notebook passes a ``boto3`` S3 client.
        bucket_name: Name of the destination bucket.
        zip_file_path: Local path of the archive to upload.
        zip_file_key: Object key the archive is stored under.
    """
    destination = (bucket_name, zip_file_key)
    s3_client.upload_file(zip_file_path, *destination)


def zip_directory(directory_to_zip, zip_file_name):
    """Zip a directory with ``shutil.make_archive`` and report success.

    This definition replaces the earlier ``zip_directory`` defined above in
    this file.

    Only a trailing ``.zip`` is stripped before the name is handed to
    ``make_archive`` (which appends the extension itself), so a ``.zip``
    occurring elsewhere in the path is left alone. When the archive would be
    created inside ``directory_to_zip`` it is first built in a temporary
    directory and then moved into place; otherwise the half-written archive
    would be picked up while the directory is walked and stored in itself.

    Args:
        directory_to_zip: Directory whose contents are archived.
        zip_file_name: Path of the archive to create; ``.zip`` is appended
            when missing.

    Returns:
        True if the archive was created, False if any error occurred (the
        error is printed rather than raised).
    """
    import tempfile  # local import keeps this helper self-contained

    try:
        suffix = '.zip'
        if zip_file_name.endswith(suffix):
            archive_base = zip_file_name[:-len(suffix)]
        else:
            archive_base = zip_file_name
        archive_path = archive_base + suffix

        source_root = os.path.abspath(directory_to_zip)
        target_path = os.path.abspath(archive_path)
        # os.path.join(..., '') appends exactly one trailing separator.
        if target_path.startswith(os.path.join(source_root, '')):
            staging_dir = tempfile.mkdtemp()
            try:
                staged_archive = shutil.make_archive(
                    os.path.join(staging_dir, 'archive'), 'zip', directory_to_zip
                )
                os.makedirs(os.path.dirname(target_path), exist_ok=True)
                shutil.move(staged_archive, target_path)
            finally:
                shutil.rmtree(staging_dir, ignore_errors=True)
        else:
            shutil.make_archive(archive_base, 'zip', directory_to_zip)

        print(f"Directory {directory_to_zip} has been zipped into {archive_path}")
        return True
    except Exception as e:
        # Best-effort helper: callers check the boolean result.
        print(f"An error occurred while zipping the directory: {str(e)}")
        return False

def upload_to_s3(zip_file_name, s3_bucket, s3_object_name):
    """Upload a local file to S3 using a freshly created boto3 client.

    Args:
        zip_file_name: Local path of the file to upload.
        s3_bucket: Name of the destination bucket.
        s3_object_name: Object key the file is stored under.

    Returns:
        True when the upload call completed, False when any exception was
        raised (the error is printed rather than propagated).
    """
    uploaded = False
    try:
        # Client creation sits inside the try so its failures are reported
        # the same way as upload failures.
        boto3.client('s3').upload_file(zip_file_name, s3_bucket, s3_object_name)
        print(f"File {zip_file_name} has been uploaded to s3://{s3_bucket}/{s3_object_name}")
        uploaded = True
    except Exception as e:
        print(f"An error occurred while uploading the file to S3: {str(e)}")
    return uploaded

def create_balanced_dataset(base_directory, target_sample_size, output_directory, category_counts,
                            min_category_size=1000):
    """Copy a capped random sample of each large category into train/val/test.

    Categories whose count in ``category_counts`` is below
    ``min_category_size`` are ignored. For every remaining category up to
    ``target_sample_size`` images are sampled, split 60% / 20% / 20% with a
    fixed ``random_state`` and copied to
    ``<output_directory>/<split>/<category>/`` for the splits ``train``,
    ``val`` and ``test``. The source images are left in place.

    Args:
        base_directory: Directory containing one sub-directory per category.
        target_sample_size: Maximum number of images sampled per category.
        output_directory: Root directory the split folders are created under.
        category_counts: Mapping of category name to its image count.
        min_category_size: Minimum count a category needs to be included.

    Returns:
        A ``defaultdict(list)`` mapping each processed category to the list
        of sampled file names. Categories that were filtered out, whose
        directory is missing, or that contain no files are absent.

    Raises:
        ValueError: From ``train_test_split`` when a sample is too small for
            every split to receive at least one image.
    """
    eligible_counts = {
        category: count
        for category, count in category_counts.items()
        if count >= min_category_size
    }

    # Create output directories if they don't exist
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(output_directory, split), exist_ok=True)

    # Dictionary to store selected samples
    sampled_images = defaultdict(list)

    for category, count in eligible_counts.items():
        category_path = os.path.join(base_directory, category)
        if not os.path.isdir(category_path):
            continue

        # Regular files only: a sub-directory cannot be copied as an image and
        # is not included in the counts from count_images_in_categories.
        images = [
            name for name in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, name))
        ]
        if not images:
            continue

        # Clamp to what is really on disk in case the supplied count is stale.
        sample_size = min(target_sample_size, count, len(images))
        sampled_images[category] = random.sample(images, sample_size)

        # 60% train, then the remaining 40% is halved into val and test.
        train_images, temp_images = train_test_split(sampled_images[category], test_size=0.4, random_state=23)
        val_images, test_images = train_test_split(temp_images, test_size=0.5, random_state=23)

        for split, split_images in zip(['train', 'val', 'test'], [train_images, val_images, test_images]):
            split_dir = os.path.join(output_directory, split, category)
            os.makedirs(split_dir, exist_ok=True)
            for image in split_images:
                shutil.copy(os.path.join(category_path, image), os.path.join(split_dir, image))

    return sampled_images

def count_images_in_categories(base_directory):
    """Tally the regular files inside each sub-directory of ``base_directory``.

    Args:
        base_directory: Directory containing one sub-directory per category.

    Returns:
        A dict mapping each sub-directory name to the number of files directly
        inside it. Entries of ``base_directory`` that are not directories are
        ignored, as are nested sub-directories.
    """
    tallies = {}
    for entry in os.listdir(base_directory):
        folder = os.path.join(base_directory, entry)
        if not os.path.isdir(folder):
            continue
        tallies[entry] = sum(
            1 for item in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, item))
        )
    return tallies

In [5]:
# Where the raw dataset archive lives in S3 and where it is downloaded to.
bucket_name = 'awsmlnn-dev'
zip_file_key = 'data/Re-PolyVore.zip'
zip_file_path = '/tmp/Re-PolyVore.zip'

# Client shared by the download step below.
s3_client = boto3.client('s3')

In [6]:
# Local working directories for the project data.
data_directory = '/home/sagemaker-user/AWSNN/DL/Project/data'
extract_directory = os.path.join(data_directory, 'raw_data')

# Fetch the archive and unpack it under raw_data/.
download_zip_from_s3(s3_client, bucket_name, zip_file_key, zip_file_path)
extract_zip_file(zip_file_path, extract_directory)

# The extracted 'Re-PolyVore' folder is listed to obtain the categories.
raw_data_categories = os.path.join(extract_directory, 'Re-PolyVore')
categories = os.listdir(raw_data_categories)

In [7]:
# Display the entries found in the extracted 'Re-PolyVore' directory.
categories

['bag',
 'bracelet',
 'brooch',
 'dress',
 'earrings',
 'eyewear',
 'gloves',
 'hairwear',
 'hats',
 'jumpsuit',
 'legwear',
 'necklace',
 'neckwear',
 'outwear',
 'pants',
 'rings',
 'shoes',
 'skirt',
 'top',
 'watches']

In [8]:
# Directory layout for a train/validation/test split under split_data_sample/.
split_data_directory = os.path.join(data_directory, 'split_data_sample')
train_directory, val_directory, test_directory = (
    os.path.join(split_data_directory, split_name)
    for split_name in ('train', 'validation', 'test')
)

In [10]:
# Number of files per category folder in the raw dataset.
category_counts = count_images_in_categories(base_directory=raw_data_categories)

In [11]:
# Display the per-category file counts computed above.
category_counts

{'bag': 12762,
 'bracelet': 3105,
 'brooch': 598,
 'dress': 4488,
 'earrings': 3306,
 'eyewear': 4009,
 'gloves': 233,
 'hairwear': 416,
 'hats': 1749,
 'jumpsuit': 179,
 'legwear': 122,
 'necklace': 2799,
 'neckwear': 714,
 'outwear': 6102,
 'pants': 5375,
 'rings': 1937,
 'shoes': 12082,
 'skirt': 3185,
 'top': 11639,
 'watches': 1375}

In [17]:
# Destination of the balanced sample and the raw category folders it is drawn from.
output_directory = '/home/sagemaker-user/AWSNN/DL/Project/data/sample_data'
base_directory = raw_data_categories

In [38]:
# Cap on the number of images sampled from each category.
target_sample_size = 2000
sampled_images = create_balanced_dataset(
    base_directory=base_directory,
    target_sample_size=target_sample_size,
    output_directory=output_directory,
    category_counts=category_counts,
)

# Report how many images were sampled for each category.
for category, images in sampled_images.items():
    print(f"Category: {category}, Sampled Count: {len(images)}")

Category: bag, Sampled Count: 2000
Category: bracelet, Sampled Count: 2000
Category: dress, Sampled Count: 2000
Category: earrings, Sampled Count: 2000
Category: eyewear, Sampled Count: 2000
Category: hats, Sampled Count: 1749
Category: necklace, Sampled Count: 2000
Category: outwear, Sampled Count: 2000
Category: pants, Sampled Count: 2000
Category: rings, Sampled Count: 1937
Category: shoes, Sampled Count: 2000
Category: skirt, Sampled Count: 2000
Category: top, Sampled Count: 2000
Category: watches, Sampled Count: 1375


In [16]:
# Per-category file counts of the test split; the path is built with
# os.path.join rather than string concatenation.
count_images_in_categories(os.path.join(output_directory, "test"))

{'bag': 400,
 'bracelet': 400,
 'dress': 400,
 'earrings': 400,
 'eyewear': 400,
 'hats': 350,
 'necklace': 400,
 'outwear': 400,
 'pants': 400,
 'rings': 388,
 'shoes': 400,
 'skirt': 400,
 'top': 400,
 'watches': 275}

In [12]:
# Per-category file counts of the train split; the path is built with
# os.path.join rather than string concatenation.
count_images_in_categories(os.path.join(output_directory, "train"))

{'bag': 1200,
 'bracelet': 1200,
 'dress': 1200,
 'earrings': 1200,
 'eyewear': 1200,
 'hats': 1049,
 'necklace': 1200,
 'outwear': 1200,
 'pants': 1200,
 'rings': 1162,
 'shoes': 1200,
 'skirt': 1200,
 'top': 1200,
 'watches': 825}

In [26]:
directory_to_zip = '/home/sagemaker-user/AWSNN/DL/Project/data/sample_data'
# The archive is written next to, not inside, the directory being zipped:
# zip_directory relies on shutil.make_archive, which walks the directory after
# the archive file has been created, so an archive placed inside
# directory_to_zip would end up containing a partial copy of itself.
zip_file_name = '/home/sagemaker-user/AWSNN/DL/Project/data/data_project.zip'
s3_bucket = 'awsmlnn-dev'
s3_object_name = 'data/data_project.zip'

zip_directory(directory_to_zip, zip_file_name)

In [28]:
# Push the zipped sample to S3; the cell displays the boolean result.
upload_to_s3(
    zip_file_name=zip_file_name,
    s3_bucket=s3_bucket,
    s3_object_name=s3_object_name,
)

File /home/sagemaker-user/AWSNN/DL/Project/data/testing/data_project.zip has been uploaded to s3://awsmlnn-dev/data/data_project.zip


True