# Data download and preprocessing

## Dependencies

In [None]:
# Uncomment the following line to install the kaggle API
#!pip install -q kaggle

## Imports

In [None]:
import cv2
import os
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
from zipfile import ZipFile

## Download

In [None]:
# Uncomment the following line to download the dataset
#!kaggle datasets download -d shmalex/instagram-images

## Unzip

In [None]:
# Extract the downloaded archive once; an existing "img_data" folder is taken
# to mean a previous run already extracted it.
if not os.path.exists("img_data"):
    # Bind the archive to a name other than `zip`: rebinding the builtin at
    # notebook scope would break every later `zip(...)` call in the session.
    with ZipFile("instagram-images.zip", "r") as archive:
        archive.extractall("img_data/")
        print("Dataset extracted")
else:
    print("Dataset already extracted")

## Resize images

Center-crop all images in place to 512x512. Images that are smaller than 512x512 in either dimension, or that cannot be read (e.g. corrupted files), are deleted.

Note: The multithreading implemented here is not true parallelism due to Python's Global Interpreter Lock (GIL).
However, it allows threads to concurrently wait for disk I/O responses, which often takes the most time,
potentially improving overall performance for this I/O-bound task.

In [None]:
import cv2
import os
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

def center_crop(image, crop_size=(512, 512)):
    """Return the centered window of ``image`` with the requested size.

    Args:
        image: Array whose first two axes are height and width. Any further
            axes (e.g. color channels) are kept untouched, so both 2-D
            grayscale and 3-D color arrays are accepted.
        crop_size: ``(height, width)`` of the window to cut out.

    Returns:
        A slice (view) of ``image`` centered on the middle of the image. If
        the image is smaller than ``crop_size`` along an axis, the full
        extent of that axis is returned instead.
    """
    # Only the first two axes matter; do not assume a channel axis exists.
    h, w = image.shape[:2]
    ch, cw = crop_size
    # Clamp to 0 so an oversized crop never yields a negative start index,
    # which would silently slice from the end of the array.
    start_x = max(w // 2 - cw // 2, 0)
    start_y = max(h // 2 - ch // 2, 0)
    return image[start_y:start_y + ch, start_x:start_x + cw]

def process_image(image_file, crop_size=(512, 512)):
    """Center-crop one image file in place, or delete it if unusable.

    Args:
        image_file: Path of the image to process.
        crop_size: Target ``(height, width)``.

    Returns:
        One of the status strings:
        ``'skipped'``   - the image already has exactly ``crop_size``;
        ``'deleted'``   - the image was smaller than ``crop_size`` and removed;
        ``'resized'``   - the image was center-cropped and written back;
        ``'corrupted'`` - reading, cropping or writing failed; the file is
        removed on a best-effort basis.

    This function does not raise for per-file failures, so a single bad file
    cannot abort a batch that runs it in worker threads.
    """
    try:
        image = cv2.imread(str(image_file))
        if image is None:
            raise ValueError("Image could not be read")

        if not isinstance(image, np.ndarray):
            raise ValueError("Image is not a valid numpy array")

        # Only height and width are needed; avoid assuming a channel axis.
        h, w = image.shape[:2]
        if h == crop_size[0] and w == crop_size[1]:
            return 'skipped'
        elif h < crop_size[0] or w < crop_size[1]:
            os.remove(image_file)
            return 'deleted'
        else:
            cropped_image = center_crop(image, crop_size)
            # cv2.imwrite signals failure through its return value, so check
            # it rather than reporting a crop that never reached the disk.
            if not cv2.imwrite(str(image_file), cropped_image):
                raise ValueError("Image could not be written")
            return 'resized'

    except Exception as e:
        print(f"Error processing {image_file}: {str(e)}")
        # The file may already be gone or not removable; a failure here must
        # not escape the handler and crash the caller.
        try:
            os.remove(image_file)
        except OSError as remove_error:
            print(f"Could not delete {image_file}: {remove_error}")
        return 'corrupted'

def process_images(input_folder, crop_size=(512, 512), max_workers=4):
    """Apply ``process_image`` to every JPEG below ``input_folder``.

    Args:
        input_folder: Root directory; it is searched recursively.
        crop_size: Target ``(height, width)`` forwarded to ``process_image``.
        max_workers: Size of the thread pool.

    Returns:
        A dict with the keys ``'deleted'``, ``'resized'``, ``'corrupted'`` and
        ``'skipped'``, each mapped to the number of files with that outcome.

    Any exception raised inside a worker is re-raised here when its result is
    collected.
    """
    jpeg_suffixes = ['.jpg', '.jpeg']
    image_files = [
        candidate
        for candidate in Path(input_folder).rglob('*')
        if candidate.is_file() and candidate.suffix.lower() in jpeg_suffixes
    ]

    results = dict.fromkeys(('deleted', 'resized', 'corrupted', 'skipped'), 0)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Queue every file up front, then tally outcomes as workers finish.
        pending = [
            executor.submit(process_image, path, crop_size)
            for path in image_files
        ]

        with tqdm(total=len(image_files), desc="Processing Images") as pbar:
            for finished in as_completed(pending):
                results[finished.result()] += 1
                pbar.update(1)

    return results

# Process the extracted dataset with one worker thread per reported CPU.
input_folder = 'img_data'
results = process_images(input_folder, max_workers=os.cpu_count())

# Print one summary line per outcome category.
report_lines = (
    f"Number of images deleted (too small): {results['deleted']}",
    f"Number of images resized: {results['resized']}",
    f"Number of corrupted images deleted: {results['corrupted']}",
    f"Number of images skipped (already correct size): {results['skipped']}",
)
for report_line in report_lines:
    print(report_line)