In [7]:
# 1️⃣ Mount Google Drive at /content/drive so the cells below can read from
#    and write to paths under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# 2️⃣ Copy the zip (via the shortcut) from Drive into local storage
!cp "/content/drive/MyDrive/UTKface.zip" /content/UTKface.zip

# 3️⃣ Unzip it to /content/UTKface
#    -q keeps the output quiet; -o overwrites existing files without prompting,
#    so re-running this cell does not stall on unzip's interactive "replace?" question.
!unzip -q -o /content/UTKface.zip -d /content/UTKface


In [17]:
%%writefile prepare_utkface_pkl.py
import os
import random
import argparse
import pickle
from tqdm import tqdm
from PIL import Image
import numpy as np
import torch

def parse_filename(filename):
    """Extract the integer labels encoded at the start of an image file name.

    The name is split on underscores and the first three fields are read as
    age, gender and ethnicity, in that order. Any further fields are ignored.

    Args:
        filename: Base name of the image file (no directory part needed).

    Returns:
        A tuple ``(age, gender, ethnicity)`` of ints.

    Raises:
        ValueError: If one of the first three fields is not an integer.
        IndexError: If the name has fewer than three underscore-separated fields.
    """
    fields = filename.split('_')
    # Evaluated left to right, so a non-numeric early field is reported
    # before a missing later one.
    return int(fields[0]), int(fields[1]), int(fields[2])


def load_and_process(path, size):
    """Load an image file and return it as a normalized channel-first tensor.

    The image is converted to RGB, resized to ``size`` x ``size`` with
    bilinear resampling (aspect ratio is not preserved), and scaled from
    0..255 to 0.0..1.0.

    Args:
        path: Path of the image file to read.
        size: Side length, in pixels, of the square output image.

    Returns:
        A float tensor of shape ``(3, size, size)`` with values in [0, 1].
    """
    # Open via a context manager so the file handle is released even when
    # decoding fails; convert() returns an independent copy that stays valid
    # after the source image is closed.
    with Image.open(path) as src:
        img = src.convert('RGB')
    img = img.resize((size, size), Image.BILINEAR)
    arr = np.array(img, dtype=np.uint8)  # (size, size, 3), channel-last
    # Move channels first and normalize to [0, 1]
    tensor = torch.from_numpy(arr).permute(2, 0, 1).float().div(255.0)
    return tensor


def main(input_dir: str, output_dir: str, size: int = 224, chunks: int = 24):
    """Convert every image under ``input_dir`` into pickled chunks of tensors.

    All ``.jpg``/``.png`` files found recursively under ``input_dir`` are
    shuffled with a fixed seed and split into ``chunks`` consecutive groups;
    the last group also receives the remainder of the integer division. Each
    group is written to ``<output_dir>/<idx>.pkl`` as a dict with the keys
    ``'images'`` (list of tensors from ``load_and_process``), ``'ages'``,
    ``'genders'`` and ``'ethnicities'`` (lists of ints from ``parse_filename``).
    Files whose names cannot be parsed are skipped and counted.

    Args:
        input_dir: Root folder that is walked recursively for images.
        output_dir: Folder that receives the pickle files (created if missing).
        size: Side length passed to ``load_and_process`` for resizing.
        chunks: Number of pickle files to produce; must be at least 1.

    Raises:
        ValueError: If ``chunks`` is smaller than 1.
    """
    # Validate before touching the filesystem: chunks == 0 would otherwise
    # fail with ZeroDivisionError only after the output directory was created,
    # and a negative value would silently produce nothing.
    if chunks < 1:
        raise ValueError(f"chunks must be >= 1, got {chunks}")

    os.makedirs(output_dir, exist_ok=True)
    # Collect all image file paths
    file_paths = []
    for root, _, files in os.walk(input_dir):
        for fname in files:
            if fname.lower().endswith(('.jpg', '.png')):
                file_paths.append(os.path.join(root, fname))
    print(f"Found {len(file_paths)} images under {input_dir}")

    # Directory listing order depends on the filesystem, so sort first;
    # otherwise the fixed seed below would not yield a reproducible split.
    file_paths.sort()
    random.seed(42)
    random.shuffle(file_paths)
    total = len(file_paths)
    chunk_size = total // chunks

    # Process in chunks to limit RAM usage
    for idx in range(chunks):
        start = idx * chunk_size
        # The final chunk absorbs the remainder left by the integer division
        end = (idx + 1) * chunk_size if idx < chunks - 1 else total
        chunk_paths = file_paths[start:end]

        images, ages, genders, ethnics = [], [], [], []
        skipped = 0
        for path in tqdm(chunk_paths, desc=f"Chunk {idx}"):
            name = os.path.basename(path)
            try:
                age, gender, ethnicity = parse_filename(name)
            except (ValueError, IndexError):
                # Name does not start with three integer fields; skip it.
                # Only these two errors are caught so that Ctrl-C and real
                # bugs are not swallowed.
                skipped += 1
                continue
            tensor = load_and_process(path, size)
            images.append(tensor)
            ages.append(age)
            genders.append(gender)
            ethnics.append(ethnicity)

        out_path = os.path.join(output_dir, f"{idx}.pkl")
        with open(out_path, 'wb') as f:
            pickle.dump({'images': images,
                         'ages': ages,
                         'genders': genders,
                         'ethnicities': ethnics}, f)
        print(f"Saved chunk {idx} with {len(images)} samples to {out_path}")
        if skipped:
            print(f"Skipped {skipped} files with unparseable names in chunk {idx}")
        # Drop the references so this chunk's tensors can be reclaimed before
        # the next one is built (no CUDA tensors are created here, so there
        # is no GPU cache to empty).
        del images, ages, genders, ethnics

if __name__ == '__main__':
    # Command-line entry point: declare the options in one table, parse
    # them, and forward the values positionally to main().
    cli = argparse.ArgumentParser()
    option_table = (
        ('--input_dir', dict(type=str, required=True,
                             help='Root UTKFace folder to recurse')),
        ('--output_dir', dict(type=str, required=True,
                              help='Directory to write PKL chunks')),
        ('--size', dict(type=int, default=224,
                        help='Image resize dimension')),
        ('--chunks', dict(type=int, default=24,
                          help='Number of chunks to split into')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()
    main(opts.input_dir, opts.output_dir, opts.size, opts.chunks)



Overwriting prepare_utkface_pkl.py


In [18]:
# 4️⃣ Preprocess the freshly unzipped folder: resize to 224 px and write
#    24 pickle chunks to the pkl_files folder on Drive
!python prepare_utkface_pkl.py \
    --input_dir="/content/UTKface" \
    --output_dir="/content/drive/MyDrive/pkl_files" \
    --size=224 \
    --chunks=24


Found 24068 images under /content/UTKface
Chunk 0: 100% 1002/1002 [00:12<00:00, 82.44it/s]
Saved chunk 0 with 1002 samples to /content/drive/MyDrive/pkl_files/0.pkl
Chunk 1: 100% 1002/1002 [00:11<00:00, 88.08it/s]
Saved chunk 1 with 1002 samples to /content/drive/MyDrive/pkl_files/1.pkl
Chunk 2: 100% 1002/1002 [00:15<00:00, 66.53it/s]
Saved chunk 2 with 1002 samples to /content/drive/MyDrive/pkl_files/2.pkl
Chunk 3: 100% 1002/1002 [00:11<00:00, 87.15it/s]
Saved chunk 3 with 1002 samples to /content/drive/MyDrive/pkl_files/3.pkl
Chunk 4: 100% 1002/1002 [00:13<00:00, 74.82it/s]
Saved chunk 4 with 1002 samples to /content/drive/MyDrive/pkl_files/4.pkl
Chunk 5: 100% 1002/1002 [00:15<00:00, 65.80it/s]
Saved chunk 5 with 1002 samples to /content/drive/MyDrive/pkl_files/5.pkl
Chunk 6: 100% 1002/1002 [00:13<00:00, 72.64it/s]
Saved chunk 6 with 1002 samples to /content/drive/MyDrive/pkl_files/6.pkl
Chunk 7: 100% 1002/1002 [00:13<00:00, 73.95it/s]
Saved chunk 7 with 1002 samples to /content/driv

Note: at some point during our work, the pickle files were renamed from `<num>.pkl` (e.g. `5.pkl`) to zero-padded `chunk_<num>.pkl` (e.g. `chunk_05.pkl`).