## Data Preparation

In [None]:
import os

import numpy as np
import pandas as pd


def load_utkface_dataset(
    dataset_path: str,
    n: int = -1,
    seed: int = 42,
) -> pd.DataFrame:
    """Index the ``.jpg`` files in a directory and parse labels from their names.

    The part of each filename before the first ``.`` is split on ``_``; the
    first two fields are parsed as integers and stored as ``age`` and
    ``gender``. Files whose names cannot be parsed are reported on stdout and
    skipped.

    Args:
        dataset_path: Directory that contains the image files.
        n: Maximum number of files to keep after shuffling. Values ``<= 0``
            keep every file.
        seed: Seed passed to ``np.random.seed`` before shuffling. Note that
            this reseeds NumPy's global random generator.

    Returns:
        A DataFrame with the columns ``path``, ``age`` and ``gender``. It can
        hold fewer than ``n`` rows because invalid filenames are dropped after
        the subset has been selected.
    """
    # os.listdir() returns entries in arbitrary order, so sort before the seeded
    # shuffle; otherwise the same seed can select different files on different
    # machines or runs.
    filenames = sorted(f for f in os.listdir(dataset_path) if f.endswith(".jpg"))
    paths = np.array([os.path.join(dataset_path, f) for f in filenames])
    np.random.seed(seed)
    np.random.shuffle(paths)

    if n > 0:
        paths = paths[:n]

    data = []
    for path in paths:
        filename = os.path.basename(path).split(".")[0]
        try:
            # Too few "_"-separated fields or non-numeric fields both raise ValueError.
            age, gender, *_ = filename.split("_")
            data.append([path, int(age), int(gender)])
        except ValueError:
            print(f"Invalid filename: {filename}")
            continue

    return pd.DataFrame(data, columns=["path", "age", "gender"])


# Work on a shuffled subset of at most 1,000 files (default seed) and preview the parsed labels.
df = load_utkface_dataset("../images/utkface", n=1000)
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_label_distributions(df: pd.DataFrame):
    """Show the age histogram and the gender counts of ``df`` side by side.

    Args:
        df: Frame with an ``age`` and a ``gender`` column.
    """
    fig, (age_ax, gender_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

    # (plotting function, column to plot, x-axis label, target axes)
    panels = (
        (sns.histplot, "age", "Age", age_ax),
        (sns.countplot, "gender", "Gender", gender_ax),
    )
    for draw, column, label, ax in panels:
        draw(df, x=column, ax=ax)
        ax.set_xlabel(label)
        ax.set_ylabel("")

    fig.suptitle(f"Class Distributions (n={len(df)})")
    plt.tight_layout()
    plt.show()


# Inspect the label distributions of the loaded sample before any resampling.
plot_label_distributions(df)

In [None]:
# Bucket ages into left-closed 20-year bins ([0, 20), [20, 40), ..., [100, 120)),
# labelling each bin with its lower bound.
df["age_group"] = pd.cut(df["age"], bins=range(0, 121, 20), labels=range(0, 120, 20), right=False)
df.head()

In [None]:
from imblearn.under_sampling import RandomUnderSampler


def resample_dataset(
    dataset: pd.DataFrame,
    sampler_class,
    label_cols: list = ["age_group", "gender"],
    seed: int = 42,
) -> pd.DataFrame:
    """Resample ``dataset`` using a single label built from several columns.

    The values of ``label_cols`` are converted to strings and joined with
    ``_`` row by row; that combined label is the target handed to the sampler.

    Args:
        dataset: Frame to resample; passed to the sampler as the features.
        sampler_class: Class that is instantiated with ``random_state=seed``
            and exposes ``fit_resample(X, y)``.
        label_cols: Columns that together define the class of a row. The
            default list is shared between calls and is only read here.
        seed: Random state forwarded to the sampler.

    Returns:
        The resampled features returned by the sampler; the resampled target
        is discarded.
    """
    label_text = dataset[label_cols].astype(str)
    strata = label_text.agg("_".join, axis=1)
    resampled, _ = sampler_class(random_state=seed).fit_resample(dataset, strata)
    return resampled


# Undersample on the combined age-group/gender label, then re-plot to check the resulting balance.
balanced_data = resample_dataset(df, RandomUnderSampler)
plot_label_distributions(balanced_data)

In [None]:
import io

from PIL import Image
from tqdm import tqdm


# Register tqdm's pandas integration so `progress_apply` (used below) is available.
tqdm.pandas()


def load_image_array(path, color_mode="L", target_size=(96, 96)):
    """Read an image file into a float32 array scaled by 1/255.

    Args:
        path: Location of the image file.
        color_mode: Mode string passed to ``Image.convert``.
        target_size: Size passed to ``Image.resize``.

    Returns:
        The pixel data as ``float32`` divided by 255. For the ``"L"`` mode
        the single channel is stacked three times along a new last axis.
    """
    # Read all bytes up front so the file handle is closed before decoding.
    with open(path, "rb") as handle:
        raw_bytes = handle.read()

    resized = Image.open(io.BytesIO(raw_bytes)).convert(color_mode).resize(target_size)
    pixels = np.array(resized, dtype=np.float32) / 255.0

    if color_mode != "L":
        return pixels
    # Replicate the grayscale channel so the array has three channels.
    return np.stack([pixels] * 3, axis=-1)


# Load every image (with a progress bar) and keep the arrays in an "image" column.
balanced_data.loc[:, "image"] = balanced_data["path"].progress_apply(load_image_array)
balanced_data.head()

In [None]:
def display_images(
    df: pd.DataFrame,
    rows: int,
    cols: int,
    seed: int = 42,
    image_col: str = "image",
):
    """Show a random sample of images from ``df`` in a ``rows`` x ``cols`` grid.

    Args:
        df: Frame whose ``image_col`` column holds arrays accepted by
            ``Axes.imshow``.
        rows: Number of grid rows.
        cols: Number of grid columns.
        seed: Random state used to draw the sample.
        image_col: Name of the column that holds the image arrays.

    If ``df`` has fewer rows than the grid has cells, every row is shown and
    the remaining cells are left blank.
    """
    n_cells = rows * cols
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at len(df).
    n_images = min(n_cells, len(df))
    sample_df = df.sample(n=n_images, random_state=seed)

    fig, axes = plt.subplots(rows, cols, figsize=(cols * 2, rows * 2))
    # plt.subplots returns a bare Axes (not an array) for a 1x1 grid.
    axes_flat = axes.flatten() if n_cells > 1 else [axes]

    images = list(sample_df[image_col])
    for idx, ax in enumerate(axes_flat):
        if idx < len(images):
            ax.imshow(images[idx], cmap="gray")
        # Hide ticks and frame on used and unused cells alike.
        ax.axis("off")

    plt.tight_layout()
    plt.show()

# Preview a 3x5 grid of randomly sampled images from the balanced data.
display_images(balanced_data, 3, 5)

In [None]:
from keras.api.utils import to_categorical
from sklearn.model_selection import train_test_split


def split_data(
    df: pd.DataFrame,
    image_col: str = "image",
    label_col: str = "gender",
    test_size: float = 0.1,
    validation_size: float = 0.3,
    seed: int = 42,
) -> tuple:
    """Split images and labels into stratified train/validation/test sets.

    ``test_size`` and ``validation_size`` are both fractions of the *whole*
    dataset; the training set receives the remainder. With the defaults this
    is a 60/30/10 split.

    Args:
        df: Frame that holds the image arrays and the labels.
        image_col: Column with equally shaped image arrays (they are stacked
            into a single array).
        label_col: Column with integer class labels; they are one-hot encoded
            with two classes.
        test_size: Fraction of all samples reserved for the test set.
        validation_size: Fraction of all samples reserved for validation.
        seed: Random state used for both splits.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the ``y``
        arrays are one-hot encoded.

    Raises:
        ValueError: If either fraction is outside ``(0, 1)`` or the two
            fractions leave no data for training.
    """
    if not (0 < test_size < 1 and 0 < validation_size < 1):
        raise ValueError("test_size and validation_size must both lie in (0, 1)")
    if test_size + validation_size >= 1:
        raise ValueError("test_size + validation_size must be smaller than 1")

    images = np.stack(df[image_col].values)
    labels = df[label_col].values

    # First carve out the test set as a fraction of the full dataset ...
    X_rest, X_test, y_rest, y_test = train_test_split(
        images,
        labels,
        test_size=test_size,
        random_state=seed,
        stratify=labels,
    )
    # ... then take the validation set from the remainder. The fraction is
    # rescaled because X_rest only holds (1 - test_size) of the data.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest,
        y_rest,
        test_size=validation_size / (1.0 - test_size),
        random_state=seed,
        stratify=y_rest,
    )

    y_train = to_categorical(y_train, num_classes=2)
    y_val = to_categorical(y_val, num_classes=2)
    y_test = to_categorical(y_test, num_classes=2)
    return X_train, X_val, X_test, y_train, y_val, y_test


# Build the train/validation/test arrays from the balanced data (default label column: gender).
X_train, X_val, X_test, y_train, y_val, y_test = split_data(balanced_data)

## Model Training

In [None]:
from keras.api.layers import Dense, Dropout, GlobalAveragePooling2D, Flatten
from keras.api.models import Model
from keras.api.optimizers import Adam
from keras.api.regularizers import l2


def build_model(hp, arch):
    """Build and compile a two-class classifier on top of a frozen backbone.

    Args:
        hp: Hyperparameter container providing ``Int``, ``Float`` and
            ``Choice``; it supplies ``dense_unit``, ``dropout_rate`` and
            ``learning_rate``.
        arch: Callable that builds the backbone. It is called with
            ``weights="imagenet"``, ``include_top=False`` and a 96x96x3 input.

    Returns:
        The compiled model (Adam optimizer, categorical cross-entropy loss,
        accuracy metric).
    """
    backbone = arch(weights="imagenet", include_top=False, input_shape=(96, 96, 3))
    # Freeze the backbone so only the new head is trained.
    backbone.trainable = False

    pooled = GlobalAveragePooling2D()(backbone.output)

    hidden_units = hp.Int("dense_unit", min_value=256, max_value=512, step=64)
    hidden = Dense(hidden_units, activation="relu", kernel_regularizer=l2(0.01))(pooled)

    drop_prob = hp.Float("dropout_rate", min_value=0.4, max_value=0.6, step=0.1)
    regularized = Dropout(drop_prob)(hidden)

    probabilities = Dense(2, activation="softmax")(regularized)
    classifier = Model(inputs=backbone.input, outputs=probabilities)

    step_size = hp.Choice("learning_rate", values=[1e-4, 5e-4, 1e-3])
    optimizer = Adam(learning_rate=step_size)
    classifier.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
    return classifier

In [None]:
import keras_tuner as kt
from keras.api.callbacks import EarlyStopping, ModelCheckpoint


def find_best_hyperparameters(
    arch,
    X_train,
    y_train,
    X_val,
    y_val,
    epochs: int = 10,
    batch_size: int = 32,
    seed: int = 42,
):
    """Run a Hyperband search for one architecture and return the best setting.

    Tuner state is written to ``logs/<arch name>/history`` and overwritten on
    every call; a model checkpoint is written next to it.

    Args:
        arch: Backbone constructor passed on to ``build_model``; its
            ``__name__`` determines the log directory.
        X_train: Training inputs.
        y_train: Training targets.
        X_val: Validation inputs.
        y_val: Validation targets.
        epochs: Maximum number of epochs any single trial may train for.
        batch_size: Batch size used during the search.
        seed: Random seed for the tuner, so repeated searches are comparable.

    Returns:
        The best hyperparameter set found by the tuner.
    """
    arch_name = arch.__name__.lower()
    log_dir = os.path.join("logs", arch_name)
    checkpoint_path = os.path.join(log_dir, f"{arch_name}_best_model.h5")

    tuner = kt.Hyperband(
        lambda x: build_model(x, arch),
        objective="val_accuracy",
        # Hyperband chooses the per-trial epoch budget itself and overrides the
        # `epochs` fit argument, so the cap has to be supplied here; without it
        # the tuner falls back to its default of 100 epochs.
        max_epochs=epochs,
        seed=seed,
        overwrite=True,
        directory=log_dir,
        project_name="history",
    )

    # NOTE(review): the tuner appears to copy callbacks for each trial, so this
    # checkpoint file likely ends up holding the best epoch of the most recent
    # trial rather than of the whole search - confirm before relying on it.
    callbacks = [
        EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True, verbose=1),
        ModelCheckpoint(checkpoint_path, monitor="val_accuracy", save_best_only=True, verbose=1, mode="max"),
    ]

    tuner.search(
        X_train,
        y_train,
        epochs=epochs,
        validation_data=(X_val, y_val),
        batch_size=batch_size,
        callbacks=callbacks,
    )

    return tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
import keras


def tune_architectures(archs):
    """Search hyperparameters for every architecture in ``archs``.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        archs: Iterable of backbone constructors.

    Returns:
        A dict that maps each constructor's ``__name__`` to the result of
        ``find_best_hyperparameters`` for it.
    """
    return {
        arch.__name__: find_best_hyperparameters(arch, X_train, y_train, X_val, y_val)
        for arch in archs
    }


# Candidate backbones to compare; each one gets its own hyperparameter search.
architectures = [
    keras.applications.ResNet50V2,
    keras.applications.ResNet101V2,
    keras.applications.ResNet152V2,
    keras.applications.VGG16,
    keras.applications.VGG19,
    keras.applications.InceptionV3,
    keras.applications.InceptionResNetV2,
    keras.applications.Xception,
]

# Maps architecture name -> best hyperparameters found for it.
best_hps = tune_architectures(architectures)

In [None]:
# TODO: Plot the best hyperparameters for each architecture

## Model Evaluation

In [None]:
# TODO: Evaluate each model using the test set