In [1]:
import pandas as pd
import os
import numpy as np
import cv2
from tqdm import tqdm

In [2]:
# Load the per-clip label table and strip any leading/trailing whitespace from
# the header names so later column lookups can use the clean names.
labels_df = pd.read_csv("TrainLabels.csv").rename(columns=str.strip)
print(list(labels_df.columns))

['ClipID', 'Boredom', 'Engagement', 'Confusion', 'Frustration']


In [3]:
# Root folder that holds the per-clip frame directories read below.
# Set the DAISEE_TRAIN_DIR environment variable to point at the data on another
# machine; when it is unset the original hard-coded location is used.
DATA_DIR = os.environ.get(
    "DAISEE_TRAIN_DIR",
    r"C:\Users\Randall Chiang\Documents\FYP\DAiSEE\Data\Train",
)

def get_frames_for_clip(clip_path, max_frames=30, img_size=(64, 64), min_frames=5):
    """Load the JPEG frames of one clip into a fixed-length image stack.

    Files in ``clip_path`` whose name ends in ``.jpg`` (any letter case) are
    read in sorted filename order, resized, and stacked. Files that OpenCV
    cannot decode are skipped.

    Args:
        clip_path: Directory containing the clip's frame images.
        max_frames: Number of frames in the returned stack (expected to be
            non-negative). Longer clips are truncated to their first
            ``max_frames`` readable frames; shorter ones are padded at the end
            with all-zero frames.
        img_size: Target size handed to ``cv2.resize``, which interprets it as
            ``(width, height)``.
        min_frames: Clips with fewer readable frames than this are rejected.

    Returns:
        An array of shape ``(max_frames, img_size[1], img_size[0], channels)``
        with the dtype produced by ``cv2.imread``, or ``None`` when the clip
        has no readable frames or fewer than ``min_frames`` of them.
    """
    # Only the first `max_frames` readable frames are kept, so decoding can stop
    # as soon as enough frames exist to pass the `min_frames` check and fill the
    # stack. The floor of 1 keeps the "no frames at all" rejection intact.
    needed = max(max_frames, min_frames, 1)

    frames = []
    # NOTE(review): sorted() is lexicographic; this presumably relies on frame
    # numbers being zero-padded in the file names -- confirm against the
    # extraction script, otherwise "10.jpg" sorts before "2.jpg".
    for fname in sorted(os.listdir(clip_path)):
        if len(frames) >= needed:
            break
        if not fname.lower().endswith(".jpg"):
            continue
        img = cv2.imread(os.path.join(clip_path, fname))
        if img is None:
            # Unreadable or corrupt image: ignore it rather than fail the clip.
            continue
        frames.append(cv2.resize(img, img_size))

    if not frames or len(frames) < min_frames:
        return None

    frames = np.array(frames)[:max_frames]

    shortfall = max_frames - len(frames)
    if shortfall > 0:
        # Derive the pad shape from the loaded frames instead of img_size:
        # cv2.resize returns (height, width, channels) for a (width, height)
        # request, so building the pad from img_size breaks for non-square sizes.
        pad = np.zeros((shortfall, *frames.shape[1:]), dtype=frames.dtype)
        frames = np.concatenate([frames, pad], axis=0)

    return frames


# Cache file for the preprocessed arrays, so the slow frame-loading pass below
# only has to run once.
SAVE_PATH = "daisee_dataset.npz"

# Label columns, in the order they are stored along the last axis of y.
LABEL_COLUMNS = ["Boredom", "Engagement", "Confusion", "Frustration"]

if os.path.exists(SAVE_PATH):
    # Load the preprocessed dataset if it exists. The cached arrays are plain
    # numeric data, so pickle support stays off (a tampered cache cannot run code
    # on load), and the context manager closes the archive once both arrays are read.
    with np.load(SAVE_PATH, allow_pickle=False) as data:
        X, y = data["X"], data["y"]
    print("✅ Loaded preprocessed dataset")
else:
    # Build the dataset from the frame folders if it has not been saved yet.
    X, y = [], []
    skipped = 0

    for _, row in tqdm(labels_df.iterrows(), total=len(labels_df), desc="Processing clips"):
        # Drop whatever file extension the label carries (not only ".avi") to get
        # the clip's folder name; the first six characters name its parent folder.
        clip_id = os.path.splitext(str(row["ClipID"]).strip())[0]
        clip_path = os.path.join(DATA_DIR, clip_id[:6], clip_id)

        frames = get_frames_for_clip(clip_path) if os.path.isdir(clip_path) else None
        if frames is None:
            # Missing folder or too few readable frames: leave the clip out, but
            # count it so the loss is visible in the output.
            skipped += 1
            continue

        X.append(frames)
        y.append([row[column] for column in LABEL_COLUMNS])

    X = np.array(X)
    y = np.array(y)

    if skipped:
        print(f"Skipped {skipped} of {len(labels_df)} clips (missing folder or too few readable frames)")

    if len(X) > 0:
        # Save the dataset for later runs.
        np.savez_compressed(SAVE_PATH, X=X, y=y)
        print(f"✅ Preprocessed dataset saved to {SAVE_PATH}")
    else:
        # Writing an empty cache would make every later run load nothing.
        print(f"No clips were loaded from {DATA_DIR}; nothing saved to {SAVE_PATH}")

print("X shape:", X.shape)
print("y shape:", y.shape)

Processing clips: 100%|██████████| 5358/5358 [14:38<00:00,  6.10it/s]


✅ Preprocessed dataset saved to daisee_dataset.npz
X shape: (1941, 30, 64, 64, 3)
y shape: (1941, 4)


In [4]:
# Also write each array to its own standalone .npy file:
# the frame stacks go to X.npy and the label vectors to Y.npy.
np.save(file="X.npy", arr=X)
np.save(file="Y.npy", arr=y)