# Data Loader & Pre-processing

In [2]:
# Importing Dependencies

import os
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import GroupShuffleSplit

In [4]:


df = pd.read_csv("/Users/sarvesh/Desktop/GitHub/lego-minifigure-finder/data/dataset.csv")   # columns: filename,label

df.head()

Unnamed: 0,filename,label
0,fig001_001.jpg,1
1,fig001_002.jpg,1
2,fig001_003.jpg,1
3,fig001_004.jpg,1
4,fig001_005.jpg,1


In [5]:
# Extract group IDs:
# - positives: group by parsed "figXXX"
# - negatives: give each image a unique group so they split randomly
fig_pat = re.compile(r"^fig(\d{3})_")

groups = []
for fname, label in zip(df["filename"], df["label"]):
    m = fig_pat.match(fname)
    if label == 1 and m:             # positive
        groups.append(int(m.group(1)))  # e.g., 3 for fig003
    else:                             # negative (funko_, lego_build_, easy_, etc.)
        # Make each negative its own group to avoid coupling them unnaturally
        # (unique per row ensures a normal random split for negatives)
        groups.append(hash(fname) & 0x7fffffff)

df["group"] = groups

In [14]:
df.head(15)

Unnamed: 0,filename,label,group
0,fig001_001.jpg,1,1
1,fig001_002.jpg,1,1
2,fig001_003.jpg,1,1
3,fig001_004.jpg,1,1
4,fig001_005.jpg,1,1
5,fig001_006.jpg,1,1
6,fig001_007.jpg,1,1
7,fig001_008.jpg,1,1
8,fig001_009.jpg,1,1
9,fig001_010.jpg,1,1


In [15]:
# 70/15/15 split using group-aware logic
gss = GroupShuffleSplit(n_splits=1, test_size=0.30, random_state=42)
train_idx, temp_idx = next(gss.split(df, groups=df["group"]))
train_df, temp_df = df.iloc[train_idx], df.iloc[temp_idx]


gss2 = GroupShuffleSplit(n_splits=1, test_size=0.50, random_state=42)
val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["group"]))
val_df, test_df = temp_df.iloc[val_idx], temp_df.iloc[test_idx]

In [18]:
from pathlib import Path

out_dir = Path("/Users/sarvesh/Desktop/GitHub/lego-minifigure-finder/data")
out_dir.mkdir(parents=True, exist_ok=True)  # no-op if it already exists

train_df[["filename", "label"]].to_csv(out_dir / "train.csv", index=False)
val_df[["filename", "label"]].to_csv(out_dir / "val.csv", index=False)
test_df[["filename", "label"]].to_csv(out_dir / "test.csv", index=False)

print("Counts:")
print("train:", train_df["label"].value_counts().to_dict())
print("val:  ", val_df["label"].value_counts().to_dict())
print("test: ", test_df["label"].value_counts().to_dict())

Counts:
train: {0: 689, 1: 265}
val:   {0: 149, 1: 37}
test:  {0: 147, 1: 69}
