## Preprocessing data
Run this notebook once to reproduce the `processed_data` directory, which the model notebooks later load from disk.

In [1]:
from datasets import load_dataset

# Load the "train" split of the huggan/wikiart dataset. Later cells read its
# "image" and "style" columns.
dataset = load_dataset("huggan/wikiart", split="train")

Found cached dataset parquet (/home/sh4230/.cache/huggingface/datasets/huggan___parquet/huggan--wikiart-f80281a55521ea74/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


Save mappings from numeric class to string

In [7]:
# Feature schema of the loaded dataset; the "style" entry is reused by the next cell.
features = dataset.features
# Keep a handle on the "style" feature's int2str method (the next cell calls
# the same method with integer labels to obtain the style names).
style_label_mapping = features["style"].int2str

Found cached dataset parquet (/home/sh4230/.cache/huggingface/datasets/huggan___parquet/huggan--wikiart-f80281a55521ea74/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


In [8]:
# Imported here because no earlier cell imports it; without this line a fresh
# "Restart Kernel -> Run All" fails with NameError on json.dump below.
import json

# Map every integer style id to its human-readable name.
num_to_style = {
    label: features["style"].int2str(label)
    for label in range(len(features["style"].names))
}

# NOTE: JSON object keys are always strings, so the ids are written as
# "0", "1", ...; convert them back with int() after json.load.
with open("num_to_style.json", "w") as f:
    json.dump(num_to_style, f)

Process data to 224x224 (for ResNet50 and ViT input) and save to disk

In [None]:
from torchvision import transforms

# Resize every image to 224x224, turn it into a tensor, then normalize the
# three channels with the per-channel mean / std given below
# (presumably the ImageNet statistics expected by the pretrained models -- confirm).
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

def preprocess_function(batch):
    """Transform one batch of examples for `dataset.map(..., batched=True)`.

    Args:
        batch: Mapping of column name to a list of values. Reads the "image"
            column (assumed to hold PIL images, as `.convert` is called on
            each item) and the "style" column.

    Returns:
        A dict with "image" (the list of transformed tensors, one per input
        image) and "label" (the batch's "style" values, unchanged).
    """
    # Normalize is configured for exactly 3 channels, so force RGB first:
    # a grayscale, RGBA, palette or CMYK image would otherwise raise inside
    # the transform and abort the whole map. RGB images are unaffected.
    pixels = [transform(image.convert("RGB")) for image in batch["image"]]
    return {"image": pixels, "label": batch["style"]}

processed_dataset = dataset.map(preprocess_function, batched=True, batch_size=32, num_proc=4)

In [None]:
# Persist the processed dataset to the "processed_data" directory (relative to
# the working directory) so it does not have to be recomputed.
processed_dataset.save_to_disk("processed_data")

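The following cells draw a stratified subset of 20,000 entries (so the subset keeps the class proportions of the full dataset), split it into train, validation, and test sets, and save the splits to disk.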

In [None]:
# Read the "label" column directly instead of indexing row by row:
# processed_dataset[i] materializes the whole row, including the 3x224x224
# image, just to read one integer. list() keeps `labels` a plain Python list
# regardless of the column type the installed `datasets` version returns.
labels = list(processed_dataset["label"])

In [None]:
import os

import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

# --- Stage 1: stratified sample of `subset_size` rows -----------------------
# Everything that is not sampled ends up in the discarded second half of the
# split, hence test_size = (total rows - subset_size).
subset_size = 20000
all_indices = list(range(len(processed_dataset)))
subset_indices, _ = train_test_split(
    all_indices,
    test_size=len(all_indices) - subset_size,
    stratify=labels,
    random_state=42,
)

subset = Subset(processed_dataset, subset_indices)
subset_labels = [labels[idx] for idx in subset_indices]

# --- Stage 2: 80% train / 20% held out, stratified on the style label -------
train_indices, temp_indices, train_labels, temp_labels = train_test_split(
    subset_indices,
    subset_labels,
    test_size=0.2,
    stratify=subset_labels,
    random_state=42,
)

# --- Stage 3: halve the held-out part into validation and test --------------
val_indices, test_indices, _, _ = train_test_split(
    temp_indices,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

# Wrap each index list as a view over the full processed dataset.
train_dataset, val_dataset, test_dataset = (
    Subset(processed_dataset, split_indices)
    for split_indices in (train_indices, val_indices, test_indices)
)

# --- Stage 4: write the three splits under ./data ---------------------------
# NOTE(review): torch.save pickles each Subset together with its reference to
# processed_dataset -- confirm the .pt files load as expected where they are consumed.
base_path = "./data"
os.makedirs(base_path, exist_ok=True)
for file_name, split_dataset in (
    ("train.pt", train_dataset),
    ("val.pt", val_dataset),
    ("test.pt", test_dataset),
):
    torch.save(split_dataset, os.path.join(base_path, file_name))