In [1]:
import numpy as np

In [2]:
# Load the supervised dataset archive; later cells read its "data" and "labels" entries.
sup_data = np.load("data/train/sup_data.npz")

In [11]:
def reshape_images(images, reshape_dims=(200, 200, 1)):
    """Reshape each image to ``reshape_dims`` and collect the results in one array.

    Args:
        images: Iterable of array-like objects exposing ``.reshape``.
        reshape_dims: Target shape applied to every individual image. The
            default appends a trailing singleton axis to a 200x200 image.

    Returns:
        ``np.ndarray`` built from the list of reshaped images.
    """
    reshaped = []
    for img in images:
        reshaped.append(img.reshape(reshape_dims))
    return np.array(reshaped)

def separate_dataset(data, labels):
    """Group samples by phase.

    The integer label codes are 0 = ferromagnet, 1 = skyrmion, 2 = spiral.

    Args:
        data: Array of samples, indexable along its first axis.
        labels: Array of integer class labels aligned with ``data``.

    Returns:
        Dict mapping each phase name to a ``(data_subset, labels_subset)``
        tuple holding only the samples of that phase. Keys are inserted in
        label order (ferromagnet, skyrmion, spiral).
    """
    # Accept plain sequences as well as arrays; boolean masking needs ndarrays.
    data = np.asarray(data)
    labels = np.asarray(labels)

    phases = ["ferromagnet", "skyrmion", "spiral"]
    separated_dataset = {}

    for label, phase in enumerate(phases):  # 0-ferromagnet, 1-skyrmion, 2-spiral
        # Build the mask from the `labels` argument, not from a notebook
        # global, so the function works for any (data, labels) pair.
        idxs = labels == label
        separated_dataset[phase] = (data[idxs], labels[idxs])

    return separated_dataset

def shuffle_dataset(data, labels, random_seed=42):
    """Shuffle the samples and their labels with the same random permutation.

    Args:
        data: Array of samples, indexable with an integer index array.
        labels: Array of labels aligned with ``data``.
        random_seed: Seed for the permutation; the same seed always yields
            the same ordering.

    Returns:
        Tuple ``(shuffled_data, shuffled_labels)``.
    """
    # A private RandomState yields the same permutation as seeding the global
    # legacy generator, but leaves the caller's global RNG state untouched.
    rng = np.random.RandomState(random_seed)
    indices = rng.permutation(len(data))
    return data[indices], labels[indices]


def split_dataset(data, labels, train_ratio, dev_ratio, random_seed=42):
    """Shuffle the samples, then cut them into train, dev and test partitions.

    Args:
        data: Array of samples.
        labels: Array of labels aligned with ``data``.
        train_ratio: Fraction of samples assigned to the train partition.
        dev_ratio: Fraction of samples assigned to the dev partition.
        random_seed: Seed forwarded to ``shuffle_dataset``.

    Returns:
        Dict with keys ``"train"``, ``"dev"`` and ``"test"``; each value is a
        dict with ``"images"`` and ``"labels"`` entries. Partition sizes are
        truncated to integers and the test partition receives whatever is
        left after train and dev.
    """
    shuffled_data, shuffled_labels = shuffle_dataset(data, labels, random_seed)

    n_samples = len(shuffled_data)
    train_end = int(n_samples * train_ratio)
    dev_end = train_end + int(n_samples * dev_ratio)

    # One slice per partition, in the order the partitions appear in the result.
    windows = {
        "train": slice(None, train_end),
        "dev": slice(train_end, dev_end),
        "test": slice(dev_end, None),
    }

    return {
        name: {"images": shuffled_data[window], "labels": shuffled_labels[window]}
        for name, window in windows.items()
    }

def train_dev_test_split(data, labels, train_ratio=0.8, dev_ratio=0.1):
    """Build train/dev/test sets by splitting every class separately.

    The dataset is first divided per class with ``separate_dataset``; each
    class is then split with ``split_dataset`` (using its default seed) and
    the per-class partitions are appended to the combined result in the order
    the classes are returned.

    Args:
        data: Array of samples.
        labels: Array of class labels aligned with ``data``.
        train_ratio: Fraction of each class assigned to train.
        dev_ratio: Fraction of each class assigned to dev.

    Returns:
        Dict with keys ``"train"``, ``"dev"`` and ``"test"``; each value is a
        dict whose ``"images"`` and ``"labels"`` entries are Python lists.
    """
    split_names = ("train", "dev", "test")
    field_names = ("images", "labels")

    merged = {
        name: {field: [] for field in field_names}
        for name in split_names
    }

    for class_data, class_labels in separate_dataset(data, labels).values():
        class_splits = split_dataset(class_data, class_labels, train_ratio, dev_ratio)
        for name in split_names:
            for field in field_names:
                target = merged[name][field]
                target.extend(class_splits[name][field])

    return merged


def save_dataset(filename, train, dev, test):
    """Write the three partitions into a single `.npz` archive.

    Args:
        filename: Target passed straight to ``np.savez``.
        train: Mapping with ``"images"`` and ``"labels"`` entries.
        dev: Mapping with ``"images"`` and ``"labels"`` entries.
        test: Mapping with ``"images"`` and ``"labels"`` entries.

    The archive holds six arrays named ``<split>_images`` and
    ``<split>_labels`` for ``train``, ``dev`` and ``test``.
    """
    arrays = {}
    for prefix, subset in (("train", train), ("dev", dev), ("test", test)):
        arrays[f"{prefix}_images"] = subset["images"]
        arrays[f"{prefix}_labels"] = subset["labels"]

    np.savez(filename, **arrays)

In [17]:
# NOTE(review): the original call was `preprocess_images`, which is not defined
# anywhere in this notebook and fails on a fresh kernel. `reshape_images` is the
# only matching helper — confirm no extra preprocessing step was intended.
data = reshape_images(sup_data["data"])
labels = sup_data["labels"]

# Pick the partitions by key rather than relying on dict value order.
splits = train_dev_test_split(data, labels, train_ratio=0.8, dev_ratio=0.1)
train, dev, test = splits["train"], splits["dev"], splits["test"]

In [None]:


# NOTE(review): the original call was `preprocess_images`, which is not defined
# anywhere in this notebook; `reshape_images` is the only matching helper —
# confirm no extra preprocessing step was intended.
data = reshape_images(sup_data["data"])
labels = sup_data["labels"]

# train_dev_test_split returns a dict keyed by "train"/"dev"/"test", each
# holding "images" and "labels"; unpacking it into six names raises ValueError.
splits = train_dev_test_split(data, labels, train_ratio=0.8, dev_ratio=0.1)
x_train, y_train = splits["train"]["images"], splits["train"]["labels"]
x_dev, y_dev = splits["dev"]["images"], splits["dev"]["labels"]
x_test, y_test = splits["test"]["images"], splits["test"]["labels"]

# Save to file. save_dataset indexes each argument by "images"/"labels",
# so it must receive the per-split dicts, not (x, y) tuples.
save_dataset(
    'data/train/skyrmion_dataset2',
    splits["train"], splits["dev"], splits["test"]
)


In [29]:
# Inspect the shape of the first raw sample stored under "data" in the archive.
sup_data["data"][0].shape

(200, 200)