In [1]:
import numpy as np

In [2]:
# Open the supervised dataset archive. Later cells read its "data" and
# "labels" entries by key.
sup_data = np.load("data/train/sup_data.npz")

In [18]:
def preprocess_images(data, scale=255, reshape_dims=(200, 200, 1)):
    """Multiply every image by ``scale`` and reshape each one to ``reshape_dims``.

    Args:
        data: Array of images, iterated along its first axis. Each image must
            contain exactly as many values as ``reshape_dims`` describes.
        scale: Factor applied to all pixel values (default 255).
        reshape_dims: Target shape of a single image (default ``(200, 200, 1)``).

    Returns:
        np.ndarray: A new array holding one scaled, reshaped image per entry.
    """
    scaled = data * scale

    reshaped_images = []
    for single_image in scaled:
        reshaped_images.append(single_image.reshape(reshape_dims))

    return np.array(reshaped_images)

def separate_dataset(data, labels):
    """Split a labelled dataset into one sub-dataset per phase.

    The integer labels are mapped to phases by position:
    0 -> "fe" (ferromagnet), 1 -> "sk" (skyrmion), 2 -> "sp" (spiral).

    Args:
        data: Array of samples, indexed along its first axis.
        labels: 1-D array of integer labels, one per sample in ``data``.

    Returns:
        dict: Maps each phase name ("fe", "sk", "sp") to a tuple
        ``(phase_data, phase_labels)`` containing only the samples whose
        label corresponds to that phase. A phase with no samples maps to
        empty arrays.
    """
    phases = ("fe", "sk", "sp")

    # Work on the arguments that were passed in (not on any notebook-level
    # global), and make sure boolean-mask indexing is available.
    data = np.asarray(data)
    labels = np.asarray(labels)

    separated_dataset = {}
    for label, phase in enumerate(phases):  # 0-ferromagnet, 1-skyrmion, 2-spiral
        phase_mask = labels == label
        separated_dataset[phase] = (data[phase_mask], labels[phase_mask])

    return separated_dataset


def shuffle_and_split(data, labels, train_ratio, dev_ratio, random_seed=42):
    """Shuffle samples and labels together, then cut them into train/dev/test.

    Note that this seeds NumPy's *global* random state with ``random_seed``
    before drawing the permutation.

    Args:
        data: Array of samples, indexed along its first axis.
        labels: Array of labels aligned with ``data``.
        train_ratio: Fraction of the samples placed in the train set.
        dev_ratio: Fraction of the samples placed in the dev set; everything
            left after train and dev goes to the test set.
        random_seed: Seed passed to ``np.random.seed`` (default 42).

    Returns:
        tuple: ``((train_data, train_labels), (dev_data, dev_labels),
        (test_data, test_labels))``.
    """
    np.random.seed(random_seed)
    n_samples = len(data)
    order = np.random.permutation(n_samples)
    shuffled_data = data[order]
    shuffled_labels = labels[order]

    # Set sizes are truncated towards zero, so rounding leftovers end up in test.
    train_end = int(n_samples * train_ratio)
    dev_end = train_end + int(n_samples * dev_ratio)

    windows = (
        slice(None, train_end),
        slice(train_end, dev_end),
        slice(dev_end, None),
    )
    train_set, dev_set, test_set = [
        (shuffled_data[window], shuffled_labels[window]) for window in windows
    ]
    return train_set, dev_set, test_set

def train_dev_test_split(data, labels, train_ratio=0.8, dev_ratio=0.1):
    """Split a labelled dataset into train/dev/test sets, phase by phase.

    The dataset is first separated by phase with ``separate_dataset``; each
    phase is shuffled and split with ``shuffle_and_split`` using the same
    ratios, and the per-phase pieces are then concatenated and shuffled so
    the phases are interleaved within every set.

    Args:
        data: Array of samples, indexed along its first axis.
        labels: Array of integer phase labels aligned with ``data``.
        train_ratio: Fraction of each phase placed in the train set.
        dev_ratio: Fraction of each phase placed in the dev set; the rest of
            each phase goes to the test set.

    Returns:
        tuple: ``((train_data, train_labels), (dev_data, dev_labels),
        (test_data, test_labels))`` - the same layout that
        ``shuffle_and_split`` returns and ``save_dataset`` accepts.
    """
    # One bucket per output set (train, dev, test); each collects the
    # (data, labels) piece contributed by every phase.
    buckets = ([], [], [])
    for phase_data, phase_labels in separate_dataset(data, labels).values():
        phase_splits = shuffle_and_split(phase_data, phase_labels, train_ratio, dev_ratio)
        for bucket, piece in zip(buckets, phase_splits):
            bucket.append(piece)

    combined = []
    for bucket in buckets:
        set_data = np.concatenate([piece_data for piece_data, _ in bucket])
        set_labels = np.concatenate([piece_labels for _, piece_labels in bucket])
        # Without this shuffle each set would be ordered phase by phase.
        # shuffle_and_split seeds NumPy's global RNG, so the order drawn
        # here is reproducible from run to run.
        order = np.random.permutation(len(set_data))
        combined.append((set_data[order], set_labels[order]))

    return tuple(combined)


def save_dataset(filename, train_data, dev_data, test_data):
    """Write the train, dev and test sets to a single `.npz` archive.

    Args:
        filename: Destination handed to ``np.savez``.
        train_data: Pair whose element 0 is the train images and element 1
            the train labels.
        dev_data: Same layout as ``train_data``, for the dev set.
        test_data: Same layout as ``train_data``, for the test set.

    The archive entries are named ``train_images``, ``train_labels``,
    ``dev_images``, ``dev_labels``, ``test_images`` and ``test_labels``.
    """
    archive_entries = {
        "train_images": train_data[0],
        "train_labels": train_data[1],
        "dev_images": dev_data[0],
        "dev_labels": dev_data[1],
        "test_images": test_data[0],
        "test_labels": test_data[1],
    }
    np.savez(filename, **archive_entries)

In [None]:


# Split fractions applied to every phase; whatever is left after train and dev
# becomes the test set. Defined here so the cell runs on a fresh kernel.
train_ratio = 0.8
dev_ratio = 0.1

# Keep the raw arrays at this point: scaling and reshaping is done exactly
# once, right before saving, so the images are never scaled twice.
data = sup_data["data"]
labels = sup_data["labels"]

separated_dataset = separate_dataset(data, labels)

# Shuffle and split each phase on its own, so every phase contributes the same
# proportions to train, dev and test.
phase_splits = [
    shuffle_and_split(*separated_dataset[phase], train_ratio, dev_ratio)
    for phase in ("fe", "sk", "sp")
]
train_parts, dev_parts, test_parts = zip(*phase_splits)

# Combine train, dev, and test sets across phases
train_images = np.vstack([part_images for part_images, _ in train_parts])
train_labels = np.hstack([part_labels for _, part_labels in train_parts])

dev_images = np.vstack([part_images for part_images, _ in dev_parts])
dev_labels = np.hstack([part_labels for _, part_labels in dev_parts])

test_images = np.vstack([part_images for part_images, _ in test_parts])
test_labels = np.hstack([part_labels for _, part_labels in test_parts])

# Shuffle the combined sets so the phases are interleaved. shuffle_and_split
# seeded NumPy's global RNG above, so these permutations are reproducible.
train_indices = np.random.permutation(len(train_images))
dev_indices = np.random.permutation(len(dev_images))
test_indices = np.random.permutation(len(test_images))

train_images, train_labels = train_images[train_indices], train_labels[train_indices]
dev_images, dev_labels = dev_images[dev_indices], dev_labels[dev_indices]
test_images, test_labels = test_images[test_indices], test_labels[test_indices]

# Sanity check: images and labels must still be aligned one-to-one.
assert len(train_images) == len(train_labels)
assert len(dev_images) == len(dev_labels)
assert len(test_images) == len(test_labels)

# Preprocess images (scale and reshape) - the only place this happens
train_images = preprocess_images(train_images)
dev_images = preprocess_images(dev_images)
test_images = preprocess_images(test_images)

# Save to file
save_dataset(
    'data/train/skyrmion_dataset2',
    (train_images, train_labels),
    (dev_images, dev_labels),
    (test_images, test_labels)
)
