In [1]:
import numpy as np
import scipy.io as sio
import pickle
import os

In [2]:
# Read the Schools dataset from its MATLAB file and unpack the per-task arrays.
schools_name = "data/school/schoolData.mat"
schools_dataset = sio.loadmat(schools_name)
print(schools_dataset.keys())

# np.squeeze drops the singleton dimensions of the loaded 'X' and 'Y' entries,
# leaving one entry per task (shapes are printed below for inspection).
X_schools = np.squeeze(schools_dataset['X'])
Y_schools = np.squeeze(schools_dataset['Y'])
for per_task in (X_schools, Y_schools):
    print(per_task.shape)
    print(per_task[0].shape)

# T: number of tasks; d1: first dimension of a single task's input array.
T = X_schools.shape[0]
d1 = X_schools[0].shape[0]
print("T = {}".format(T))
print("d1 = {}".format(d1))

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Y'])
(139,)
(26, 200)
(139,)
(200, 1)
T = 139
d1 = 26


In [3]:
# Pick one meta-test task per target size (20, 40, ..., 200 samples).
# A task qualifies for size k when its sample count lies in [k, k + 10);
# the first qualifying task (lowest index) is the one that is kept.
samples_per_task = {}
for i in range(T):
    n_samples = X_schools[i].shape[1]
    for k in range(20, 220, 20):
        if k <= n_samples < k + 10:
            # setdefault leaves an already-recorded task for this size untouched
            samples_per_task.setdefault(k, i)

# Key: number of samples for test set
# Value: index of the task that contains (roughly) that number of samples
samples_per_task = dict(sorted(samples_per_task.items()))

In [4]:
def train_test_split(task, X_schools, Y_schools):
    """Hold out one task as the test set and keep the others for training.

    Args:
        task: index of the entry to hold out.
        X_schools: array of per-task inputs.
        Y_schools: array of per-task labels, aligned with ``X_schools``.

    Returns:
        Tuple ``(train_tasks, test_task, train_labels, test_labels)`` where
        the train arrays are the inputs with entry ``task`` removed via
        ``np.delete`` and the test values are the removed entries.
    """
    held_out_inputs, held_out_labels = X_schools[task], Y_schools[task]
    remaining_inputs = np.delete(X_schools, task)
    remaining_labels = np.delete(Y_schools, task)
    return remaining_inputs, held_out_inputs, remaining_labels, held_out_labels

def transform_Y(X, d2=1000, sigma=100, rng=None):
    """Average random cosine features over the rows of ``X``.

    Each row ``x`` is mapped to ``sqrt(2 / d2) * cos(U @ x + v)`` where ``U``
    has i.i.d. normal entries (mean 0, standard deviation ``sigma``) and ``v``
    is uniform on ``[0, 2*pi)``; the result is the mean of these feature
    vectors over all rows.

    Args:
        X: 2-D array with one sample per row.
        d2: number of random features (length of the returned vector).
        sigma: standard deviation of the entries of ``U``.
        rng: source of randomness. ``None`` (default) draws from NumPy's
            global ``np.random`` state, exactly as before, which means ``U``
            and ``v`` are redrawn on every call and two calls use different
            feature maps unless the caller reseeds in between. Pass an
            integer seed to make every call with that seed use the same
            ``U`` and ``v``, or pass an existing ``RandomState``/``Generator``
            to draw from it.

    Returns:
        1-D float array of length ``d2``.

    Raises:
        ValueError: if ``X`` is not 2-D or has no rows (the mean over zero
            samples is undefined and previously produced NaNs silently).
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[0] == 0:
        raise ValueError(
            "X must be a non-empty 2-D array of samples, got shape {}".format(X.shape)
        )

    if rng is None:
        sampler = np.random
    elif hasattr(rng, "uniform") and hasattr(rng, "normal"):
        sampler = rng
    else:
        sampler = np.random.RandomState(rng)

    # Draw order (v first, then U) matches the original implementation so that
    # results under np.random.seed(...) are unchanged.
    v = sampler.uniform(0, 2 * np.pi, size=d2)
    U = sampler.normal(0, sigma, size=(d2, X.shape[1]))

    # One matrix product replaces the per-sample Python loop:
    # row i of (X @ U.T) equals U @ X[i].
    phi = np.sqrt(2 / d2) * np.cos(X @ U.T + v)
    return phi.mean(axis=0)

def generate_X_Y(task):
    """Return the transposed task array together with its ``transform_Y`` output.

    Args:
        task: array for a single task; it is transposed before use.

    Returns:
        Tuple ``(X, Y)`` with ``X = np.transpose(task)`` and
        ``Y = transform_Y(X)``.
    """
    samples = np.transpose(task)
    return samples, transform_Y(samples)

In [5]:
# Create a training set by separating out the meta-test set and keeping the rest.
# For each meta-test size n_t a directory "N2_<n_t>/" is written containing the
# held-out task (X0, Y0, R0) and the pooled training data (X, Y, R, task_function).
parent_dir = "data/school/"
N = 2000    # stop pooling training tasks once at least N samples are collected
for n_t in samples_per_task:
    train_tasks, test_task, train_labels, test_labels = train_test_split(samples_per_task[n_t], X_schools, Y_schools)

    # Get the meta-test task and save it
    #TODO: Y should be T x d2, NOT N x d2
    X0, Y0 = generate_X_Y(test_task)
    R0 = test_labels
    path = os.path.join(parent_dir, "N2_{}/".format(n_t))
    # makedirs(exist_ok=True) tolerates an existing directory, creates missing
    # parents, and lets genuine failures (e.g. permissions) surface here instead
    # of being printed and then failing later on open().
    os.makedirs(path, exist_ok=True)
    for filename, payload in (("X0", X0), ("Y0", Y0), ("R0", R0)):
        # The context manager closes (and flushes) each file deterministically.
        with open(path + filename, "wb") as handle:
            pickle.dump(payload, handle)

    # Get the rest of the training data and save all of it.
    d1 = test_task.shape[0]
    d2 = Y0.shape[0]    # feature length as actually produced by generate_X_Y

    # Collect per-task pieces in lists and stack once at the end. The leading
    # zero-row arrays fix the column count and float dtype, so the stacked
    # result is well defined even when no training task is processed.
    X_blocks = [np.empty((0, d1))]
    Y_rows = [np.empty((0, d2))]
    R_blocks = [np.empty((0, 1))]

    # For each remaining task (138, in this case)
    task_function = []
    index_total = 0
    for i in range(train_tasks.shape[0]):
        X, Y = generate_X_Y(train_tasks[i])
        R = train_labels[i]
        X_blocks.append(X)
        Y_rows.append(Y)
        R_blocks.append(R)

        # Record the task mapping for each sample in this task
        task_function.extend([i] * X.shape[0])
        index_total += X.shape[0]

        # Same stopping point as before: the task that brings the running
        # sample count to N or more is the last one included.
        if index_total >= N:
            break

    task_function = np.asarray(task_function)   # (samples,) task index per row of X_full
    R_full = np.vstack(R_blocks)    # samples x 1
    X_full = np.vstack(X_blocks)    # samples x d1
    Y_full = np.vstack(Y_rows)      # tasks x d2 (one row per processed task)
    for filename, payload in (
        ("X.pkl", X_full),
        ("Y.pkl", Y_full),
        ("R.pkl", R_full),
        ("task_function.pkl", task_function),
    ):
        with open(path + filename, "wb") as handle:
            pickle.dump(payload, handle)

    # NOTE(review): this break means only the first (smallest) meta-test size is
    # ever processed; it looks like a debugging leftover -- confirm before removing.
    break