## Create the simulation data and split it into test, val, train.

In [1]:
import pandas as pd
import numpy as np
import torch
print("PyTorch Version: ",torch.__version__)
from torch.utils.data import TensorDataset

PyTorch Version:  1.11.0


  from .autonotebook import tqdm as notebook_tqdm


This notebook shows how to simulate 20 modalities as described by the paper. Samples with the first classification label are built from 20 random values that add up to 1 (a single draw from a Dirichlet distribution). Samples with the second classification label are built from 20 random values that are each drawn uniformly below 0.15. This is done so that, without inspecting all 20 values together, it is difficult to tell which label a single value belongs to: for example, 0.14 is less than 0.15, but it could equally be one of a set of values that add up to 1. Each value is then turned into a vector by sampling 20 numbers uniformly within one standard deviation (computed over that sample's 20 values) of the chosen value, so that each modality is a vector rather than a single number.

In [None]:
# Simulation settings. Each simulated draw yields one row per class:
#   index 0 -> 20 centre values that sum to 1 (Dirichlet draw)
#   index 1 -> 20 centre values drawn uniformly from [0, 0.15)
# The 0/1 row index is kept on purpose: the following cell turns it into the label.
N_DRAWS = 1000        # rows generated per class
N_MODALITIES = 20     # columns (one per modality)
VECTOR_LENGTH = 20    # length of the vector stored in every cell
SEED = 0              # fixed seed so Restart & Run All reproduces the same data

rng = np.random.default_rng(SEED)

rows = []
row_index = []
for _ in range(N_DRAWS):
    class_centres = (
        rng.dirichlet(np.ones(N_MODALITIES)),   # class 0: values add up to 1
        rng.uniform(0, 0.15, N_MODALITIES),     # class 1: every value below 0.15
    )
    for class_id, centres in enumerate(class_centres):
        # Each centre is expanded into a vector sampled uniformly within one
        # standard deviation (taken over this row's centres) of the centre.
        spread = centres.std()
        rows.append(
            [rng.uniform(c - spread, c + spread, VECTOR_LENGTH) for c in centres]
        )
        row_index.append(class_id)

# Build the frame once instead of appending inside the loop: DataFrame.append
# was removed in pandas 2.0 and copies the whole frame on every call.
df = pd.DataFrame(rows, index=row_index, columns=list(range(N_MODALITIES)))

In [3]:
# Turn the row index (0 or 1, set when the rows were generated) into a trailing
# "label" column and leave the frame with a fresh 0..n-1 index.
relabelled = df.reset_index()
relabelled["label"] = relabelled["index"]
df = relabelled.drop(columns="index")

# Names of all columns other than the label.
cols = list(set(df.columns).difference({"label"}))

In [12]:
# Hold out 200 rows for testing and 200 more for validation; everything else
# is training data. A fixed seed keeps the split reproducible across runs.
SPLIT_SEED = 0

# The drop-by-index steps below remove *every* row sharing a sampled index
# label, so fail fast if the index has not been reset to unique values yet.
assert df.index.is_unique, "df must have a unique index before splitting"

test = df.sample(n=200, random_state=SPLIT_SEED)
train = df.drop(test.index)
val = train.sample(n=200, random_state=SPLIT_SEED)
train = train.drop(val.index)

# The three splits must partition the full data set.
assert len(train) + len(val) + len(test) == len(df)

In [16]:
# For each split, separate the "label" target column from the remaining
# (modality) feature columns.
X_train, y_train = train.drop(columns="label"), train["label"]
X_val, y_val = val.drop(columns="label"), val["label"]
X_test, y_test = test.drop(columns="label"), test["label"]

In [133]:
path = ".../simulation_data_vectors/"


def _modality_dataset(features, labels, modality):
    """Wrap one modality column and the labels of a split in a TensorDataset.

    The per-row vectors stored in column `modality` are stacked into a single
    2-D array; the labels are cast to int before both are converted with
    torch.Tensor.
    """
    vectors = np.array(list(features[modality].values))
    targets = labels.astype(int).values
    return TensorDataset(torch.Tensor(vectors), torch.Tensor(targets))


# Write one file per (split, modality) pair: <split>_modality_<i>_inputs.pt
for i in range(20):
    train_inputs = _modality_dataset(X_train, y_train, i)
    val_inputs = _modality_dataset(X_val, y_val, i)
    test_inputs = _modality_dataset(X_test, y_test, i)

    named_datasets = (
        ("train", train_inputs),
        ("val", val_inputs),
        ("test", test_inputs),
    )
    for split_name, dataset in named_datasets:
        torch.save(dataset, f"{path}{split_name}_modality_{i}_inputs.pt")

In [115]:
# Persist the raw feature frames and label series of every split as pickles
# next to the tensor files, one file per object named after the variable.
split_objects = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
    "X_val": X_val,
    "y_val": y_val,
}
for object_name, split_object in split_objects.items():
    split_object.to_pickle(path + "/" + object_name + ".pkl")