Generate a synthetic dataset for demonstrating LVM

Setup

```mamba create -n lvm
mamba activate lvm
mamba install ipykernel pandas
ipython kernel install --user --name=lvm

https://developer.nvidia.com/cuda-downloads

mamba install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
mamba install lightning -c conda-forge
```

In [13]:
import pandas as pd
import pdb
import pickle
import numpy as np
import os
import sys, math, copy
import time
import warnings
from typing import Tuple
from tempfile import TemporaryFile
import numpy as np
import numpy.random as npr

In [14]:
import torch
cuda_avail = torch.cuda.is_available()
print("CUDA available: " + str(cuda_avail))
device = torch.device("cuda:0" if cuda_avail else "cpu")

CUDA available: True


In [15]:
dataset_name = "synthetic1"
data_dir = "data"
data_dtype = torch.float32
randomseed = 0
npr.seed(randomseed)
num_histories = 1000

num_demo_features = 10  # demographic features - random binary
num_var_features = 10   # variable features - random binary

mean_start_time = 2010  #when the history starts for a given ID
index_time_val = num_demo_features + num_var_features #the location of the time feature

max_seq_len = 50 # the length of history, i.e. transformer context window
min_seq_len = 2  # for next token prediction, we need at least 1 step + 1 to predict
mean_seq_len = 40

token_vals = 10  # number of possible tokens, i.e. categories, aka vocab_size
index_token_val = num_demo_features + num_var_features + 1

padding_val = token_vals  # padding for both token and vector features (+1 not needed)
#torch.nan is another option, but it has no value in long(), and cannot be used in any inputs ...


In [16]:
# Generate a random tensor with the specified dimensions
random_tensor = torch.randint(0, 2, 
                              (num_histories, 
                               max_seq_len, 
                               num_demo_features + num_var_features \
                                + 1 #time
                                + 1 #token
                               ), 
                               dtype=data_dtype,
                               device=device)

In [17]:
for i in range(random_tensor.shape[0]):    
    # fix the demographic features by copying the first value over sequence length
    random_tensor[i, :, 0:num_demo_features] = random_tensor[i, 0, 0:num_demo_features].repeat(max_seq_len, 1)

In [18]:
# insert the start time by taking a random poisson value with mean mean_start_time  
random_tensor[:, 0, index_time_val] = torch.from_numpy(np.random.poisson(mean_start_time, num_histories)).to(device)

for i in range(random_tensor.shape[0]):    
    # increment the time by 1 for each subsequent time step
    random_tensor[i, :, index_time_val] = torch.arange(max_seq_len, dtype=data_dtype, device=device) + random_tensor[i, 0, index_time_val]


In [19]:
# The categorical feature (token) is a random integer between 0 and token_vals
# Toy case: category is sum the existing features weighted by lag

for j in range(random_tensor.shape[1]):
    for j2 in range(max(j-5,0), j+1):
        random_tensor[:, j, index_token_val] += (1.0/(j-j2+1.0)) * torch.sum(random_tensor[:, j2, 0:num_demo_features + num_var_features], dim=1)

In [20]:
# wishlist: resolve - test does not always pass
# print(random_tensor[10, 0, 0:num_demo_features + num_var_features].sum())
# print(random_tensor[10, 1, 0:num_demo_features + num_var_features].sum())
# print(random_tensor[10, 2, 0:num_demo_features + num_var_features].sum())
# print(random_tensor[10, 3, 0:num_demo_features + num_var_features].sum())
# print(random_tensor[10, 4, 0:num_demo_features + num_var_features].sum())

# print(random_tensor[10, 0, 0:num_demo_features + num_var_features].sum()/5 + 
#     random_tensor[10, 1, 0:num_demo_features + num_var_features].sum()/4 +
#     random_tensor[10, 2, 0:num_demo_features + num_var_features].sum()/3 +
#     random_tensor[10, 3, 0:num_demo_features + num_var_features].sum()/2 +
#     random_tensor[10, 4, 0:num_demo_features + num_var_features].sum())

# print(random_tensor[10, 4, index_token_val])

In [21]:
# compute the token by rounding the token_val and clipping it to the max token value
for i in range(random_tensor.shape[0]):   
    random_tensor[i, :, index_token_val] = torch.floor(random_tensor[i, :, index_token_val] % token_vals)

In [22]:
# censor the data series by fixing to padding_val after a poison random time according to the mean_seq_len
seq_lengths = np.max([np.random.poisson(mean_seq_len, num_histories), np.ones(num_histories)*min_seq_len], axis=0).astype(int)
seq_lengths = np.min([seq_lengths, np.ones(num_histories)*max_seq_len], axis=0).astype(int)

for i in range(random_tensor.shape[0]):   
    random_tensor[i, seq_lengths[i]:, :] = torch.full((max_seq_len - seq_lengths[i], random_tensor.shape[2]), padding_val)

In [23]:
print(seq_lengths[10])
print(random_tensor[10, 0:seq_lengths[10]+1, index_token_val])

49
tensor([ 9.,  3.,  6.,  0.,  4.,  4.,  4.,  3.,  5.,  8.,  6.,  6.,  7.,  8.,
         4.,  4.,  8.,  8.,  6.,  4.,  3.,  5.,  5.,  8.,  3.,  7.,  6.,  6.,
         4.,  2.,  4.,  6.,  3.,  4.,  2.,  6.,  4.,  4.,  3.,  3.,  1.,  3.,
         2.,  1.,  3.,  2.,  4.,  3.,  3., 10.], device='cuda:0')


In [24]:
# save the data to a file
fpath = os.path.join(data_dir, dataset_name) + f"seed={randomseed}_n={num_histories}.pkl"
with open(fpath, 'wb') as f:
    pickle.dump(random_tensor, f)
print(fpath)

data\synthetic1seed=0_n=1000.pkl
