In [1]:
from pypots.data.generating import gene_physionet2012
from pypots.utils.random import set_random_seed
from pypots.optim import Adam
from pypots.imputation import GPVAE
from pypots.utils.metrics import cal_mae



In [2]:
set_random_seed()

# Load the PhysioNet-2012 dataset
physionet2012_dataset = gene_physionet2012(artificially_missing_rate=0.1)


# Assemble the datasets for training, validating, and testing.

dataset_for_training = {
    "X": physionet2012_dataset['train_X'],
}

dataset_for_validating = {
    "X": physionet2012_dataset['val_X'],
    "X_intact": physionet2012_dataset['val_X_intact'],
    "indicating_mask": physionet2012_dataset['val_X_indicating_mask'],
}

dataset_for_testing = {
    "X": physionet2012_dataset['test_X'],
}


2023-10-25 16:59:18 [INFO]: Have set the random seed as 2204 for numpy and pytorch.
2023-10-25 16:59:18 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Database)...
2023-10-25 16:59:18 [INFO]: Starting preprocessing physionet_2012...
2023-10-25 16:59:18 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2023-10-25 16:59:18 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2023-10-25 16:59:18 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2023-10-25 16:59:18 [INFO]: Loaded successfully!


In [8]:
dataset_for_testing["X"].shape

(2398, 48, 37)

In [3]:
# initialize the model
gp_vae = GPVAE(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    latent_size=37,
    encoder_sizes=(128,128),
    decoder_sizes=(256,256),
    kernel="cauchy",
    beta=0.2,
    M=1,
    K=1,
    sigma=1.005,
    length_scale=7.0,
    kernel_scales=1,
    window_size=24,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=100,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default, PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices.
    device='cuda:0',
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/gp_vae",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)



2023-10-23 16:54:37 [INFO]: Model files will be saved to tutorial_results/imputation/gp_vae\20231023_T165437
2023-10-23 16:54:37 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/gp_vae\20231023_T165437\tensorboard
2023-10-23 16:54:37 [INFO]: Model initialized successfully with the number of trainable parameters: 229,652


In [4]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
gp_vae.fit(train_set=dataset_for_training, val_set=dataset_for_validating)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
gp_vae_imputation = gp_vae.impute(dataset_for_testing)

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = cal_mae(gp_vae_imputation,
                      physionet2012_dataset['test_X_intact'], physionet2012_dataset['test_X_indicating_mask'])
print("Testing mean absolute error: %.4f" % testing_mae)

2023-10-23 16:54:47 [INFO]: epoch 0: training loss 26157.3701, validating loss 0.5261
2023-10-23 16:54:58 [INFO]: epoch 1: training loss 22874.5839, validating loss 0.5207
2023-10-23 16:55:12 [INFO]: epoch 2: training loss 22839.3164, validating loss 0.5146
2023-10-23 16:55:26 [INFO]: epoch 3: training loss 22834.6367, validating loss 0.5134
2023-10-23 16:55:42 [INFO]: epoch 4: training loss 22826.5340, validating loss 0.5045
2023-10-23 16:55:56 [INFO]: epoch 5: training loss 22825.2355, validating loss 0.4954
2023-10-23 16:56:08 [INFO]: epoch 6: training loss 22823.5394, validating loss 0.4738
2023-10-23 16:56:14 [INFO]: epoch 7: training loss 22816.4922, validating loss 0.5268
2023-10-23 16:56:20 [INFO]: epoch 8: training loss 22819.4550, validating loss 0.4629
2023-10-23 16:56:32 [INFO]: epoch 9: training loss 22809.6713, validating loss 0.4827
2023-10-23 16:56:45 [INFO]: epoch 10: training loss 22807.3872, validating loss 0.4501
2023-10-23 16:56:59 [INFO]: epoch 11: training loss 2

Testing mean absolute error: 0.4236


In [5]:
from pypots.optim import Adam
from pypots.imputation import SAITS
from pypots.utils.metrics import cal_mae

# initialize the model
saits = SAITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=2,
    d_model=256,
    d_inner=128,
    n_heads=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    attn_dropout=0.1,
    diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
    ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
    # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
    MIT_weight=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=100,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=None,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # Set it to None to use the default device (will use CPU if you don't have CUDA devices).
    # You can also set it to 'cpu' or 'cuda' explicitly, or ['cuda:0', 'cuda:1'] if you have multiple CUDA devices.
    device=None,
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/saits",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)



2023-10-23 17:00:47 [INFO]: No given device, using default device: cuda
2023-10-23 17:00:47 [INFO]: Model files will be saved to tutorial_results/imputation/saits\20231023_T170047
2023-10-23 17:00:47 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/saits\20231023_T170047\tensorboard
2023-10-23 17:00:47 [INFO]: Model initialized successfully with the number of trainable parameters: 1,378,358
2023-10-23 17:00:52 [INFO]: epoch 0: training loss 0.7302, validating loss 0.3248
2023-10-23 17:00:58 [INFO]: epoch 1: training loss 0.5149, validating loss 0.3060
2023-10-23 17:01:03 [INFO]: epoch 2: training loss 0.4640, validating loss 0.2829
2023-10-23 17:01:08 [INFO]: epoch 3: training loss 0.4242, validating loss 0.2663
2023-10-23 17:01:14 [INFO]: epoch 4: training loss 0.3966, validating loss 0.2558
2023-10-23 17:01:19 [INFO]: epoch 5: training loss 0.3760, validating loss 0.2457
2023-10-23 17:01:24 [INFO]: epoch 6: training loss 0.3609, validating loss 0.2419
2023-10-23 

Testing mean absolute error: 0.2115


In [None]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
saits.fit(train_set=dataset_for_training, val_set=dataset_for_validating)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
saits_imputation = saits.impute(dataset_for_testing)

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = cal_mae(
    saits_imputation, physionet2012_dataset['test_X_intact'], physionet2012_dataset['test_X_indicating_mask'])
print("Testing mean absolute error: %.4f" % testing_mae)


In [15]:
from pypots.imputation import CSDI

csdi = CSDI(n_layers=physionet2012_dataset['n_steps'],
            n_features=physionet2012_dataset['n_features'],
            d_time_embedding=1,
            d_feature_embedding=1,
            target_strategy = "mix",
            d_diffusion_embedding = 50,
            n_diffusion_steps= 128,
            n_heads = 8,
            n_channels = 8,
            schedule = "quad",
            beta_start = 0.0001,
            beta_end = 0.5,
            is_unconditional = True,
            batch_size=32,
            # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
            epochs=100,
            device="cuda:0",
            # set the path for saving tensorboard and trained model files
            saving_path="tutorial_results/imputation/saits",
            # only save the best model after training finished.
            # You can also set it as "better" to save models performing better ever during training.
            model_saving_strategy="best",
)


2023-10-23 17:27:48 [INFO]: Model files will be saved to tutorial_results/imputation/saits\20231023_T172748
2023-10-23 17:27:48 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/saits\20231023_T172748\tensorboard
2023-10-23 17:27:48 [INFO]: Model initialized successfully with the number of trainable parameters: 176,882


In [13]:

# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
csdi.fit(train_set=dataset_for_training, val_set=dataset_for_validating)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
csdi_imputation = csdi.impute(dataset_for_testing)

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = cal_mae(
    csdi_imputation, physionet2012_dataset['test_X_intact'], physionet2012_dataset['test_X_indicating_mask'])
print("Testing mean absolute error: %.4f" % testing_mae)


2023-10-23 17:16:33 [ERROR]: Exception: The size of tensor a (48) must match the size of tensor b (37) at non-singleton dimension 1


RuntimeError: Training got interrupted. Model was not trained. Please investigate the error printed above.

In [14]:
from pypots.optim import Adam
from pypots.imputation import USGAN
from pypots.utils.metrics import cal_mae

# initialize the model
us_gan = USGAN(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    rnn_hidden_size=256,
    lambda_mse=1,
    dropout_rate=0.1,
    G_steps=1,
    D_steps=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=100,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    G_optimizer=Adam(lr=1e-3),
    D_optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default, PyPOTS will automatically assign the best device for you.
    # Set it to 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices.
    device='cuda:0',
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/us_gan",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)

# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
us_gan.fit(train_set=dataset_for_training, val_set=dataset_for_validating)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
us_gan_imputation = us_gan.impute(dataset_for_testing)

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = cal_mae(us_gan_imputation,
                      physionet2012_dataset['test_X_intact'], physionet2012_dataset['test_X_indicating_mask'])
print("Testing mean absolute error: %.4f" % testing_mae)


2023-10-23 17:19:40 [INFO]: Model files will be saved to tutorial_results/imputation/us_gan\20231023_T171940
2023-10-23 17:19:40 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/us_gan\20231023_T171940\tensorboard
2023-10-23 17:19:40 [INFO]: Model initialized successfully with the number of trainable parameters: 1,258,517
2023-10-23 17:21:21 [INFO]: epoch 0: training loss_generator 4.0457, train loss_discriminator 0.1884
2023-10-23 17:23:06 [INFO]: epoch 1: training loss_generator 4.8104, train loss_discriminator 0.1214
2023-10-23 17:24:33 [INFO]: epoch 2: training loss_generator 5.2982, train loss_discriminator 0.0933
2023-10-23 17:27:23 [INFO]: epoch 3: training loss_generator 5.6700, train loss_discriminator 0.0778
2023-10-23 17:27:23 [INFO]: Exceeded the training patience. Terminating the training procedure...
2023-10-23 17:27:23 [INFO]: Finished training.
2023-10-23 17:27:23 [INFO]: Saved the model to tutorial_results/imputation/us_gan\20231023_T171940\USGAN.p

Testing mean absolute error: 0.2691


In [1]:
import pandas as pd

# Sample DataFrame
data = {'person_id': [1, 1, 2, 2, 2, 3],
        'value': ['A', 'B', 'C', 'D', 'E', 'F']}
df = pd.DataFrame(data)

# Step 1: Identify the maximum number of rows for a person
max_rows = df.groupby('person_id').size().max()

# Step 2 and 3: Create a new DataFrame with the maximum number of rows for each person
max_rows_df = (df.groupby('person_id', group_keys=False)
               .apply(lambda group: group.head(max_rows))
               .reset_index(drop=True))

# Display the result
print(max_rows_df)

   person_id value
0          1     A
1          1     B
2          2     C
3          2     D
4          2     E
5          3     F


In [3]:
import pandas as pd

# Sample DataFrame
data = {'person_id': [1, 1, 2, 2, 2, 3],
        'value': ['A', 'B', 'C', 'D', 'E', 'F']}
df = pd.DataFrame(data)

# Step 1: Identify the maximum number of rows for a person
max_rows = df.groupby('person_id').size().max()

# Step 2: Create a new DataFrame with the maximum number of rows for each person
max_rows_df = pd.DataFrame(columns=df.columns)

# Step 3: Copy data from the original DataFrame to the new DataFrame
for person_id, group in df.groupby('person_id'):
    rows_to_copy = min(len(group), max_rows)
    max_rows_df = pd.concat([max_rows_df, group.head(rows_to_copy)])

# Display the result
print(max_rows_df)

  person_id value
0         1     A
1         1     B
2         2     C
3         2     D
4         2     E
5         3     F


In [4]:
import pandas as pd

# Sample DataFrame
data = {'person_id': [1, 1, 2, 2, 2, 3],
        'value': ['A', 'B', 'C', 'D', 'E', 'F']}
df = pd.DataFrame(data)

# Step 1: Identify the maximum number of rows for a person
max_rows = df['person_id'].value_counts().max()

# Step 2 and 3: Create a new DataFrame with the maximum number of rows for each person
max_rows_df = (df.groupby('person_id')
               .apply(lambda group: group.head(max_rows))
               .reset_index(drop=True))

# Display the result
print(max_rows_df)

   person_id value
0          1     A
1          1     B
2          2     C
3          2     D
4          2     E
5          3     F


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda group: group.head(max_rows))


In [5]:
import pandas as pd

# Sample DataFrame
data = {'person_id': [1, 1, 2, 2, 2, 3],
        'value': ['A', 'B', 'C', 'D', 'E', 'F']}
df = pd.DataFrame(data)

# Identify the maximum number of rows for a person
max_rows = df['person_id'].value_counts().max()

# Create a new DataFrame with the maximum number of rows for each person
max_rows_df = pd.DataFrame()

# Iterate over each person_id and append rows until reaching the maximum
for person_id, group in df.groupby('person_id'):
    num_rows = len(group)
    if num_rows < max_rows:
        diff = max_rows - num_rows
        rows_to_add = group.iloc[:diff].copy()
        rows_to_add['value'] = None  # You can fill this with your desired value
        max_rows_df = pd.concat([max_rows_df, group, rows_to_add], ignore_index=True)
    else:
        max_rows_df = pd.concat([max_rows_df, group], ignore_index=True)

# Display the result
print(max_rows_df)

   person_id value
0          1     A
1          1     B
2          1  None
3          2     C
4          2     D
5          2     E
6          3     F
7          3  None


In [11]:
# m = len(max(dfs, key=len))
values= df.reindex(range(10), fill_value=0)

In [12]:
values

Unnamed: 0,person_id,value
0,1,A
1,1,B
2,2,C
3,2,D
4,2,E
5,3,F
6,0,0
7,0,0
8,0,0
9,0,0


In [None]:

np.reshape(train_dataset.amputated_values.values, (-1, train_dataset.maxlen, train_dataset.features_df.shape[1]))

In [15]:
import numpy as np

In [21]:
# Creating an array with 12 elements
arr = np.arange(2016)

# Reshaping the array to a 2x6 matrix
reshaped_arr = arr.reshape(-1, 168,6)

print(reshaped_arr)

[[[   0    1    2    3    4    5]
  [   6    7    8    9   10   11]
  [  12   13   14   15   16   17]
  ...
  [ 990  991  992  993  994  995]
  [ 996  997  998  999 1000 1001]
  [1002 1003 1004 1005 1006 1007]]

 [[1008 1009 1010 1011 1012 1013]
  [1014 1015 1016 1017 1018 1019]
  [1020 1021 1022 1023 1024 1025]
  ...
  [1998 1999 2000 2001 2002 2003]
  [2004 2005 2006 2007 2008 2009]
  [2010 2011 2012 2013 2014 2015]]]


In [22]:
df = pd.DataFrame({"id": [100, 200, 200, 300, 300, 300], "val1": [1.5, 2.5, 4.5, np.nan, 6.5, np.nan], "val2": [9.5, 7.5, 8.5, 3.5, np.nan, np.nan]})

In [23]:
df

Unnamed: 0,id,val1,val2
0,100,1.5,9.5
1,200,2.5,7.5
2,200,4.5,8.5
3,300,,3.5
4,300,6.5,
5,300,,


In [35]:
df['new']=df.groupby('id').cumcount()
df_true=df.set_index(['id','new']).unstack(fill_value=0).stack(dropna=False).reset_index('id').set_index('id')
df_true

Unnamed: 0_level_0,val1,val2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
100,1.5,9.5
100,0.0,0.0
100,0.0,0.0
200,2.5,7.5
200,4.5,8.5
200,0.0,0.0
300,,3.5
300,6.5,
300,,


In [30]:
df_true.reset_index(inplace=True,drop=True)

In [32]:
df_true.set_index("id")

Unnamed: 0_level_0,val1,val2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
100,1.5,9.5
100,0.0,0.0
100,0.0,0.0
200,2.5,7.5
200,4.5,8.5
200,0.0,0.0
300,,3.5
300,6.5,
300,,
