# Prepare data for the real-time pipeline simulator

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import os
import sys
# A notebook kernel does not define __file__, so the quoted literal is simply
# resolved against the current working directory; cur_dir is therefore the
# directory the notebook is executed from.
cur_dir = os.path.split(os.path.abspath("__file__"))[0]
# One level above the notebook directory. Having this on sys.path is what lets
# the `src.MyDataset` import below resolve.
src_dir = os.path.join(cur_dir, '../')
# Append only when absent so re-running the cell does not add duplicates.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from tqdm.notebook import tqdm
from src.MyDataset import MyDataset

In [2]:
import torch.utils
import torch.utils.data


# Window length handed to MyDataset below.
# presumably the number of past steps per sample — confirm in MyDataset
lookback = 30
# Not referenced anywhere in the visible cells.
# NOTE(review): looks like a prediction horizon — confirm where it is consumed
future_steps = 40
# Directory scanned for '.pkl' recordings by the loading loop further down.
# NOTE(review): this name shadows the builtin `dir`; rename together with its users.
dir = '../data/PandasData/Sampled/'
# Dataset container that receives each processed DataFrame via read_data().
ds = MyDataset(lookback=lookback)
# Batch sizes; not referenced in the visible cells.
train_batch_size = 16
test_batch_size = 64

def process_data(df_dir: str, target_freq: int = 10) -> pd.DataFrame:
    """Load one pickled recording and shape it for the dataset.

    Processing steps, in order:

    1. One-hot encode the ``state`` column (if present) into ``state_*``
       columns and remove the original column.
    2. Drop every row that contains a NaN.
    3. Estimate the sampling rate as the mean number of rows per
       ``TimestampID`` group (presumably one ``TimestampID`` per second --
       TODO confirm) and keep every k-th row so the rate is roughly
       ``target_freq``.
    4. Drop bookkeeping / unused columns (missing ones are ignored).
    5. Move ``User_X`` and ``User_Y`` to the front.

    Args:
        df_dir: Path of the pickle file to load (a file path, despite the name).
        target_freq: Desired number of rows per ``TimestampID`` group.

    Returns:
        The processed DataFrame whose first two columns are ``User_X`` and
        ``User_Y``.

    Raises:
        ValueError: If ``target_freq`` is not positive, if no usable rows
            remain after dropping NaNs, or if the measured rate is below
            ``target_freq``.
        KeyError: If ``TimestampID``, ``User_X`` or ``User_Y`` is missing.
    """
    if target_freq <= 0:
        raise ValueError('target_freq must be a positive number')

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df: pd.DataFrame = pd.read_pickle(df_dir)

    if 'state' in df.columns:
        states_ohe = pd.get_dummies(df['state'], prefix='state')
        df = pd.concat([df.drop(columns=['state']), states_ohe], axis=1)

    df = df.dropna(how='any')

    # Mean row count per TimestampID group, averaged over all columns.
    f_per_sec = df.groupby('TimestampID').count().mean().mean()
    if pd.isna(f_per_sec):
        # Happens when no rows (or no data columns) are left; without this
        # guard the int() conversion below fails with an opaque NaN error.
        raise ValueError(
            'Cannot estimate the data frequency: no usable rows remain '
            'after dropping NaN values'
        )
    if f_per_sec < target_freq:
        raise ValueError('The frequency of the data is lower than the target frequency')

    # Stride-based down-sampling; a ratio of 1 means the data is already at
    # (or within one integer step of) the target rate, so nothing is dropped.
    resample_ratio = int(f_per_sec / target_freq)
    if resample_ratio > 1:
        df = df.iloc[::resample_ratio, :]

    # Columns that are not model inputs; errors='ignore' tolerates recordings
    # that lack some of them.
    drop_columns = [
        'Confidence', 'Timestamp', 'TimestampID',
        'DatapointID', 'PID', 'SCN', 'U_X', 'U_Y', 'U_Z',
        'AGV_Z', 'User_Z', 'GazeOrigin_Z', 'User_Pitch', 'User_Yaw', 'User_Roll',
        'EyeTarget',
    ]
    df = df.drop(columns=drop_columns, errors='ignore')

    # Put the target columns first, keeping the others in their existing order.
    target_columns = ['User_X', 'User_Y']
    other_columns = [col for col in df.columns if col not in target_columns]
    return df[target_columns + other_columns]

# Feed every pickled recording in `dir` through process_data() and into the
# dataset. os.listdir() returns entries in arbitrary order, so the names are
# sorted to make the load order reproducible across runs and platforms.
for file in sorted(os.listdir(dir)):
    if file.endswith('.pkl'):
        df = process_data(os.path.join(dir, file))
        ds.read_data(df)


In [3]:
# stats_dict = {'mean': 0, 'std': 0, 'min': 0, 'max': 0}
# stats_dict = ds.normalize_dataset()

# Write the first few dataset entries to demo/<index>.csv.
# to_csv() does not create missing parent directories, so make sure the
# output folder exists before the first write.
os.makedirs('demo', exist_ok=True)
for i, data in enumerate(ds.dataset):
    # assumes each entry is a DataFrame-like object exposing to_csv() — TODO confirm
    data.to_csv('demo/{}.csv'.format(i), index=False)
    # The check runs after the write, so indices 0..11 (at most 12 files) are exported.
    if i > 10:
        break