In [56]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from tqdm import tqdm
import re
import os
import sys
# "__file__" is a quoted literal here (notebooks define no __file__), so abspath()
# resolves it against the working directory and dirname() yields that directory.
cur_dir = os.path.dirname(os.path.abspath("__file__"))
# Parent of the working directory; it has to be importable for `from src...` below.
src_dir = os.path.join(cur_dir, '../')
already_registered = any(entry == src_dir for entry in sys.path)
if not already_registered:
    sys.path.append(src_dir)
    
from src.constant import sidewalks, stations

In [57]:
def process_df(base_name):
    """Build the merged feature/state table for one recording.

    Reads ``../data/FeatureGeneratorOutput/{base_name}.csv`` and
    ``../data/FAMOutput/{base_name}_States.csv`` (both relative to the working
    directory), then:

    1. normalises the feature column names (strip, spaces -> underscores);
    2. drops feature rows whose ``Phase2_scenario_num`` equals 1;
    3. adds ``scenario = Phase2_scenario_num - 1`` and removes both
       ``Phase1_scenario_num`` and ``Phase2_scenario_num``;
    4. left-joins the state rows on ``Timestamp``;
    5. resets the index and overwrites ``Timestamp`` with that 0..n-1 index.

    Args:
        base_name: File stem shared by the feature and state CSVs.

    Returns:
        pandas.DataFrame with one row per merged row and ``Timestamp`` holding
        the row position.

    Raises:
        FileNotFoundError: if either CSV is missing.
        KeyError: if the scenario columns or ``Timestamp`` are absent.
    """
    features = pd.read_csv(f"../data/FeatureGeneratorOutput/{base_name}.csv")
    state = pd.read_csv(f"../data/FAMOutput/{base_name}_States.csv")

    # The previous version also loaded "{base_name}_ModelInputs.csv" into a local
    # named `input` (shadowing the builtin) and never used it; that read is gone.
    features.columns = [c.strip().replace(' ', '_') for c in features.columns]

    # Filter and derive in one chain: .assign() works on a fresh frame, so no
    # SettingWithCopyWarning is raised by writing into a filtered slice.
    features = (
        features.loc[features['Phase2_scenario_num'] != 1]
        .assign(scenario=lambda frame: frame['Phase2_scenario_num'] - 1)
        .drop(columns=['Phase1_scenario_num', 'Phase2_scenario_num'])
    )

    df = pd.merge(features, state, on='Timestamp', how='left').reset_index(drop=True)

    # NOTE(review): the previous version padded 4-digit trailing timestamp fields
    # to 5 digits at this point, immediately before the column was overwritten,
    # so it had no effect on the result. If that padding was meant to align the
    # merge keys, it needs to run on both frames *before* the merge -- confirm.
    df['Timestamp'] = df.index
    return df



In [59]:
# Collect the distinct recording names behind the prediction-model CSVs.
# The suffix is sliced off explicitly: str.strip(".csv") removes any of the
# characters '.', 'c', 's', 'v' from BOTH ends, which mangles names that start
# with one of those letters (e.g. "sam_..." -> "am_...").
csv_suffix = ".csv"
base_files = sorted({
    file_name[:-len(csv_suffix)].split("_Model")[0]
    for file_name in os.listdir("../data/PredictionModelOutput")
    if file_name.endswith(csv_suffix)
})

# Make sure the destination exists so the first to_pickle() call cannot fail on it.
output_dir = "../data/Phase3/Modified"
os.makedirs(output_dir, exist_ok=True)

for base_name in tqdm(base_files, desc="Processing files"):
    # The user id is the first underscore-separated token of the file stem.
    user = base_name.split("_")[0]

    # The condition label is embedded in the stem; fail with a readable message
    # instead of an AttributeError on None when it is absent.
    condition_match = re.search(r'(Control|eHMI_pred|pred_only)', base_name)
    if condition_match is None:
        raise ValueError(
            f"Cannot determine condition (Control|eHMI_pred|pred_only) from file name: {base_name!r}"
        )
    condition = condition_match.group(0)  # not named `type`, to keep the builtin usable

    data = process_df(base_name)
    data['User'] = user
    data['Type'] = condition
    data.to_pickle(f"{output_dir}/{base_name}.pkl")



Processing files: 100%|██████████| 100/100 [00:03<00:00, 31.77it/s]
