In [1]:
import os
import json
import pandas as pd
from typing import List, Tuple
from copy import deepcopy

In [2]:
def read_tenant_data(tenant_id: str) -> List[pd.DataFrame]:
    """Load every series file of one tenant.

    Files are read from ``./data/<tenant_id>`` (relative to the current
    working directory) in sorted file-name order, one DataFrame per file.

    Args:
        tenant_id: Name of the tenant sub-folder inside ``./data``.

    Returns:
        A list with one DataFrame per file, ordered by file name.

    Raises:
        FileNotFoundError: If the tenant folder does not exist.
    """
    tenant_dir = os.path.join('.', 'data', tenant_id)

    frames = []
    for file_name in sorted(os.listdir(tenant_dir)):
        file_path = os.path.join(tenant_dir, file_name)
        # os.listdir also returns sub-directories (for example Jupyter's
        # .ipynb_checkpoints); handing those to read_csv would crash the load.
        if not os.path.isfile(file_path):
            continue
        frames.append(pd.read_csv(file_path))

    return frames



In [3]:
def is_same_timesteps_across_series(t_data: List[pd.DataFrame]) -> bool:
    """Tell whether all series carry an identical ``datetime`` column.

    The ``datetime`` column of the first frame is compared against the same
    column of every other frame with ``Series.equals``.

    Args:
        t_data: DataFrames of one tenant, each with a ``datetime`` column.

    Returns:
        True when every frame's ``datetime`` column equals the first one's.
        An empty list or a single frame is trivially consistent, so True.

    Raises:
        KeyError: If a compared frame has no ``datetime`` column.
    """
    # Nothing to compare: avoid the IndexError that t_data[0] would raise.
    if not t_data:
        return True

    reference = t_data[0]['datetime']
    return all(reference.equals(frame['datetime']) for frame in t_data[1:])

In [4]:
def assign_id_per_series(t_data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """Tag each series with a 1-based ``queue`` id as its first column.

    The frames are modified in place and the same list object is returned.
    Calling the function again on already-tagged frames is a no-op, so the
    notebook cell that uses it can safely be re-run.

    Args:
        t_data: DataFrames of one tenant; position ``i`` gets id ``i + 1``.

    Returns:
        The input list, whose frames now start with a ``queue`` column.

    Raises:
        ValueError: If a frame already has a ``queue`` column whose values
            differ from the id it would be assigned.
    """
    for queue_id, series in enumerate(t_data, start=1):
        if 'queue' in series.columns:
            # DataFrame.insert refuses to add an existing column. Accept the
            # column when it already holds the expected id (re-run case) and
            # keep failing loudly when it holds anything else.
            if not (series['queue'] == queue_id).all():
                raise ValueError(
                    f"cannot insert queue, already exists with values other than {queue_id}"
                )
            continue
        series.insert(0, 'queue', queue_id)
    return t_data


In [5]:
# Load the per-file frames of each tenant folder (./data/t1 ... ./data/t4).
t1, t2, t3, t4 = (
    read_tenant_data(tenant_name) for tenant_name in ('t1', 't2', 't3', 't4')
)

In [227]:
# One flag per tenant: do all of its series share the same 'datetime' column?
tuple(is_same_timesteps_across_series(tenant_series) for tenant_series in (t1, t2, t3, t4))

(True, True, True, True)

In [9]:
# Add the 'queue' id column to every series of every tenant.
t1, t2, t3, t4 = (
    assign_id_per_series(tenant_series) for tenant_series in (t1, t2, t3, t4)
)

In [10]:
# Stack the series of each tenant into one DataFrame per tenant.
# Note: t1..t4 change from lists of frames to single frames here.
t1, t2, t3, t4 = (pd.concat(tenant_series) for tenant_series in (t1, t2, t3, t4))

In [13]:
# Root folder that receives one sub-folder per exported dataset.
output_dir = './../../processed/'

# One dataset folder per tenant. os.path.join avoids the doubled separator
# that appending '/...' to a root already ending in '/' would produce.
t1_dir, t2_dir, t3_dir, t4_dir = (
    os.path.join(output_dir, f'aht_tenant_{tenant_no}_ratio_max')
    for tenant_no in (1, 2, 3, 4)
)

for tenant_dir, tenant_frame in zip((t1_dir, t2_dir, t3_dir, t4_dir), (t1, t2, t3, t4)):
    os.makedirs(tenant_dir, exist_ok=True)
    # The CSV is named after its folder:
    # aht_tenant_1_ratio_max/aht_tenant_1_ratio_max.csv, and so on.
    dataset_name = os.path.basename(tenant_dir)
    tenant_frame.to_csv(os.path.join(tenant_dir, f'{dataset_name}.csv'), index=False)

In [None]:
# def split_with_ratio(df_data: List[pd.DataFrame], ratio: int='max') -> Tuple[pd.DataFrame, pd.DataFrame]:
#     df_data_copy = deepcopy(df_data)
#     all_train_data, all_test_data = [], []
#     forecast_horizon = 4 * 24 * 7 # 4 readings per hour, 24 hours per day, 7 days per week
#     for series in df_data_copy:
#         test_data = series.iloc[-forecast_horizon:]
#         train_data = series.iloc[:-forecast_horizon]

#         if ratio != 'max':
#             train_len = ratio * forecast_horizon
#             train_data = train_data.iloc[-train_len:]

#         all_train_data.append(train_data)
#         all_test_data.append(test_data)

#     return pd.concat(all_train_data), pd.concat(all_test_data)


In [None]:
# schema = {
#   "title": "TBD",
#   "description": "TBD",
#   "modelCategory": "forecasting",
#   "schemaVersion": 1.0,
#   "inputDataFormat": "CSV",
#   "encoding": "utf-8",
#   "frequency": "OTHER",
#   "forecastLength": 672,
#   "idField": {
#     "name": "queue",
#     "description": "TBD"
#   },
#   "timeField": {
#     "name": "datetime",
#     "description": "Date and time of the observation.",
#     "dataType": "DATETIME",
#     "example": "2023-11-04 00:00:00"
#   },
#   "forecastTarget": {
#     "name": "target",
#     "description": "TBD",
#     "dataType": "NUMERIC",
#     "example": 5.0
#   },
#   "pastCovariates": [],
#   "futureCovariates": ['covariate1', 'covariate2', 'covariate3'],
#   "staticCovariates": []
# }


In [None]:
# ratios = [2, 4, 6, 8, 10, 'max']
# for i, tenant in enumerate([t1, t2, t3, t4]):
#     for ratio in ratios:
#         save_dir_path = f'./data/processed_data/aht_tenant_{i+1}_ratio_{ratio}'
#         train_file_name = f'aht_tenant_{i+1}_ratio_{ratio}_train.csv'
#         test_file_name = f'aht_tenant_{i+1}_ratio_{ratio}_test.csv'
#         test_key_file_name = f'aht_tenant_{i+1}_ratio_{ratio}_test_key.csv'
#         train_fpath = os.path.join(save_dir_path, train_file_name)
#         test_fpath = os.path.join(save_dir_path, test_file_name)
#         test_key_fpath = os.path.join(save_dir_path, test_key_file_name)
#         schema_fpath = os.path.join(save_dir_path, f'aht_tenant_{i+1}_ratio_{ratio}_schema.json')
#         os.makedirs(save_dir_path, exist_ok=True)

#         train_data, test_data = split_with_ratio(tenant, ratio)
#         test_target_values = test_data['target']
#         test_key_data = test_data.copy()
#         test_data.drop(columns=['target'], inplace=True)
#         test_key_data.drop(columns=['covariate1', 'covariate2', 'covariate3'], inplace=True)

#         train_data.to_csv(train_fpath, index=False)
#         test_data.to_csv(test_fpath, index=False)
#         test_key_data.to_csv(test_key_fpath, index=False)

#         with open(schema_fpath, 'w') as f:
#             schema['title'] = f'aht_tenant_{i+1}_ratio_{ratio}'
#             json.dump(schema, f, indent=4)
            
        
        
