In [1]:
# dataloader
import datetime
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import torch

from lib.env import DATA_PATH
# NOTE(review): the star import presumably supplies `datetime_to_epoch` (nothing
# else in this cell provides it). `datetime` and `torch` are imported explicitly
# above so the notebook no longer depends on lib.utils re-exporting them.
from lib.utils import *

In [2]:
def write_regions_for_training_project(training_project):
    """Cut a project's raw acceleration recordings into one CSV per labeled region.

    For every recording directory under ``{DATA_PATH}/0_raw/{training_project}``
    that contains a non-empty ``acceleration.csv``, the ``timestamp`` column is
    rebased so the first sample sits at the date-time encoded in the directory
    name. All recordings are then concatenated, sorted by timestamp and sliced
    with the ``start``/``end`` bounds of each entry in
    ``{DATA_PATH}/1_labeled/{training_project}/regions.json``. Slice ``i`` is
    written to ``{DATA_PATH}/2_regions/{training_project}/{i}.csv``.

    Parameters
    ----------
    training_project : str
        Name of the project directory (used under ``0_raw``, ``1_labeled`` and
        ``2_regions``).

    Raises
    ------
    ValueError
        If no recording of the project contains any acceleration samples.
    """
    raw_dir = f"{DATA_PATH}/0_raw/{training_project}"
    recordings = sorted(os.listdir(raw_dir))
    regions_path = f'{DATA_PATH}/1_labeled/{training_project}/regions.json'
    with open(regions_path, 'r') as f:
        regions = json.load(f)

    dfs = []
    for recording in recordings:
        csv_path = f'{raw_dir}/{recording}/acceleration.csv'
        if not os.path.exists(csv_path):
            continue
        # The first line of the file is skipped; the second line is the header.
        df = pd.read_csv(csv_path, skiprows=1)
        if len(df) == 0:
            continue

        # The directory name holds six integers separated by '-' or '_':
        # year, month, day, hour, minute, second.
        year, month, day, hour, minute, second = [
            int(item) for item in recording.replace('_', '-').split('-')
        ]
        start_epoch = datetime_to_epoch(year, month, day, hour, minute, second)
        # NOTE(review): an earlier comment here mentioned a "4 for utc
        # correction", but no offset is applied - confirm the directory names
        # are already in the intended timezone.
        # NOTE(review): astype('datetime64[ns]') reads numeric values as
        # nanoseconds since the epoch - confirm the CSV timestamps and
        # datetime_to_epoch both use that unit.
        # .iloc[0] takes the first sample by position, independent of the index.
        df['timestamp'] = (
            df['timestamp'] - df['timestamp'].iloc[0] + start_epoch
        ).astype('datetime64[ns]')
        dfs.append(df)

    if not dfs:
        raise ValueError(
            f'No acceleration samples found for training project {training_project!r} in {raw_dir}'
        )

    # Build the time-indexed frame once; every region is a slice of it.
    indexed = pd.concat(dfs).sort_values(by='timestamp').set_index('timestamp')

    out_dir = f'{DATA_PATH}/2_regions/{training_project}'
    os.makedirs(out_dir, exist_ok=True)

    for i, region in enumerate(regions):
        indexed.loc[region['start']:region['end']].to_csv(f'{out_dir}/{i}.csv')

In [3]:
# Materialise the per-region CSVs for every training project.
for training_project in (
    'tj_phase1',
    'tj_phase2',
    'ashlin_phase1',
    'ashlin_phase2',
):
    write_regions_for_training_project(training_project)

In [None]:
# Build one sliding-window dataset per project and save it as `<project>.pt`.
# Each sample is a (3, windowsize) slice of the x/y/z columns; its target is the
# 0/1 label of the sample at the centre of the window.

# projects = os.listdir(f'{DATA_PATH}/2_regions')
projects = ['tj_phase1','tj_phase2','ashlin_phase1','ashlin_phase2']

# Windowing configuration - identical for every project, so set once.
windowsize = 5000   # rows per window
stride = 100        # rows between consecutive window starts
balance = True      # keep at most as many negative windows as positive ones
seed = 0            # makes the random negative sub-sampling reproducible
label_time_format = '%Y-%m-%d %H:%M:%S.%f'

# Local generator: reproducible balancing without touching torch's global RNG.
rng = torch.Generator().manual_seed(seed)

for project in projects:
    labels_path = f'{DATA_PATH}/1_labeled/{project}/labels.json'
    with open(labels_path, 'r') as f:
        labels = json.load(f)
    # Parse the label intervals once per project rather than once per region.
    label_spans = [
        (
            datetime.datetime.strptime(label['start'], label_time_format),
            datetime.datetime.strptime(label['end'], label_time_format),
        )
        for label in labels
    ]
    # Sorted so the order of the stacked samples is deterministic.
    regions = sorted(os.listdir(f'{DATA_PATH}/2_regions/{project}'))

    X_train = []
    y_train = []
    for region in regions:
        df = pd.read_csv(f'{DATA_PATH}/2_regions/{project}/{region}')
        df.timestamp = df.timestamp.astype('datetime64[ns]')

        # Only labels lying strictly inside this region's time span are used.
        t_min, t_max = df.timestamp.min(), df.timestamp.max()
        region_labels = [
            (start, end) for start, end in label_spans
            if start > t_min and end < t_max
        ]
        df['y_true'] = 0
        for start, end in region_labels:
            df.loc[(df.timestamp > start) & (df.timestamp < end), 'y_true'] = 1

        # No resampling is applied; rows are used at their recorded rate.
        X = torch.from_numpy(df[['x','y','z']].values).float()
        y = torch.from_numpy(df['y_true'].values).float()

        # A region shorter than one window cannot contribute any sample.
        if len(X) < windowsize:
            continue

        # Window starts 0, stride, 2*stride, ... up to and INCLUDING
        # len(X) - windowsize, so the last full window is not dropped.
        starts = torch.arange(0, len(X) - windowsize + 1, stride)
        # unfold yields a (n_windows, 3, windowsize) view without copying every
        # overlapping window into memory.
        X_train_i = X.unfold(0, windowsize, stride)
        y_train_i = y[starts + windowsize // 2].reshape(-1, 1)

        idx_1 = torch.where(y_train_i == 1)[0]
        if len(idx_1) == 0:
            # Regions without a single positive window are skipped entirely.
            continue
        if balance:
            idx_0 = torch.where(y_train_i == 0)[0]
            # Random subset of negatives, at most as many as there are positives.
            idx_0 = idx_0[torch.randperm(len(idx_0), generator=rng)[:len(idx_1)]]
            idx = torch.cat([idx_0, idx_1])
            X_train_i, y_train_i = X_train_i[idx], y_train_i[idx]
        X_train.append(X_train_i)
        y_train.append(y_train_i)

    if not X_train:
        raise ValueError(
            f'No region of project {project!r} produced a positive window; nothing to save'
        )
    X_train = torch.vstack(X_train)
    y_train = torch.vstack(y_train)
    print(X_train.shape,y_train.shape)
    torch.save((X_train,y_train),f'{project}.pt')