In [None]:
import os
from typing import Callable

import numpy as np
import pandas as pd
import torch
from dtaidistance import dtw as dtw_lib
from torch.utils.data import Dataset

# Tratamento dos Dados com Funções de Alinhamento

In [None]:
def _resample_frames(df, length):
    """Linearly resample every column of ``df`` to exactly ``length`` rows."""
    target_grid = np.linspace(0, 1, length)
    source_grid = np.linspace(0, 1, len(df))
    return pd.DataFrame({column: np.interp(target_grid, source_grid, df[column].values) for column in df.columns})


def get_input(*signals, path, max=100, min=10):
    """Load the parquet samples of each signal found under ``path``.

    Args:
        *signals: Signal names to load. When none are given, the names are
            read from the ``SIGNAL`` column of ``glossary.csv`` (resolved
            against the current working directory).
        path: Root directory that is walked recursively for ``.parquet`` files.
        max: Samples with more than ``max`` rows are linearly resampled down
            to exactly ``max`` rows. (Shadows the builtin inside this function;
            the name is kept because callers may pass it by keyword.)
        min: Samples are kept only when they have strictly more than ``min``
            rows, so a sample with exactly ``min`` rows is discarded.

    Returns:
        dict mapping each signal name to a list of DataFrames (possibly empty),
        with missing values replaced by 0.
    """
    # Only touch the glossary when it is actually needed, so explicit signal
    # lists work without a glossary.csv next to the notebook.
    if not signals:
        signals = pd.read_csv('glossary.csv')['SIGNAL'].values

    # Walk the tree once instead of once per signal; sorting makes the sample
    # order reproducible regardless of the filesystem's listing order.
    parquet_files = sorted(
        (os.path.join(root, file), file)
        for root, _, files in os.walk(path)
        for file in files
        if file.endswith('.parquet')
    )

    samples_by_signal = {}
    for signal in signals:
        dfs = []
        for full_path, file in parquet_files:
            # NOTE(review): substring match on the file name -- a signal whose
            # name is contained in another signal's name will also pick up
            # that signal's files. Confirm against the file naming scheme.
            if signal not in file:
                continue
            df = pd.read_parquet(full_path).fillna(0)
            if len(df) > max:
                df = _resample_frames(df, max)
            if len(df) > min:
                dfs.append(df)
        samples_by_signal[signal] = dfs
    return samples_by_signal

def label_encoder(labels, path='glossary.csv'):
    """Translate signal names into their integer IDs using the glossary CSV.

    Args:
        labels: Iterable of signal names, matched against the ``SIGNAL`` column.
        path: Location of the glossary CSV holding ``SIGNAL`` and ``ID`` columns.

    Returns:
        list of ints, one per label, in the same order as ``labels``. When a
        signal appears more than once in the glossary, the first row wins.

    Raises:
        IndexError: if a label has no matching row in the glossary.
    """
    glossary = pd.read_csv(path)
    encoded = []
    for label in labels:
        matching_ids = glossary.loc[glossary['SIGNAL'] == label, 'ID']
        encoded.append(int(matching_ids.values[0]))
    return encoded

def padding(input):
    """Zero-pad every sample at the end to the length of the longest sample.

    Args:
        input: dict mapping a signal name to a list of DataFrames. Labels
            whose list is empty are simply skipped.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float32 tensor stacking the padded
        samples, ``y`` is a long tensor with the glossary ID of each sample
        (looked up through ``label_encoder``).

    Raises:
        ValueError: if ``input`` contains no samples at all.
    """
    # Flatten first: taking max() per label crashes as soon as one label has
    # no samples, which happens for any signal without matching files.
    samples = [(label, df) for label, dfs in input.items() for df in dfs]
    if not samples:
        raise ValueError('padding() received no samples to pad')

    length = max(df.shape[0] for _, df in samples)
    X = [
        # Pad rows only (after the last frame); columns are left untouched.
        np.pad(df.values, ((0, length - df.shape[0]), (0, 0)), 'constant')
        for _, df in samples
    ]
    y = [label for label, _ in samples]
    X = torch.tensor(np.array(X), dtype=torch.float32)
    y = torch.tensor(np.array(label_encoder(y)), dtype=torch.long)
    return X, y

def interpolate(input):
    """Resample every sample to the mean sample length by linear interpolation.

    Args:
        input: dict mapping a signal name to a list of DataFrames.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float32 tensor stacking the resampled
        samples, ``y`` is a long tensor with the glossary ID of each sample
        (looked up through ``label_encoder``).
    """
    frame_counts = [len(df) for dfs in input.values() for df in dfs]
    length = int(round(np.mean(frame_counts)))

    samples, targets = [], []
    for label, dfs in input.items():
        for df in dfs:
            if len(df) != length:
                # Map both time axes onto [0, 1] and interpolate column by column.
                target_grid = np.linspace(0, 1, length)
                source_grid = np.linspace(0, 1, len(df))
                resampled = {}
                for column in df.columns:
                    resampled[column] = np.interp(target_grid, source_grid, df[column].values)
                frames = pd.DataFrame(resampled).values
            else:
                # Already at the target length: use the values as they are.
                frames = df.values
            samples.append(frames)
            targets.append(label)

    X = torch.tensor(np.array(samples), dtype=torch.float32)
    y = torch.tensor(np.array(label_encoder(targets)), dtype=torch.long)
    return X, y

def _source_rows_per_target(path, length):
    """Pick one source row index for each of the ``length`` target positions of a warping path."""
    groups = [[] for _ in range(length)]
    for src, tgt in path:
        if tgt < length:
            groups[tgt].append(src)
    rows, last = [], 0
    for group in groups:
        if group:
            # Several source frames can map to one target slot; take the middle one.
            last = group[len(group) // 2]
        # An unmatched slot (not expected from a complete path) reuses the previous row.
        rows.append(last)
    return np.asarray(rows, dtype=int)


def dtw(input):
    """Align every sample to the mean sample length along a DTW warping path.

    The warping path is computed between the normalised time axis of the
    sample and the normalised time axis of the target length; each target
    position then takes the sample row the path pairs it with.

    Args:
        input: dict mapping a signal name to a list of DataFrames.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float32 tensor stacking the aligned
        samples (``length`` rows each), ``y`` is a long tensor with the
        glossary ID of each sample (looked up through ``label_encoder``).

    Raises:
        ValueError: if ``input`` contains no samples at all.
    """
    frame_counts = [len(df) for dfs in input.values() for df in dfs]
    if not frame_counts:
        raise ValueError('dtw() received no samples to align')
    length = int(round(np.mean(frame_counts)))
    target_axis = np.linspace(0, 1, length)

    # The path depends only on the two lengths, so compute it once per
    # distinct sample length instead of once per column of every sample.
    rows_by_frame_count = {}

    X, y = [], []
    for label, dfs in input.items():
        for df in dfs:
            values = df.values
            n_frames = len(values)
            if n_frames not in rows_by_frame_count:
                path = dtw_lib.warping_path(np.linspace(0, 1, n_frames), target_axis)
                # Index by target position rather than truncating the path:
                # taking the first `length` path steps would drop the end of
                # every sample longer than the target length.
                rows_by_frame_count[n_frames] = _source_rows_per_target(path, length)
            X.append(values[rows_by_frame_count[n_frames]])
            y.append(label)
    X = torch.tensor(np.array(X), dtype=torch.float32)
    y = torch.tensor(np.array(label_encoder(y)), dtype=torch.long)
    return X, y

# Dataset de Tensores

In [None]:
class SignActionDataset(Dataset):
    """Tensor dataset built by running an alignment function over the raw samples.

    Attributes:
        X: Sample tensor returned by ``method``.
        y: Label tensor returned by ``method``.
        shape: Shape of ``X`` as a plain tuple.
        labels: Sorted list of the distinct label IDs present in ``y``.
    """

    def __init__(self, input, method: Callable):
        """Build the dataset.

        Args:
            input: Raw samples, passed unchanged to ``method``.
            method: Callable taking ``input`` and returning an ``(X, y)`` pair.

        Raises:
            TypeError: if ``method`` is not callable.
        """
        if not callable(method):
            raise TypeError(f'method must be callable, got {type(method).__name__}')
        self.X, self.y = method(input)
        self.shape = tuple(self.X.shape)
        # np.unique yields sorted values, so the label order is reproducible;
        # tolist() turns numpy scalars into plain Python ints.
        self.labels = np.unique(np.asarray(self.y)).tolist()

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
INPUT_PATH = 'DATASET/LANDMARKS'
OUTPUT_PATH = 'DATASET/TENSORS'

# torch.save does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# NOTE: `input` shadows the builtin of the same name for the rest of the
# notebook; the name is kept in case later cells rely on it.
input = get_input(path=INPUT_PATH)

# Build one dataset per alignment strategy and save each under its own file.
# NOTE(review): the whole Dataset object is pickled, so loading presumably
# needs SignActionDataset to be defined and, on recent PyTorch versions,
# torch.load(..., weights_only=False) -- confirm for the version in use.
interpolate_input = SignActionDataset(input, interpolate)
torch.save(interpolate_input, f'{OUTPUT_PATH}/interpolate.pt')
padding_input = SignActionDataset(input, padding)
torch.save(padding_input, f'{OUTPUT_PATH}/padding.pt')
dtw_input = SignActionDataset(input, dtw)
torch.save(dtw_input, f'{OUTPUT_PATH}/dtw.pt')