### CREATE .h5 data

In [20]:
import h5py as h5 ###* A library to read and write hdf5 files
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle

In [21]:
## Load the data

# Build the path with os.path.join so it resolves on any OS; the previous
# hard-coded backslashes are only path separators on Windows.
data_path = os.path.join(
    '..', '..', 'notebooks', 'dinamyc_signs_analysis', 'dinamycs_clean_data.pkl'
)

# NOTE(review): pickle.load can execute arbitrary code while deserialising.
# Only load a pickle produced by this project, never one from an untrusted source.
with open(data_path, "rb") as file:
    data = pickle.load(file)

# `data` is used below as a mapping: sign name -> list of samples.
print("Dict Loaded correctly:")
print('Number of samples per sign')
for key, samples in data.items():
    print(f' {key}: {len(samples)}')


Dict Loaded correctly:
Number of samples per sign
 HOLA: 187
 J: 187
 K: 178
 NN: 183
 X: 159
 Z: 188


In [22]:

# Preview the first five rows of the first 'HOLA' sample.
data['HOLA'][0].head(n=5)

Unnamed: 0,cx,cxROI,cy,cyROI
0,445,120,418,202
1,491,133,389,188
2,522,141,344,166
3,549,148,313,151
4,572,155,295,142


### For each sign we have a sequence of positions

In [23]:
# Preview the last five rows of the first 'HOLA' sample.
data['HOLA'][0].tail(n=5)

Unnamed: 0,cx,cxROI,cy,cyROI
1654,392,109,367,113
1655,347,97,349,107
1656,347,97,329,101
1657,364,101,351,108
1658,376,105,369,113


In [24]:
# Initial parameters
n_points = 21  # landmarks per frame
t_total = 2  # total time of the sign in seconds
average_frames = 71
truncate_data = {}

# Number of rows every sample must have after this cell (frames * landmarks).
target_rows = average_frames * n_points

# Bring every sample of every sign to exactly `target_rows` rows:
#   * samples with at least `average_frames` complete frames are cut to length;
#   * shorter samples are padded by repeating their last `n_points` rows.
for name_sign, values in data.items():
    truncate_df = []  # truncated / padded DataFrames for this sign
    for df in values:
        n_frames = len(df) // n_points
        if n_frames >= average_frames:
            # More frames than needed: keep only the first `target_rows` rows.
            truncate_df.append(df[:target_rows])
        else:
            # Rows still missing to reach the target length (always >= 1 here).
            missing_rows = target_rows - len(df)

            # Last `n_points` rows of the sample, used as the padding pattern.
            last_values = df.iloc[-n_points:]

            # Whole copies of the pattern, plus a partial copy for the remainder.
            # All pieces are collected first and concatenated once, so that
            # pd.concat never receives an empty list (which raises ValueError)
            # when fewer than `n_points` rows are missing.
            full_copies, leftover_rows = divmod(missing_rows, n_points)
            padding_parts = [last_values] * full_copies
            if leftover_rows:
                padding_parts.append(last_values.iloc[:leftover_rows])
            repeated_values = pd.concat(padding_parts, ignore_index=True)

            # Original rows followed by the padding rows.
            df_completed = pd.concat([df, repeated_values], ignore_index=True)
            truncate_df.append(df_completed)

    # Store the processed samples for this sign.
    truncate_data[name_sign] = truncate_df


In [25]:
# Shape of the padding block left over from the last sample padded in the loop above.
np.shape(repeated_values)

(126, 4)

In [26]:
# Last two frames' worth of rows (2 * n_points = 42) of the third 'HOLA' sample.
truncate_data['HOLA'][2].tail(2 * n_points)

Unnamed: 0,cx,cxROI,cy,cyROI
1449,544,117,249,116
1450,586,126,243,114
1451,627,135,224,105
1452,660,142,212,99
1453,686,148,208,97
1454,611,132,161,75
1455,629,136,122,57
1456,639,138,96,45
1457,647,140,73,34
1458,587,127,154,72


In [27]:
# First frame's worth of rows (n_points = 21) of the third 'HOLA' sample.
truncate_data['HOLA'][2].head(n_points)


Unnamed: 0,cx,cxROI,cy,cyROI
0,577,127,264,122
1,617,136,259,120
2,658,145,241,112
3,690,152,233,108
4,715,158,230,107
5,647,143,183,85
6,667,147,149,69
7,678,149,126,58
8,687,151,105,48
9,625,138,175,81


In [28]:
# Row-major flattening of the sample; show the first 84 values.
truncate_data['HOLA'][2].to_numpy().ravel()[:84]

array([577, 127, 264, 122, 617, 136, 259, 120, 658, 145, 241, 112, 690,
       152, 233, 108, 715, 158, 230, 107, 647, 143, 183,  85, 667, 147,
       149,  69, 678, 149, 126,  58, 687, 151, 105,  48, 625, 138, 175,
        81, 649, 143, 137,  63, 666, 147, 113,  52, 679, 150,  91,  42,
       599, 132, 176,  81, 613, 135, 155,  72, 610, 134, 186,  86, 606,
       133, 210,  97, 572, 126, 182,  84, 584, 129, 169,  78, 585, 129,
       195,  90, 583, 128, 214,  99], dtype=int64)

### Flattening row by row gives [cx_1, cxROI_1, cy_1, cyROI_1, ..., cx_n, cxROI_n, cy_n, cyROI_n]

In [29]:
# Total number of values in the flattened sample.
truncate_data['HOLA'][2].to_numpy().ravel().shape

(5964,)

In [30]:
# Reshape straight to (frames, 84); reshape reads in row-major order, so an explicit flatten is not needed.
truncate_data['HOLA'][2].to_numpy().reshape(average_frames, 84).shape

(71, 84)

### 71 is the number of time steps (Tx) and 84 is the number of features

In [31]:
# first frame of the sign (row 0 of the (frames, 84) view)
truncate_data['HOLA'][2].to_numpy().reshape(average_frames, 84)[0]

### These are the features of the first frame of the sign HOLA.

array([577, 127, 264, 122, 617, 136, 259, 120, 658, 145, 241, 112, 690,
       152, 233, 108, 715, 158, 230, 107, 647, 143, 183,  85, 667, 147,
       149,  69, 678, 149, 126,  58, 687, 151, 105,  48, 625, 138, 175,
        81, 649, 143, 137,  63, 666, 147, 113,  52, 679, 150,  91,  42,
       599, 132, 176,  81, 613, 135, 155,  72, 610, 134, 186,  86, 606,
       133, 210,  97, 572, 126, 182,  84, 584, 129, 169,  78, 585, 129,
       195,  90, 583, 128, 214,  99], dtype=int64)

### We have 84 features per frame (21 landmarks × 4 position values)

In [32]:
# Initial parameters
n_points = 21  # landmarks per frame
average_frames = 71  # result from the earlier analysis of the data
total_points = average_frames * n_points  # total number of rows in each dataframe
features = 84  # values per frame: 21 landmarks x 4 columns (cx, cxROI, cy, cyROI)

data_corrected = {}

# Turn each sign's list of DataFrames into one array of shape
# (n_samples, average_frames, features).
for name_sing, values in truncate_data.items():

    n_samples = len(values)  # number of samples for this sign
    print(f'Number of samples for the sign {name_sing} is {n_samples}')

    # Collect the per-sample arrays and concatenate once at the end. Calling
    # np.concatenate inside the loop recopies the whole accumulator for every
    # sample and needs a dummy first row that has to be sliced off afterwards.
    sample_arrays = []
    for df in values:
        # Row-major flatten, then reshape to (1, Tx, features).
        df_array = np.round(df.to_numpy().flatten()).astype(int).reshape(1, average_frames, features)
        sample_arrays.append(df_array)

    # The empty float64 seed keeps the stored dtype as float64 (as before) and
    # yields a (0, average_frames, features) array for a sign with no samples.
    df_array_saved = np.concatenate(
        [np.zeros((0, average_frames, features))] + sample_arrays, axis=0
    )
    data_corrected[name_sing] = df_array_saved  # save the data for this sign


        
        
        
        
        
        
        
    
    


Number of samples for the sign HOLA is 187
Number of samples for the sign J is 187
Number of samples for the sign K is 178
Number of samples for the sign NN is 183
Number of samples for the sign X is 159
Number of samples for the sign Z is 188


In [33]:
# Report the (samples, frames, features) shape stored for each sign.
for key, sign_array in data_corrected.items():
    print(f' {key}: {sign_array.shape}')

 HOLA: (187, 71, 84)
 J: (187, 71, 84)
 K: (178, 71, 84)
 NN: (183, 71, 84)
 X: (159, 71, 84)
 Z: (188, 71, 84)


In [34]:
# Third 'HOLA' sample, first frame: all feature values.
data_corrected['HOLA'][2][0]

array([577., 127., 264., 122., 617., 136., 259., 120., 658., 145., 241.,
       112., 690., 152., 233., 108., 715., 158., 230., 107., 647., 143.,
       183.,  85., 667., 147., 149.,  69., 678., 149., 126.,  58., 687.,
       151., 105.,  48., 625., 138., 175.,  81., 649., 143., 137.,  63.,
       666., 147., 113.,  52., 679., 150.,  91.,  42., 599., 132., 176.,
        81., 613., 135., 155.,  72., 610., 134., 186.,  86., 606., 133.,
       210.,  97., 572., 126., 182.,  84., 584., 129., 169.,  78., 585.,
       129., 195.,  90., 583., 128., 214.,  99.])

### Let's save the data into an .h5 file

In [35]:
# Write one HDF5 group per sign; each group holds a single 'positions' dataset.
with h5.File('..\\features\\positions_dynamics.h5', 'w') as h5file:
    for letter in data_corrected:
        positions = data_corrected[letter]
        h5file.create_group(letter).create_dataset('positions', data=positions)

In [36]:
# Open the file read-only just to show the type of the returned object.
# The `with` block closes the handle straight away; leaving it open keeps the
# file in use for the rest of the session and can make re-running the cell
# that rewrites it in 'w' mode fail.
with h5.File('..\\features\\positions_dynamics.h5', 'r') as dataset:
    dataset_type = type(dataset)
dataset_type

h5py._hl.files.File

In [37]:
# Read every sign group back from the HDF5 file.
# NOTE(review): `data` is rebound here from the pickled dict to a list of arrays,
# and `labels` receives one entry per group (sign), not one per sample - confirm
# that this is what the downstream cells expect.
with h5.File('..\\features\\positions_dynamics.h5', 'r') as h5file:
    data, labels = [], []
    for letter in h5file:
        positions = h5file[letter]['positions'][:]  # load the whole dataset into memory
        data.append(positions)
        print(positions.shape)
        labels.append(letter)

# Column vector of sign names, shape (n_signs, 1).
labels = np.array(labels)
labels = labels.reshape(len(labels), 1)


(187, 71, 84)
(187, 71, 84)
(178, 71, 84)
(183, 71, 84)
(159, 71, 84)
(188, 71, 84)


In [38]:
# Display the label column vector built above (one row per group read from the file).
labels

array([['HOLA'],
       ['J'],
       ['K'],
       ['NN'],
       ['X'],
       ['Z']], dtype='<U4')