### CREATE .h5 data

In [7]:
import h5py as h5 ###* A library to read and write hdf5 files
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle

In [8]:
## Load the cleaned dynamic-sign samples from the pickle file

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open pickle files that come from a trusted source.
data_path = '..\\..\\notebooks\\dinamyc_signs_analysis\\dinamycs_clean_data.pkl'
with open(data_path, "rb") as handle:
    data = pickle.load(handle)

# Report how many samples were loaded for each sign.
print("Dict Loaded correctly:")
print('Number of samples per sign')
for sign, samples in data.items():
    print(f' {sign}: {len(samples)}')


Dict Loaded correctly:
Number of samples per sign
 23: 200
 BUENO: 200
 HOLA: 199
 MAL: 200
 NO: 200
 NOMBRE: 200
 QUETALL: 200
 SI: 200
 VEINTICUATRO: 201
 YO: 202


In [9]:

# Preview the first rows of the first 'HOLA' sample.
data['HOLA'][0].head()

Unnamed: 0,cx,cxROI,cy,cyROI
0,385,0,336,165
1,438,52,337,166
2,479,93,313,142
3,497,111,293,122
4,512,126,273,102


### We have for each sign a sequence of positions 

In [10]:
# Preview the last rows of the first 'HOLA' sample.
data['HOLA'][0].tail()

Unnamed: 0,cx,cxROI,cy,cyROI
625,290,75,282,196
626,214,0,214,128
627,275,60,250,164
628,272,57,283,197
629,252,37,281,195


In [11]:
# Initial parameters
n_points = 21  # landmarks per frame
t_total = 2  # total time of the sign in seconds
average_frames = 30  # number of frames every sample is normalized to
truncate_data = {}


def _fit_to_length(df, target_rows, rows_per_frame):
    """Return ``df`` with exactly ``target_rows`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        One sample, one row per landmark.
    target_rows : int
        Number of rows the result must have.
    rows_per_frame : int
        Number of rows that make up one frame.

    Returns
    -------
    pandas.DataFrame
        The first ``target_rows`` rows when ``df`` is long enough; otherwise
        ``df`` followed by its last ``rows_per_frame`` rows repeated
        cyclically until ``target_rows`` is reached (index reset).

    Raises
    ------
    ValueError
        If ``df`` is empty, because there is no frame to repeat.
    """
    if len(df) >= target_rows:
        # Long enough: keep only the leading rows.
        return df.iloc[:target_rows]

    if len(df) == 0:
        raise ValueError("Cannot pad an empty sample: there is no frame to repeat")

    # Rows used as padding material (the last frame when the sample is
    # frame-aligned; fewer rows if the sample is shorter than one frame).
    last_values = df.iloc[-rows_per_frame:]
    missing_rows = target_rows - len(df)

    # Cycle through the padding rows by position. This also covers the case
    # where fewer than ``rows_per_frame`` rows are missing, which previously
    # ended in pd.concat([]) raising "No objects to concatenate".
    cycle = np.arange(missing_rows) % len(last_values)
    repeated_values = last_values.iloc[cycle]

    # Append the padding to the original sample.
    return pd.concat([df, repeated_values], ignore_index=True)


target_rows = average_frames * n_points  # rows per normalized sample

for name_sign, values in data.items():
    # Store the truncated or padded samples of this sign.
    truncate_data[name_sign] = [
        _fit_to_length(df, target_rows, n_points) for df in values
    ]


In [12]:
# Inspect the last 42 rows (two frames of 21 landmarks) of a normalized 'HOLA' sample.
truncate_data['HOLA'][2].tail(42)

Unnamed: 0,cx,cxROI,cy,cyROI
588,428,85,301,233
589,484,141,255,187
590,518,175,196,128
591,544,201,159,91
592,569,226,135,67
593,446,103,149,81
594,440,97,111,43
595,435,92,86,18
596,432,89,67,0
597,407,64,160,92


In [13]:
# Inspect the first 21 rows (one frame of 21 landmarks) of the same 'HOLA' sample.
truncate_data['HOLA'][2].head(21)


Unnamed: 0,cx,cxROI,cy,cyROI
0,697,0,278,113
1,735,37,282,117
2,778,80,267,102
3,804,106,256,91
4,821,123,248,83
5,775,77,206,41
6,808,110,190,25
7,832,134,182,17
8,853,155,175,10
9,757,59,190,25


In [14]:
# Flatten the sample row by row and show the first 84 values.
truncate_data['HOLA'][2].to_numpy().flatten()[0:84]

array([697,   0, 278, 113, 735,  37, 282, 117, 778,  80, 267, 102, 804,
       106, 256,  91, 821, 123, 248,  83, 775,  77, 206,  41, 808, 110,
       190,  25, 832, 134, 182,  17, 853, 155, 175,  10, 757,  59, 190,
        25, 796,  98, 169,   4, 827, 129, 166,   1, 851, 153, 164,   0,
       734,  36, 184,  19, 768,  70, 199,  34, 756,  58, 229,  64, 742,
        44, 245,  80, 712,  14, 186,  21, 740,  42, 206,  41, 734,  36,
       230,  65, 725,  27, 242,  77], dtype=int64)

### Flattening the sample row by row yields [cx_1, cxROI_1, cy_1, cyROI_1, ..., cx_n, cxROI_n, cy_n, cyROI_n]

In [15]:
# Total number of values in the flattened sample.
truncate_data['HOLA'][2].to_numpy().flatten().shape

(2520,)

In [16]:
# Reshape the flat vector into average_frames rows of 84 values each and check the shape.
truncate_data['HOLA'][2].to_numpy().flatten().reshape(average_frames,84).shape

(30, 84)

### 30 is the number of time steps (frames) Tx and 84 is the number of features per frame

In [17]:
# Row 0 of the (average_frames, 84) array, i.e. the 84 values of the first frame.
truncate_data['HOLA'][2].to_numpy().flatten().reshape(average_frames,84)[0,:] # first frame of the sign

### These are the features of the first frame of the sign HOLA.

array([697,   0, 278, 113, 735,  37, 282, 117, 778,  80, 267, 102, 804,
       106, 256,  91, 821, 123, 248,  83, 775,  77, 206,  41, 808, 110,
       190,  25, 832, 134, 182,  17, 853, 155, 175,  10, 757,  59, 190,
        25, 796,  98, 169,   4, 827, 129, 166,   1, 851, 153, 164,   0,
       734,  36, 184,  19, 768,  70, 199,  34, 756,  58, 229,  64, 742,
        44, 245,  80, 712,  14, 186,  21, 740,  42, 206,  41, 734,  36,
       230,  65, 725,  27, 242,  77], dtype=int64)

### We have 84 features (21 landmarks × 4 position values) per frame, i.e. per time step of the sequence

In [18]:
# Initial parameters
n_points = 21  # landmarks per frame
average_frames = 30  # frames per sample, result from the analysis of the data
total_points = average_frames * n_points  # total number of rows in each dataframe
features = 84  # number of values per frame

# Maps each sign name to an array of shape (n_samples, average_frames, features).
data_corrected = {}

for name_sing, values in truncate_data.items():

    n_samples = len(values)  # number of samples for this sign
    print(f'Number of samples for the sign {name_sing} is {n_samples}')

    # Allocate the (m, Tx, features) array once. Growing it with
    # np.concatenate on every sample re-copies all previous samples and
    # needs a dummy zero row that must be sliced off afterwards.
    # The default float64 dtype is kept on purpose: it is what concatenating
    # onto np.zeros produced before, so the saved data stays unchanged.
    df_array_saved = np.zeros((n_samples, average_frames, features))

    for sample_idx, df in enumerate(values):
        # Round, flatten row by row and lay the values out as (Tx, features).
        df_array_saved[sample_idx] = (
            np.round(df.to_numpy().flatten())
            .astype(int)
            .reshape(average_frames, features)
        )

    data_corrected[name_sing] = df_array_saved  # save the stacked samples


Number of samples for the sign 23 is 200
Number of samples for the sign BUENO is 200
Number of samples for the sign HOLA is 199
Number of samples for the sign MAL is 200
Number of samples for the sign NO is 200
Number of samples for the sign NOMBRE is 200
Number of samples for the sign QUETALL is 200
Number of samples for the sign SI is 200
Number of samples for the sign VEINTICUATRO is 201
Number of samples for the sign YO is 202


In [19]:
# Show the shape of the stacked array stored for every sign.
for sign, stacked in data_corrected.items():
    print(f' {sign}: {stacked.shape}')

 23: (200, 30, 84)
 BUENO: (200, 30, 84)
 HOLA: (199, 30, 84)
 MAL: (200, 30, 84)
 NO: (200, 30, 84)
 NOMBRE: (200, 30, 84)
 QUETALL: (200, 30, 84)
 SI: (200, 30, 84)
 VEINTICUATRO: (201, 30, 84)
 YO: (202, 30, 84)


In [20]:
# All feature values at sample index 2, frame index 0 of the stacked 'HOLA' array.
data_corrected['HOLA'][2,0,:]

array([697.,   0., 278., 113., 735.,  37., 282., 117., 778.,  80., 267.,
       102., 804., 106., 256.,  91., 821., 123., 248.,  83., 775.,  77.,
       206.,  41., 808., 110., 190.,  25., 832., 134., 182.,  17., 853.,
       155., 175.,  10., 757.,  59., 190.,  25., 796.,  98., 169.,   4.,
       827., 129., 166.,   1., 851., 153., 164.,   0., 734.,  36., 184.,
        19., 768.,  70., 199.,  34., 756.,  58., 229.,  64., 742.,  44.,
       245.,  80., 712.,  14., 186.,  21., 740.,  42., 206.,  41., 734.,
        36., 230.,  65., 725.,  27., 242.,  77.])

### Let's save the data into an HDF5 (.h5) file

In [21]:
# Write one HDF5 group per sign, each holding a single 'positions' dataset.
with h5.File('..\\features\\positions_dynamics.h5', 'w') as h5file:
    for sign_name, sign_positions in data_corrected.items():
        h5file.create_group(sign_name).create_dataset('positions', data=sign_positions)

In [22]:
# Open the file read-only to confirm it can be opened. The context manager
# closes the handle when the block ends, so no open handle is left behind
# that could keep the file locked when it is rewritten.
with h5.File('..\\features\\positions_dynamics.h5', 'r') as dataset:
    dataset_type = type(dataset)
dataset_type

h5py._hl.files.File

In [23]:
# Read every sign's 'positions' dataset back from the HDF5 file.
# NOTE(review): `data` is rebound here from the dict loaded from the pickle
# to a list of arrays; earlier cells that index data['HOLA'] will fail if
# they are re-run after this one.
with h5.File('..\\features\\positions_dynamics.h5', 'r') as h5file:
    data = []    # one array per group
    labels = []  # one group name per group, in iteration order
    for letter in h5file.keys():
        positions = h5file[letter]['positions'][:]
        print(positions.shape)
        data.append(positions)
        labels.append(letter)

# Turn the list of names into a column vector of shape (number of groups, 1).
labels = np.array(labels)
labels = labels.reshape(labels.shape[0], 1)


(200, 30, 84)
(200, 30, 84)
(199, 30, 84)
(200, 30, 84)
(200, 30, 84)
(200, 30, 84)
(200, 30, 84)
(200, 30, 84)
(201, 30, 84)
(202, 30, 84)


In [24]:
# Display the label column vector built from the HDF5 group names.
labels

array([['23'],
       ['BUENO'],
       ['HOLA'],
       ['MAL'],
       ['NO'],
       ['NOMBRE'],
       ['QUETALL'],
       ['SI'],
       ['VEINTICUATRO'],
       ['YO']], dtype='<U12')