# **Preprocessing sequences before passing to dataset**

In [None]:
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os

from preprocessing.preprocessing_functions import (
    reindex_frames,
    handle_missing_face,
    handle_missing_hands,
    fill_missing_hands,
    drop_sequences_by_missing_face,
    resample_sequence,
    center_around_nose,
    scale_by_shoulder_width,
    scale_by_shoulder_width_2,
    center_around_nose_2
)

## **1st attempt: 250 classes with a really low sequence count**

Horrible accuracy

In [4]:
# Load the cleaned metadata and inspect how many sequences each sign has.
df_metadata = pd.read_csv('dataset/train_clean_50_threshhold.csv')
print(df_metadata['sign'].value_counts())
print(len(df_metadata))

# Draw a reproducible random sample of one tenth of all rows (fixed seed).
# The sample is taken over the whole table, not per sign, so the number of
# sequences per sign in the subset is uneven.
df_subset = df_metadata.sample(n=len(df_metadata)//10, random_state=99)
# Per-sign (stratified) alternative, kept for reference:
#df_subset = df_metadata.groupby('sign', group_keys=False).apply(
#    lambda x: x.sample(frac=0.05, random_state=99)
#).reset_index(drop=True)
print(df_subset['sign'].value_counts())
print(len(df_subset))

# The preprocessing cell below reads this CSV, so make sure it exists when the
# notebook is run from a fresh kernel. An already existing file is never
# overwritten.
subset_csv = Path('dataset/train_clean_50_threshhold_10_percent.csv')
if not subset_csv.exists():
    df_subset.to_csv(subset_csv, index=False)

sign
listen    415
look      412
donkey    410
hear      405
mouse     405
         ... 
dance     312
person    312
beside    310
vacuum    307
zipper    299
Name: count, Length: 250, dtype: int64
94134
sign
see        57
uncle      55
kitty      54
radio      53
grandpa    53
           ..
tooth      26
person     25
garbage    25
orange     24
horse      21
Name: count, Length: 250, dtype: int64
9413


In [None]:
# Preprocess every sequence listed in the 10% subset and store each one as a
# .npy array with `target_frames` rows.
df_metadata = pd.read_csv('dataset/train_clean_50_threshhold_10_percent.csv')
output_dir = Path('dataset/processed_30frame_test')
# np.save does not create missing directories; without this the first save
# raises FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
target_frames = 30


df_iter = tqdm(df_metadata.iterrows(), total=len(df_metadata), desc="Preprocessing")
for idx, row in df_iter:
    # Raw landmark table for this sequence; 'path' is relative to 'asl-signs'.
    df = pd.read_parquet(Path('asl-signs') / row['path'])
    
    # Helpers come from preprocessing.preprocessing_functions and are applied
    # in this fixed order.
    df = reindex_frames(df)

    df = handle_missing_face(df)
    df = handle_missing_hands(df)

    df = center_around_nose_2(df)
    df = scale_by_shoulder_width_2(df)

    df = fill_missing_hands(df)

    df = resample_sequence(df, target_frames=target_frames)
    
    # Save: flatten the x/y/z columns into one row per frame.
    # NOTE(review): assumes the resampled table holds exactly `target_frames`
    # frames with the same number of rows each, ordered frame by frame - confirm.
    features = df[['x', 'y', 'z']].values.reshape(target_frames, -1)
    np.save(output_dir / f"{row['sequence_id']}.npy", features)


print('Preprocessing complete!')

Preprocessing:   0%|          | 0/9413 [00:00<?, ?it/s]

Preprocessing: 100%|██████████| 9413/9413 [54:00<00:00,  2.90it/s] 

Preprocessing complete!





In [None]:
# Overwrite the 'path' column so every row points at its preprocessed .npy
# file inside `folder`, then store the result as a separate metadata CSV.
folder = 'processed_30frame_test'
df_meta = pd.read_csv('dataset/train_clean_50_threshhold_10_percent.csv')

# One "<folder>/<sequence_id>.npy" entry per row, in row order.
df_meta['path'] = [f'{folder}/{sid}.npy' for sid in df_meta['sequence_id']]

# save
df_meta.to_csv("dataset/train_clean_50_threshhold_10_percent_modified_path.csv", index=False)



### Training

Epoch 50/100                                                                 
Train Loss: 1.9671 | Train Acc: 0.4914
Val Loss:   4.8743 | Val Acc:   0.1817

Epoch 51/100                                                        
Train Loss: 1.9337 | Train Acc: 0.5039
Val Loss:   4.9290 | Val Acc:   0.1700

Epoch 52/100                                                                             
Train Loss: 1.9126 | Train Acc: 0.5092
Val Loss:   4.9257 | Val Acc:   0.1870

Horrible accuracy and heavy overfitting. All in all a bad result, but not surprising considering there are 250 signs overall and only about 20–60 sequences for each.

## **2nd attempt: 25 signs and all of their sequences**

In [6]:
# Keep only the 25 signs with the most sequences, with all of their sequences.
df_metadata = pd.read_csv('dataset/train_clean_50_threshhold.csv')

# Per-sign sequence counts, most frequent first.
sign_counts = df_metadata['sign'].value_counts()
print(sign_counts.head(5))
print(len(df_metadata))

# The first 25 entries of the count table are the selected signs.
top_25_signs = sign_counts.head(25).index.tolist()
is_top_sign = df_metadata['sign'].isin(top_25_signs)
df_subset = df_metadata[is_top_sign].reset_index(drop=True)

print(df_subset['sign'].value_counts().head(5))
print(len(df_subset))

# Persist the reduced metadata for the preprocessing step.
df_subset.to_csv('dataset/train_clean_50_threshhold_25_signs_all.csv', index=False)

sign
listen    415
look      412
donkey    410
hear      405
mouse     405
Name: count, dtype: int64
94134
sign
listen    415
look      412
donkey    410
uncle     405
mouse     405
Name: count, dtype: int64
10051


In [None]:
# Preprocess every sequence of the 25 selected signs and store each one as a
# .npy array with `target_frames` rows.
df_metadata = pd.read_csv('dataset/train_clean_50_threshhold_25_signs_all.csv')
output_dir = Path('dataset/processed_60frame_25_signs')
output_dir.mkdir(parents=True, exist_ok=True)

target_frames = 60

# Helpers from preprocessing.preprocessing_functions, applied to each raw
# sequence in exactly this order before resampling.
preprocessing_steps = (
    reindex_frames,
    handle_missing_face,
    handle_missing_hands,
    center_around_nose_2,
    scale_by_shoulder_width_2,
    fill_missing_hands,
)

for idx, row in tqdm(df_metadata.iterrows(), total=len(df_metadata), desc="Preprocessing"):
    # Raw landmark table for this sequence; 'path' is relative to 'asl-signs'.
    df = pd.read_parquet(Path('asl-signs') / row['path'])

    for step in preprocessing_steps:
        df = step(df)

    df = resample_sequence(df, target_frames=target_frames)

    # Save: flatten the x/y/z columns into one row per frame.
    coordinates = df[['x', 'y', 'z']].to_numpy()
    features = coordinates.reshape(target_frames, -1)
    np.save(output_dir / f"{row['sequence_id']}.npy", features)


print('Preprocessing complete!')

Preprocessing: 100%|██████████| 10051/10051 [1:09:25<00:00,  2.41it/s]

Preprocessing complete!





In [None]:
# Overwrite the 'path' column so every row points at its preprocessed .npy
# file inside `folder`, then store the result as a separate metadata CSV.
folder = 'processed_60frame_25_signs'
df_meta = pd.read_csv('dataset/train_clean_50_threshhold_25_signs_all.csv')

# One "<folder>/<sequence_id>.npy" entry per row, in row order.
df_meta['path'] = [f'{folder}/{sid}.npy' for sid in df_meta['sequence_id']]

# save
df_meta.to_csv('dataset/train_clean_50_threshhold_25_signs_all_modified_path.csv', index=False)

In [None]:

# Build the sign -> integer label mapping for the 25-sign dataset and store it
# as JSON.
df_25_signs = pd.read_csv('dataset/train_clean_50_threshhold_25_signs_all_modified_path.csv')

# Sorting makes the sign -> id assignment deterministic for a given set of signs.
unique_signs = sorted(df_25_signs['sign'].unique())
print(f'signs: {len(unique_signs)}')

# Ids are the positions in the sorted list: 0, 1, 2, ...
label_mapping_25_signs = dict(zip(unique_signs, range(len(unique_signs))))

print('new mapping:')
print(label_mapping_25_signs)

with open('dataset/label_mapping_25.json', 'w') as mapping_file:
    json.dump(label_mapping_25_signs, mapping_file, indent=4)

Number of signs: 25
New label mapping:
{'awake': 0, 'brown': 1, 'bye': 2, 'cat': 3, 'cow': 4, 'donkey': 5, 'duck': 6, 'fireman': 7, 'first': 8, 'hear': 9, 'lips': 10, 'listen': 11, 'look': 12, 'make': 13, 'mouse': 14, 'nuts': 15, 'pen': 16, 'pretend': 17, 'shhh': 18, 'sleepy': 19, 'think': 20, 'toothbrush': 21, 'uncle': 22, 'wake': 23, 'yesterday': 24}


In [3]:
# Display how many sequences each sign has in the 25-sign metadata table.
df_25_signs['sign'].value_counts()

sign
listen        415
look          412
donkey        410
uncle         405
mouse         405
hear          405
cow           404
bye           402
lips          402
pretend       402
duck          401
wake          401
sleepy        401
fireman       400
toothbrush    400
shhh          400
think         399
pen           399
brown         399
awake         399
first         398
nuts          398
yesterday     398
cat           398
make          398
Name: count, dtype: int64