# **Testing DataFrame preprocessing & transformation functions**

*Not the pipeline — just a testing ground for the individual preprocessing functions.*

---

In [2]:
import pandas as pd
from pathlib import Path
from preprocessing_functions import (
    reindex_frames,
    handle_missing_face,
    handle_missing_hands,
    fill_missing_hands,
    drop_sequences_by_missing_face,
    resample_sequence,
    center_around_nose,
    scale_by_shoulder_width
)

In [2]:
# Dataset root directory, followed by the per-sequence metadata table
# (the columns printed further down are path, participant_id, sequence_id, sign).
path = Path('../asl-signs')
df_data = pd.read_csv('../asl-signs/train.csv')

## **Frame Reindexing**

In [3]:
# Sample sequence used to exercise reindex_frames.
df = pd.read_parquet('../asl-signs/train_landmark_files/2044/635217.parquet')

# --- before: distinct frame ids exactly as stored in the file ---
df_frames = df['frame'].unique()
print("Frames before: ", df_frames)

# --- after: run the function under test and list the distinct frame ids again ---
df_after = reindex_frames(df)
df_after_frames = df_after['frame'].unique()
print("Frames after: ", df_after_frames)

Frames before:  [22 23 24 25 26 27 28]
Frames after:  [0 1 2 3 4 5 6]


## **Handle Missing Face**

In [4]:
# Scan the metadata for the first sequence whose face `x` values are missing
# in some rows but not in all of them, print its metadata row and stop.
#
# The per-file location is kept in `seq_path` so that the notebook-level
# `path` (the dataset root) is not overwritten as a side effect of this cell.
for _, row in df_data.iterrows():
    seq_path = Path('../asl-signs') / row['path']
    seq = pd.read_parquet(seq_path)

    # Boolean mask over the face rows: True where `x` is NaN.
    face_x_missing = seq.loc[seq['type'] == 'face', 'x'].isna()

    # "Partially missing" = at least one NaN, but not every value NaN.
    if face_x_missing.any() and not face_x_missing.all():
        print(row)
        break

path              train_landmark_files/37055/100035691.parquet
participant_id                                           37055
sequence_id                                          100035691
sign                                                    flower
Name: 8, dtype: object


In [5]:
# Sequence found by the search cell above: its face data is partially missing.
df = pd.read_parquet('../asl-signs/train_landmark_files/37055/100035691.parquet')

# The f-strings below are double-quoted on the outside: reusing the enclosing
# quote character inside a replacement field is only valid on Python 3.12+
# (PEP 701) and is a SyntaxError on earlier interpreters.

# Per-column count of missing face coordinates before the function under test.
face_before = df[df['type'] == 'face']
print(f"Missing x,y,z: {face_before[['x', 'y', 'z']].isnull().sum()}")

df = handle_missing_face(df)

# Same count afterwards.
face_after = df[df['type'] == 'face']
print(f"Missing x,y,z: {face_after[['x', 'y', 'z']].isnull().sum()}")

Missing x,y,z: x    468
y    468
z    468
dtype: int64
Missing x,y,z: x    0
y    0
z    0
dtype: int64


## **Handle Missing Hands**

In [6]:
# Scan the metadata for the first sequence in which BOTH hands have `x`
# values that are missing in some rows but not in all of them, print its
# metadata row and stop.
#
# The per-file location is kept in `seq_path` so that the notebook-level
# `path` (the dataset root) is not overwritten as a side effect of this cell.
for _, row in df_data.iterrows():
    seq_path = Path('../asl-signs') / row['path']
    seq = pd.read_parquet(seq_path)

    # Boolean masks over each hand's rows: True where `x` is NaN.
    left_x_missing = seq.loc[seq['type'] == 'left_hand', 'x'].isna()
    right_x_missing = seq.loc[seq['type'] == 'right_hand', 'x'].isna()

    # "Some missing" = at least one NaN, but not every value NaN.
    left_hand_has_some_missing = left_x_missing.any() and not left_x_missing.all()
    right_hand_has_some_missing = right_x_missing.any() and not right_x_missing.all()

    if left_hand_has_some_missing and right_hand_has_some_missing:
        print(row)
        break

path              train_landmark_files/34503/100137027.parquet
participant_id                                           34503
sequence_id                                          100137027
sign                                                     quiet
Name: 24, dtype: object


In [7]:
# Sequence found by the search cell above: both hands are partially missing.
df = pd.read_parquet('../asl-signs/train_landmark_files/34503/100137027.parquet')

# The f-strings below are double-quoted on the outside: reusing the enclosing
# quote character inside a replacement field is only valid on Python 3.12+
# (PEP 701) and is a SyntaxError on earlier interpreters.

# Reference total to compare the missing counts against: distinct frames x 21.
# NOTE(review): 21 is presumably the number of landmarks per hand - confirm.
print(f"All hand frames: {df['frame'].nunique() * 21}")

# Per-column count of missing hand coordinates before the function under test.
left_hand_before = df[df['type'] == 'left_hand']
right_hand_before = df[df['type'] == 'right_hand']
print(f"Missing left hand x,y,z: {left_hand_before[['x', 'y', 'z']].isnull().sum()}")
print(f"Missing right hand x,y,z: {right_hand_before[['x', 'y', 'z']].isnull().sum()}")

df = handle_missing_hands(df)

# Same counts afterwards.
left_hand_after = df[df['type'] == 'left_hand']
right_hand_after = df[df['type'] == 'right_hand']
print(f"Missing left hand x,y,z: {left_hand_after[['x', 'y', 'z']].isnull().sum()}")
print(f"Missing right hand x,y,z: {right_hand_after[['x', 'y', 'z']].isnull().sum()}")

All hand frames: 546
Missing left hand x,y,z: x    504
y    504
z    504
dtype: int64
Missing right hand x,y,z: x    462
y    462
z    462
dtype: int64
Missing left hand x,y,z: x    0
y    0
z    0
dtype: int64
Missing right hand x,y,z: x    0
y    0
z    0
dtype: int64


## **Fill Missing Hands**

In [8]:
# Scan the metadata for the first sequence whose right-hand `x` values are
# ALL missing, print its metadata row and stop.
#
# The per-file location is kept in `seq_path` so that the notebook-level
# `path` (the dataset root) is not overwritten as a side effect of this cell.
for _, row in df_data.iterrows():
    seq_path = Path('../asl-signs') / row['path']
    seq = pd.read_parquet(seq_path)

    right_hand = seq[seq['type'] == 'right_hand']
    right_hand_missing = right_hand['x'].isna().all()

    if right_hand_missing:
        print(row)
        break

path              train_landmark_files/16069/100015657.parquet
participant_id                                           16069
sequence_id                                          100015657
sign                                                     cloud
Name: 2, dtype: object


In [9]:
# Sequence found by the search cell above: the right hand is missing throughout.
df = pd.read_parquet('../asl-signs/train_landmark_files/16069/100015657.parquet')

# The f-strings below are double-quoted on the outside: reusing the enclosing
# quote character inside a replacement field is only valid on Python 3.12+
# (PEP 701) and is a SyntaxError on earlier interpreters.

# Reference total to compare the missing counts against: distinct frames x 21.
# NOTE(review): 21 is presumably the number of landmarks per hand - confirm.
print(f"All right hand frames: {df['frame'].nunique() * 21}")

# Per-column count of missing right-hand coordinates before the function under test.
right_hand_before = df[df['type'] == 'right_hand']
print(f"Missing right hand x,y,z: {right_hand_before[['x', 'y', 'z']].isnull().sum()}")

df = fill_missing_hands(df)

# Same count afterwards.
right_hand_after = df[df['type'] == 'right_hand']
print(f"Missing right hand x,y,z: {right_hand_after[['x', 'y', 'z']].isnull().sum()}")

All right hand frames: 2205
Missing right hand x,y,z: x    2205
y    2205
z    2205
dtype: int64
Missing right hand x,y,z: x    0
y    0
z    0
dtype: int64


## **Drop Sequences By Missing Face & Save Cleaned Metadata**

In [None]:
# Reload the metadata and the dataset root so this cell does not depend on
# whatever earlier cells left in `df_data` / `path`.
df_data = pd.read_csv('../asl-signs/train.csv')
path = Path('../asl-signs')

# Make sure the output directory exists BEFORE the long scan: the recorded run
# took ~19 minutes, and a missing directory would otherwise only surface as an
# error from to_csv after all that work is done.
Path('../dataset').mkdir(parents=True, exist_ok=True)

# Filter the metadata with a 0.5 threshold and persist the result.
df_metadata_clean = drop_sequences_by_missing_face(df_data = df_data, path = path, threshold = 0.5)
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column; a plain pd.read_csv of this file will show it as 'Unnamed: 0'.
# Consider index=False if no consumer relies on that column.
df_metadata_clean.to_csv('../dataset/train_clean_50_threshhold.csv')

Dropping sequences with more than >=50.0% face frames missing..: 100%|██████████| 94477/94477 [19:07<00:00, 82.36it/s] 



Sequences before: 94477
Dropping: 343 sequences (>50.0% missing face)
Sequences after: 94134


In [None]:
# Make sure the output directory exists BEFORE the long scan: the recorded run
# took ~20 minutes, and a missing directory would otherwise only surface as an
# error from to_csv after all that work is done.
Path('../dataset').mkdir(parents=True, exist_ok=True)

# Filter the metadata with a 0.75 threshold and persist the result.
# Relies on `df_data` (metadata) and `path` (dataset root) from an earlier cell.
df_metadata_clean = drop_sequences_by_missing_face(df_data = df_data, path = path, threshold = 0.75)
df_metadata_clean.to_csv('../dataset/train_clean_75_threshhold.csv')

Dropping sequences with more than >=75.0% face frames missing..: 100%|██████████| 94477/94477 [20:48<00:00, 75.67it/s]



Sequences before: 94477
Dropping: 100 sequences (>75.0% missing face)
Sequences after: 94377


In [13]:
# Make sure the output directory exists BEFORE the long scan: the recorded run
# took ~20 minutes, and a missing directory would otherwise only surface as an
# error from to_csv after all that work is done.
Path('../dataset').mkdir(parents=True, exist_ok=True)

# Filter the metadata with a 1.0 threshold and persist the result.
# Relies on `df_data` (metadata) and `path` (dataset root) from an earlier cell.
df_metadata_clean = drop_sequences_by_missing_face(df_data = df_data, path = path, threshold = 1.0)
df_metadata_clean.to_csv('../dataset/train_clean_100_threshhold.csv')

Dropping sequences with more than >=100.0% face frames missing..: 100%|██████████| 94477/94477 [20:55<00:00, 75.25it/s] 


Sequences before: 94477
Dropping: 26 sequences (>100.0% missing face)
Sequences after: 94451





## **Resample Sequence**

In [6]:
# Use the first entry of the cleaned metadata (0.5-threshold file) as the sample.
df_clean_metadata = pd.read_csv('../dataset/train_clean_50_threshhold.csv')
seq = df_clean_metadata.iloc[0]

# Load the raw landmarks and pass them through the preprocessing steps,
# one after the other, each step receiving the previous step's result.
df = (
    pd.read_parquet(Path('../asl-signs') / seq['path'])
    .pipe(reindex_frames)
    .pipe(handle_missing_face)
    .pipe(handle_missing_hands)
    .pipe(center_around_nose)
    .pipe(scale_by_shoulder_width)
    .pipe(fill_missing_hands)
)

# Distinct frame ids before and after resampling to 60 target frames.
print(df['frame'].unique())
df = resample_sequence(df, target_frames = 60)
print(df['frame'].unique())

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59]


  df = df.groupby('frame', group_keys=False).apply(center_frame)
  df = df.groupby('frame', group_keys=False).apply(scale_frame)
