In [17]:
"""
Clean the recorded data from generate_training_data.py file

Requires:  'data/gesture_data.csv' file

Author: Rajat Bisht
version: 1.0

"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt



In [56]:
# load the dataset
file_path = "../data/gesture_data.csv"
df = pd.read_csv(file_path, index_col=False)

In [57]:
# display basic informastion about the dataset
print("+----------------------------------")
print(f"| Dataset shape: {df.shape}")
print(f"Dataset columns: {df.columns}")             # 21 landmarks x 3 coordinates = 63 + [label] = > 64 columns
print("+----------------------------------")

+----------------------------------
| Dataset shape: (4621, 64)
Dataset columns: Index(['label', '0_x', '0_y', '0_z', '1_x', '1_y', '1_z', '2_x', '2_y', '2_z',
       '3_x', '3_y', '3_z', '4_x', '4_y', '4_z', '5_x', '5_y', '5_z', '6_x',
       '6_y', '6_z', '7_x', '7_y', '7_z', '8_x', '8_y', '8_z', '9_x', '9_y',
       '9_z', '10_x', '10_y', '10_z', '11_x', '11_y', '11_z', '12_x', '12_y',
       '12_z', '13_x', '13_y', '13_z', '14_x', '14_y', '14_z', '15_x', '15_y',
       '15_z', '16_x', '16_y', '16_z', '17_x', '17_y', '17_z', '18_x', '18_y',
       '18_z', '19_x', '19_y', '19_z', '20_x', '20_y', '20_z'],
      dtype='object')
+----------------------------------


In [42]:
# 1: handeling missing values (if any) by dropping them
df.dropna(inplace=True)



In [43]:
# 2: drop duplicates
df.drop_duplicates(inplace=True)



In [None]:
# # 3: encode labels (if not numeric)
# label_enc = LabelEncoder()
# df['label'] = label_enc.fit_transform(df['label'])



In [None]:
print(df.head())


   label       0_x       0_y           0_z       1_x       1_y       1_z  \
0      0  0.696454  0.636997 -2.310000e-07  0.668111  0.614614 -0.007457   
1      0  0.700438  0.638974 -2.410000e-07  0.671682  0.616348 -0.008217   
2      0  0.702995  0.634602 -2.450000e-07  0.674380  0.614698 -0.008579   
3      0  0.708186  0.628546 -2.520000e-07  0.679156  0.609977 -0.009174   
4      0  0.712229  0.615178 -2.420000e-07  0.683351  0.594831 -0.007945   

        2_x       2_y       2_z  ...      17_z      18_x      18_y      18_z  \
0  0.648974  0.566744 -0.010450  ... -0.012280  0.715679  0.516302 -0.020629   
1  0.652554  0.569284 -0.011798  ... -0.012847  0.719245  0.517477 -0.022022   
2  0.654442  0.565782 -0.012038  ... -0.013593  0.722781  0.512908 -0.023170   
3  0.658345  0.555867 -0.011465  ... -0.009632  0.727873  0.507419 -0.018593   
4  0.664153  0.545000 -0.010325  ... -0.010626  0.733713  0.497498 -0.020006   

       19_x      19_y      19_z      20_x      20_y      20_z 

In [54]:
print(df['label'].describe())

count    4621.000000
mean        2.584938
std         2.179097
min         0.000000
25%         0.000000
50%         3.000000
75%         5.000000
max         6.000000
Name: label, dtype: float64


In [45]:
df['label'].value_counts()

label
0    1367
5     622
4     593
6     569
3     543
2     493
1     434
Name: count, dtype: int64

In [58]:
# 4: seperating features and labels
y = df['label']
X = df.drop(columns=['label'])

print(y.shape)
print(X.shape)



(4621,)
(4621, 63)


In [61]:
y.value_counts()

label
0    1367
5     622
4     593
6     569
3     543
2     493
1     434
Name: count, dtype: int64

In [59]:
# 5: split data into training(70%) and test(30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)



In [60]:
X_train.shape

(3234, 63)

In [62]:
y_train.value_counts()

label
0    937
4    449
5    436
6    392
3    378
2    344
1    298
Name: count, dtype: int64

In [63]:
# 6: Split testing data into evaluation(50% of test) and validation(50% of test)
X_eval, X_val, y_eval, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, shuffle=True)


In [64]:
print(f"x_eval.shape: {X_eval.shape}")

x_eval.shape: (693, 63)


In [65]:
# save the data into speprate files
train_data_file = "../data/train_gesture_data.csv"
eval_data_file = "../data/eval_gesture_data.csv"
val_data_file = "../data/val_gesture_data.csv"

# saving trainig data
train_df = pd.DataFrame(X_train)
train_df['label'] = y_train
train_df.to_csv(train_data_file, index=False)

# Save evaluation data
eval_df = pd.DataFrame(X_eval)
eval_df['label'] = y_eval
eval_df.to_csv(eval_data_file, index=False)

# Save validation data
val_df = pd.DataFrame(X_val)
val_df['label'] = y_val
val_df.to_csv(val_data_file, index=False)


In [66]:
# print data information
print(f"Training data saved to {train_data_file}")
print(f"Evaluation data saved to {eval_data_file}")
print(f"Validation data saved to {val_data_file}")

# Example output of shape after saving
print(f"Training set shape: {X_train.shape}")
print(f"Evaluation set shape: {X_eval.shape}")
print(f"Validation set shape: {X_val.shape}")

Training data saved to ../data/train_gesture_data.csv
Evaluation data saved to ../data/eval_gesture_data.csv
Validation data saved to ../data/val_gesture_data.csv
Training set shape: (3234, 63)
Evaluation set shape: (693, 63)
Validation set shape: (694, 63)
