In [None]:
"""
Clean the gesture data recorded by generate_training_data.py, then split it
into training, evaluation and validation sets saved as separate CSV files.

Requires:  'data/gesture_data.csv' file

Author: Rajat Bisht
version: 1.0

"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt



In [3]:
# Load the raw gesture recordings into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
file_path = "../data/gesture_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Summarise the loaded dataset: its shape and its column index,
# framed by two separator lines.
print(
    "+----------------------------------",
    f"| Dataset shape: {df.shape}",
    f"Dataset columns: {df.columns}",
    "+----------------------------------",
    sep="\n",
)

+----------------------------------
| Dataset shape: (4621, 61)
Dataset columns: Index(['label', '0_x', '0_y', '0_z', '1_x', '1_y', '1_z', '2_x', '2_y', '2_z',
       '3_x', '3_y', '3_z', '4_x', '4_y', '4_z', '5_x', '5_y', '5_z', '6_x',
       '6_y', '6_z', '7_x', '7_y', '7_z', '8_x', '8_y', '8_z', '9_x', '9_y',
       '9_z', '10_x', '10_y', '10_z', '11_x', '11_y', '11_z', '12_x', '12_y',
       '12_z', '13_x', '13_y', '13_z', '14_x', '14_y', '14_z', '15_x', '15_y',
       '15_z', '16_x', '16_y', '16_z', '17_x', '17_y', '17_z', '18_x', '18_y',
       '18_z', '19_x', '19_y', '19_z'],
      dtype='object')
+----------------------------------


In [5]:
# 1: handle missing values -- discard every row containing at least one NaN.
# The frame is modified in place, so `df` keeps referring to the same object.
df.dropna(axis=0, how="any", inplace=True)



In [6]:
# 2: remove duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)



In [7]:
# 3: encode the 'label' column with scikit-learn's LabelEncoder.
# The encoding is applied unconditionally, whether or not the labels are
# already numeric.
# NOTE(review): the fitted `label_enc` is not persisted in the cells visible
# here -- confirm downstream code can still recover the original label names.
label_enc = LabelEncoder().fit(df['label'])
df['label'] = label_enc.transform(df['label'])



In [8]:
# 4: separate the frame into a feature matrix X (every column except 'label')
# and a label vector y.
X = df.drop('label', axis=1).values
y = df['label'].values

# Print both shapes (labels first) so the row counts can be compared.
print(y.shape, X.shape, sep="\n")



(4621,)
(4621, 60)


In [9]:
# 5: hold out 30% of the samples; the remaining 70% form the training set.
# Shuffling uses a fixed seed so the split is reproducible.
# NOTE(review): no `stratify` argument is passed, so class proportions are not
# guaranteed to match between the splits -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)



In [10]:
# 6: divide the held-out test portion into two halves:
# an evaluation set and a validation set (same fixed seed as the first split).
X_eval, X_val, y_eval, y_val = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)


In [11]:
# Save the three splits into separate CSV files.
train_data_file = "../data/train_gesture_data.csv"
eval_data_file = "../data/eval_gesture_data.csv"
val_data_file = "../data/val_gesture_data.csv"


def _save_split(features, labels, csv_path):
    """Write one data split to ``csv_path`` and return the frame that was written.

    Args:
        features: 2-D array of feature values, one row per sample.
        labels: 1-D array of labels, aligned row-for-row with ``features``.
        csv_path: Destination path of the CSV file.

    Returns:
        The DataFrame that was saved: the feature columns followed by a
        final 'label' column.

    NOTE(review): the frame is built from a bare array, so the CSV header holds
    pandas' default integer column names rather than the original feature
    names -- confirm downstream readers do not depend on those names.
    """
    split_df = pd.DataFrame(features)
    split_df['label'] = labels
    # index=False: the row index carries no information and is left out of the file.
    split_df.to_csv(csv_path, index=False)
    return split_df


# The per-split frames stay bound to their original names for any later cells.
train_df = _save_split(X_train, y_train, train_data_file)
eval_df = _save_split(X_eval, y_eval, eval_data_file)
val_df = _save_split(X_val, y_val, val_data_file)


In [13]:
# Report where each split was written, followed by the shape of each
# split's feature matrix.
summary_lines = [
    f"Training data saved to {train_data_file}",
    f"Evaluation data saved to {eval_data_file}",
    f"Validation data saved to {val_data_file}",
    f"Training set shape: {X_train.shape}",
    f"Evaluation set shape: {X_eval.shape}",
    f"Validation set shape: {X_val.shape}",
]
print("\n".join(summary_lines))

Training data saved to ../data/train_gesture_data.csv
Evaluation data saved to ../data/eval_gesture_data.csv
Validation data saved to ../data/val_gesture_data.csv
Training set shape: (3234, 60)
Evaluation set shape: (693, 60)
Validation set shape: (694, 60)
