In [1]:
data_folder = "quest_training_data/"

In [None]:
import tensorflow as tf
print("GPUs available:", tf.config.list_physical_devices('GPU'))

In [2]:
import pandas as pd
import numpy as np
import os
import glob # Library for finding files that match a pattern

def process_quest_file(file_path):
    """
    Loads a single data file, downsamples it, and calculates
    velocity and acceleration features.
    """
    # low_memory=False helps prevent data type warnings
    df = pd.read_csv(file_path, low_memory=False)
    
    # Downsample by a factor of 4 to speed up processing
    df = df.iloc[::2, :].copy()

    feature_cols = [
        'TimeStamp',
        'Meta_R_Index_Distal_GLOBAL_X',
        'Meta_R_Index_Distal_GLOBAL_Y',
        'Meta_R_Index_Distal_GLOBAL_Z'
    ]
    label_col = 'KeyPressFlag'
    
    # It's safer to check if columns exist before using them
    required_cols = feature_cols + [label_col]
    if not all(col in df.columns for col in required_cols):
        print(f"  -> Skipping {os.path.basename(file_path)}: missing required columns.")
        return None # Return nothing if a file is missing columns
        
    processed_df = df[required_cols].copy()
    
    delta_time = processed_df['TimeStamp'].diff()
    
    # Calculate Velocity
    processed_df['vel_x'] = processed_df['Meta_R_Index_Distal_GLOBAL_X'].diff() / delta_time
    processed_df['vel_y'] = processed_df['Meta_R_Index_Distal_GLOBAL_Y'].diff() / delta_time
    processed_df['vel_z'] = processed_df['Meta_R_Index_Distal_GLOBAL_Z'].diff() / delta_time
    
    # Calculate Acceleration
    processed_df['accel_x'] = processed_df['vel_x'].diff() / delta_time
    processed_df['accel_y'] = processed_df['vel_y'].diff() / delta_time
    processed_df['accel_z'] = processed_df['vel_z'].diff() / delta_time
    
    processed_df.dropna(inplace=True)
    
    return processed_df




# Use glob to find all .csv files recursively
# The '**' tells glob to search in all subdirectories
search_pattern = os.path.join(data_folder, '**', '*.csv')
all_files = glob.glob(search_pattern, recursive=True)

# A list to hold the processed data from each file
list_of_dfs = []

print(f"Found {len(all_files)} files to process...")

current_subfolder = None

for file in all_files:
    # We'll just print the filename, not the full path, to keep the log clean
    subfolder_name = os.path.basename(os.path.dirname(file))

    if subfolder_name != current_subfolder:
        current_subfolder = subfolder_name
        print(f"\n--- Processing subfolder: {current_subfolder}---")
    #print(f"Processing {os.path.basename(file)}...")
    try:
        processed_df = process_quest_file(file)
        if processed_df is not None:
            list_of_dfs.append(processed_df)
    except Exception as e:
        print(f"  -> ERROR processing {os.path.basename(file)}. Error: {e}")

# Combine all the processed data into one master DataFrame
if list_of_dfs:
    master_df = pd.concat(list_of_dfs, ignore_index=True)

    print("\n--- Processing Complete ---")
    print("Shape of the final master DataFrame:", master_df.shape)
    
    print("\nClass Distribution ('1' is a Tap):")
    # We check if 'KeyPressFlag' exists before trying to access it
    if 'KeyPressFlag' in master_df.columns:
        print(master_df['KeyPressFlag'].value_counts(normalize=True))
    else:
        print("Column 'KeyPressFlag' not found in the final DataFrame.")
else:
    print("\nNo files were processed. Please check your data_folder path and file contents.")

Found 11167 files to process...

--- Processing subfolder: ptx_06---

--- Processing subfolder: ptx_03---

--- Processing subfolder: ptx_01---

--- Processing subfolder: ptx_999---

--- Processing subfolder: ptx_02---

--- Processing Complete ---
Shape of the final master DataFrame: (598965, 11)

Class Distribution ('1' is a Tap):
KeyPressFlag
0    0.539741
1    0.460259
Name: proportion, dtype: float64


In [5]:
# Choose any one of your raw data files
single_file_path = 'quest_training_data/ptx_01/0_Master_ptx_01_0deg_6_boxing_n_13_166.80.csv'

# Load the raw file
df_sample = pd.read_csv(single_file_path, low_memory=False)

# Downsample it just like in your script
df_sample_downsampled = df_sample.iloc[::2, :].copy()

# Calculate the true average time delta on the downsampled data
true_avg_delta = df_sample_downsampled['TimeStamp'].diff().mean()

# Calculate the true window duration
window_size = 100
true_window_duration = true_avg_delta * window_size

print(f"Correct average time between frames (after downsampling): {true_avg_delta:.4f} seconds")
print(f"Correct estimated window duration: {true_window_duration:.2f} seconds")

Correct average time between frames (after downsampling): 0.0111 seconds
Correct estimated window duration: 1.11 seconds


In [6]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix

# --- Step 1: Prepare Data for Windowing ---

# Define the features you want to use in your model
# We'll use position, velocity, and acceleration for the Z-axis (vertical)
feature_columns = [
    'Meta_R_Index_Distal_GLOBAL_Z',
    'vel_z',
    'accel_z'
]

# Create the final array with the label in the FIRST column
# This is the format the windowing function expects
timeseries_data = master_df[['KeyPressFlag'] + feature_columns].to_numpy()

avg_delta = master_df['TimeStamp'].diff().mean()
window_duration = avg_delta * window_size
print(f" Before windowing: Each window covers ~{window_duration:.3f} seconds, average delta: {avg_delta:.9f} seconds")

# --- Step 2: Create Time-Series Windows ---

def make_timeseries_instances(time_series, window_size):
    """Chops the data into overlapping windows."""
    X = []
    y = []
    for i in range(window_size, time_series.shape[0]):
        # The window is the sequence of features from the past
        X.append(time_series[i-window_size:i, 1:])
        # The label is the KeyPressFlag at the end of the window
        y.append(time_series[i, 0])
    return np.array(X), np.array(y).astype(int)

# Define how many past frames the model should see
window_size = 100

print("Creating time-series windows...")
X_windowed, y_windowed = make_timeseries_instances(timeseries_data, window_size)
print("Shape of X_windowed (samples, timesteps, features):", X_windowed.shape)
print("Shape of y_windowed:", y_windowed.shape)


# --- Step 3: Split and Scale the Data ---

# Stratified split is crucial for imbalanced data
X_train, X_test, y_train, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, random_state=42, stratify=y_windowed
)

# Feature Scaling: Neural networks work best when input values are small.
# We need to reshape the 3D data to 2D to scale it, then reshape back.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

avg_delta = master_df['TimeStamp'].diff().mean()
window_duration = avg_delta * window_size
print(f"Each window covers ~{window_duration:.3f} seconds, average delta: {avg_delta:.9f} seconds")


 Before windowing: Each window covers ~0.003 seconds, average delta: 0.000030933 seconds
Creating time-series windows...
Shape of X_windowed (samples, timesteps, features): (598865, 100, 3)
Shape of y_windowed: (598865,)
Each window covers ~0.003 seconds, average delta: 0.000030933 seconds


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# --- Step 4: Build and Train the LSTM Model ---

print("\nBuilding the LSTM model...")
model = Sequential([
    # The LSTM layer processes the sequence. input_shape is (window_size, num_features)
    LSTM(64, input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])),
    Dropout(0.5), # Dropout helps prevent overfitting
    # The final Dense layer gives a single output (tap or no-tap)
    Dense(1, activation='sigmoid') 
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# To handle the class imbalance, we calculate class weights
# This penalizes the model more for missing the rare 'tap' events
from sklearn.utils import class_weight
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = {i : weights[i] for i in range(len(weights))}

print("\nTraining the LSTM model... (This may take a long time)")
# EarlyStopping will stop training if the model isn't improving
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_scaled,
    y_train,
    epochs=15,
    batch_size=256,
    validation_split=0.2, # Use part of the training data for validation
    class_weight=class_weights,
    callbacks=[early_stopping]
)


# --- Step 5: Evaluate the Final Model ---

print("\nEvaluating the final model on the test set...")
# We predict probabilities and use a threshold of 0.5 to get 0s and 1s
y_pred_probs = model.predict(X_test_scaled)
y_pred = (y_pred_probs > 0.5).astype(int)

print("\nFinal LSTM Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

2025-07-29 18:01:20.041751: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-29 18:01:20.182519: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-29 18:01:20.291186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753808480.388660  760272 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753808480.412201  760272 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753808480.614675  760272 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin


Building the LSTM model...


2025-07-29 18:01:22.855920: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(**kwargs)



Training the LSTM model... (This may take a long time)
Epoch 1/15


2025-07-29 18:01:26.133442: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 459927600 exceeds 10% of free system memory.


[1m1497/1498[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 50ms/step - accuracy: 0.5875 - loss: 0.6725

2025-07-29 18:02:43.357013: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 114982800 exceeds 10% of free system memory.


[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 56ms/step - accuracy: 0.5875 - loss: 0.6725 - val_accuracy: 0.6097 - val_loss: 0.6546
Epoch 2/15
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 57ms/step - accuracy: 0.6104 - loss: 0.6542 - val_accuracy: 0.6214 - val_loss: 0.6437
Epoch 3/15
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 60ms/step - accuracy: 0.6220 - loss: 0.6455 - val_accuracy: 0.6347 - val_loss: 0.6349
Epoch 4/15
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 62ms/step - accuracy: 0.6327 - loss: 0.6372 - val_accuracy: 0.6432 - val_loss: 0.6305
Epoch 5/15
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 59ms/step - accuracy: 0.6433 - loss: 0.6295 - val_accuracy: 0.6491 - val_loss: 0.6227
Epoch 6/15
[1m1498/1498[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 60ms/step - accuracy: 0.6468 - loss: 0.6242 - val_accuracy: 0.6574 - val_loss: 0.6148
Epoch 7/15
[1m

2025-07-29 18:24:02.655487: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 143727600 exceeds 10% of free system memory.


[1m3743/3743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 6ms/step

Final LSTM Model Confusion Matrix:
[[45857 18787]
 [18041 37088]]
