In [1]:
# Folder that the processing cell scans for the Quest recording CSV files.
data_folder = "quest_training_data/eyal_0deg"

In [2]:
import pandas as pd
import numpy as np
import os

def process_quest_file(file_path):
    """
    Load one Quest recording and derive fingertip velocity/acceleration features.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing at least the columns 'TimeStamp',
        'Meta_R_Index_Distal_GLOBAL_X', 'Meta_R_Index_Distal_GLOBAL_Y',
        'Meta_R_Index_Distal_GLOBAL_Z' and 'KeyPressFlag'.

    Returns
    -------
    pandas.DataFrame
        The five source columns followed by 'vel_x', 'vel_y', 'vel_z',
        'accel_x', 'accel_y', 'accel_z'. Rows in which any value is missing
        are removed; the surviving rows keep their original row index.

    Raises
    ------
    ValueError
        Raised by pandas when one of the required columns is not in the file.
    """
    # Note: Please double-check these column names match your files exactly.
    position_cols = [
        'Meta_R_Index_Distal_GLOBAL_X',
        'Meta_R_Index_Distal_GLOBAL_Y',
        'Meta_R_Index_Distal_GLOBAL_Z',
    ]
    feature_cols = ['TimeStamp'] + position_cols
    label_col = 'KeyPressFlag'
    wanted_cols = feature_cols + [label_col]

    # Read only the columns we need instead of loading the whole file and
    # selecting afterwards. usecols keeps the file's own column order, so the
    # frame is re-indexed to the expected order right away.
    raw_df = pd.read_csv(file_path, usecols=wanted_cols, low_memory=False)
    processed_df = raw_df[wanted_cols].copy()

    # --- Feature Engineering ---

    # Time elapsed since the previous row. A repeated timestamp gives a step
    # of zero; dividing by it would produce +/-inf, which dropna() does not
    # remove. Masking those steps to NaN makes the affected rows drop out.
    delta_time = processed_df['TimeStamp'].diff()
    delta_time = delta_time.where(delta_time != 0)

    # Velocity (change in position / change in time)
    for axis, col in zip('xyz', position_cols):
        processed_df[f'vel_{axis}'] = processed_df[col].diff() / delta_time

    # Acceleration (change in velocity / change in time). Kept in a second
    # loop so the velocity columns all precede the acceleration columns.
    for axis in 'xyz':
        processed_df[f'accel_{axis}'] = processed_df[f'vel_{axis}'].diff() / delta_time

    # The first two rows never have an acceleration, and any row touching a
    # zero-length time step has undefined derivatives; all of them are NaN here.
    return processed_df.dropna()

# --- Main script to process all files in a folder ---



# Collect the CSV files in the folder. os.listdir() returns names in an
# arbitrary order, and the row order of the combined DataFrame feeds the
# sliding windows built later, so the names are sorted to keep runs reproducible.
try:
    csv_names = sorted(name for name in os.listdir(data_folder) if name.endswith('.csv'))
except FileNotFoundError:
    csv_names = []
    print(f"ERROR: The folder '{data_folder}' was not found. Please update the path.")

all_files = [os.path.join(data_folder, name) for name in csv_names]

# A list to hold the processed data from each file
list_of_dfs = []

print(f"Found {len(all_files)} files to process...")

for file in all_files:
    print(f"Processing {file}...")
    try:
        processed_df = process_quest_file(file)
        list_of_dfs.append(processed_df)
    except Exception as e:
        # Best effort: one unreadable file is reported and skipped rather
        # than aborting the whole batch.
        print(f"  -> Could not process {file}. Error: {e}")

# Combine all the processed data into one master DataFrame.
# NOTE(review): rows from different files end up back to back with no file
# marker, so anything that slides a window over master_df will straddle
# recording boundaries.
if list_of_dfs:
    master_df = pd.concat(list_of_dfs, ignore_index=True)

    print("\n--- Processing Complete ---")
    print("Shape of the final master DataFrame:", master_df.shape)

    # Verify the final data
    print("\nData Info:")
    master_df.info()

    print("\nClass Distribution ('1' is a Tap):")
    print(master_df['KeyPressFlag'].value_counts(normalize=True))
else:
    print("\nNo files were processed. Please check your data_folder path and file contents.")

Found 1049 files to process...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_as_ _28_134.70.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_as_a_29_134.92.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_as_s_30_135.28.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_dance_ _22_132.79.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_dance_a_24_133.69.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_dance_c_26_134.11.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_dance_d_23_133.38.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_dance_e_27_134.48.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_dance_n_25_133.93.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_jovial_a_4_124.92.csv...
Processing quest_training_data/eyal_0deg\0_Master_ptx_01_30deg_58_jo

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix

# --- Step 1: Prepare Data for Windowing ---

# Model inputs: position, velocity and acceleration along the Z-axis (vertical).
feature_columns = ['Meta_R_Index_Distal_GLOBAL_Z', 'vel_z', 'accel_z']

# The windowing function reads the label from column 0 and the features from
# the remaining columns, so KeyPressFlag goes first.
timeseries_data = master_df.loc[:, ['KeyPressFlag', *feature_columns]].to_numpy()


# --- Step 2: Create Time-Series Windows ---

def make_timeseries_instances(time_series, window_size):
    """Slice a label-first array into overlapping look-back windows.

    Sample ``k`` pairs the feature columns (1 onwards) of rows
    ``k .. k + window_size - 1`` with the label (column 0) of row
    ``k + window_size``, i.e. the row immediately AFTER the window.

    Parameters
    ----------
    time_series : numpy.ndarray, shape (n_rows, 1 + n_features)
        Column 0 holds the label, the remaining columns hold the features.
    window_size : int
        Number of consecutive rows in each window.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size, n_features)
    y : numpy.ndarray of int, shape (n_samples,)
        ``n_samples`` is ``max(n_rows - window_size, 0)``. When there are too
        few rows, X is still returned with its three dimensions (and zero
        samples) so downstream code sees a consistent shape.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    features = time_series[:, 1:]
    labels = time_series[window_size:, 0].astype(int)

    if time_series.shape[0] < window_size:
        # Not even one full window: return a correctly shaped empty batch.
        empty = np.empty((0, window_size, features.shape[1]), dtype=features.dtype)
        return empty, labels

    # sliding_window_view yields (n_rows - window_size + 1, n_features, window_size).
    # The last window has no following row to take a label from, so it is
    # dropped; the axes are then swapped to (samples, timesteps, features).
    windows = np.lib.stride_tricks.sliding_window_view(features, window_size, axis=0)
    X = windows[:-1].transpose(0, 2, 1).copy()
    return X, labels

# Number of past frames handed to the model for each prediction
window_size = 20

print("Creating time-series windows...")
X_windowed, y_windowed = make_timeseries_instances(timeseries_data, window_size)
print("Shape of X_windowed (samples, timesteps, features):", X_windowed.shape)
print("Shape of y_windowed:", y_windowed.shape)


# --- Step 3: Split and Scale the Data ---

# Stratify on the label so both splits keep the same tap / no-tap ratio.
# NOTE(review): consecutive windows overlap in all but one frame and this
# split shuffles them, so near-identical windows can land in both train and
# test. Consider splitting by recording before windowing.
X_train, X_test, y_train, y_test = train_test_split(
    X_windowed,
    y_windowed,
    test_size=0.2,
    random_state=42,
    stratify=y_windowed,
)


def _apply_per_frame(transform, windows):
    """Flatten 3D windows to (rows, features), apply `transform`, restore the 3D shape."""
    flat_frames = windows.reshape(-1, windows.shape[-1])
    return transform(flat_frames).reshape(windows.shape)


# Feature scaling: the scaler works on 2D input, so each window is flattened
# to individual frames, scaled, and reshaped back. The statistics are fitted
# on the training split only and reused for the test split.
scaler = StandardScaler()
X_train_scaled = _apply_per_frame(scaler.fit_transform, X_train)
X_test_scaled = _apply_per_frame(scaler.transform, X_test)




Creating time-series windows...
Shape of X_windowed (samples, timesteps, features): (110733, 20, 3)
Shape of y_windowed: (110733,)


In [4]:
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential

# --- Step 4: Build and Train the LSTM Model ---

print("\nBuilding the LSTM model...")
model = Sequential([
    # Declare the input as (window_size, num_features) with an explicit Input
    # layer; passing input_shape= to the first layer makes Keras emit a warning.
    Input(shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])),
    # The LSTM layer processes the sequence
    LSTM(64),
    Dropout(0.5),  # Dropout helps prevent overfitting
    # The final Dense layer gives a single output (tap or no-tap)
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# To handle the class imbalance, we calculate class weights.
# This penalizes the model more for missing the rare 'tap' events.
# Each weight is keyed by its actual class label rather than by its position
# in the weights array, so the mapping stays right even if a label is absent.
classes = np.unique(y_train)
weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}

print("\nTraining the LSTM model... (This may take a long time)")
# EarlyStopping halts training once val_loss has not improved for 3 epochs
# and restores the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_scaled,
    y_train,
    epochs=10,
    batch_size=256,
    validation_split=0.2,  # Use part of the training data for validation
    class_weight=class_weights,
    callbacks=[early_stopping]
)


# --- Step 5: Evaluate the Final Model ---

print("\nEvaluating the final model on the test set...")
# We predict probabilities and use a threshold of 0.5 to get 0s and 1s.
# predict() returns one column per output unit, so the result is flattened
# to match the 1D shape of y_test.
y_pred_probs = model.predict(X_test_scaled)
y_pred = (y_pred_probs > 0.5).astype(int).ravel()

print("\nFinal LSTM Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Building the LSTM model...


  super().__init__(**kwargs)



Training the LSTM model... (This may take a long time)
Epoch 1/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 30ms/step - accuracy: 0.5728 - loss: 0.6713 - val_accuracy: 0.6080 - val_loss: 0.6555
Epoch 2/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - accuracy: 0.6225 - loss: 0.6554 - val_accuracy: 0.6382 - val_loss: 0.6492
Epoch 3/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.6230 - loss: 0.6502 - val_accuracy: 0.6483 - val_loss: 0.6332
Epoch 4/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.6387 - loss: 0.6434 - val_accuracy: 0.6269 - val_loss: 0.6380
Epoch 5/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 27ms/step - accuracy: 0.6363 - loss: 0.6419 - val_accuracy: 0.6673 - val_loss: 0.6177
Epoch 6/10
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 27ms/step - accuracy: 0.6553 - loss: 0.6332 - val_