In [1]:
data_folder = "quest_training_data/"

In [2]:
#Wanna do a quick check to see if TensorFlow can access the GPU
from tensorflow.python.client import device_lib
import tensorflow as tf

def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]

print(get_available_devices())

print(tf.__version__)

['/device:CPU:0', '/device:GPU:0']
2.10.1


In [3]:
import pandas as pd
import numpy as np
import os
import glob
import re # Using regular expressions for robust sorting
import random
import tensorflow as tf

# --- Seeding for reproducibility ---
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# --- Configuration ---
data_folder = "quest_training_data/"

# --- This list will hold the final, clean DataFrames for each session ---
all_session_dfs = []

print("Starting Smart Data Concatenation (per participant)...")

# --- Discover Participant Folders ---
try:
    participant_folders = [f.path for f in os.scandir(data_folder) if f.is_dir()]
except FileNotFoundError:
    print(f"ERROR: The directory '{data_folder}' was not found. Please check the path.")
    participant_folders = []

print(f"Found {len(participant_folders)} participant folders to process.\n")

# --- Main Processing Loop (Iterating through each participant) ---
for folder_path in participant_folders:
    participant_id = os.path.basename(folder_path)
    print(f"--- Processing Participant: {participant_id} ---")

    search_pattern = os.path.join(folder_path, '*.csv')
    csv_files = glob.glob(search_pattern)

    if not csv_files:
        print("  -> No CSV files found, skipping.\n")
        continue

    # --- CRITICAL: Sort files chronologically using the timestamp in the filename ---
    def get_sort_key(filepath):
        matches = re.findall(r'(\d+\.\d+)\.csv', os.path.basename(filepath))
        return float(matches[-1]) if matches else 0

    csv_files.sort(key=get_sort_key)

    # Concatenate all files FOR THIS PARTICIPANT ONLY
    session_dfs = [pd.read_csv(file, low_memory=False) for file in csv_files]
    if session_dfs:
        session_df = pd.concat(session_dfs, ignore_index=True)
        all_session_dfs.append(session_df)
        print(f"  -> Success! Created a session DataFrame with shape: {session_df.shape}")

print("\n--- Smart Concatenation Complete ---")
print(f"Processed {len(all_session_dfs)} total sessions.")

# --- Feature Engineering (Applied to each session individually) ---
processed_sessions = []
for session_df in all_session_dfs:
    # We need all three axes for feature calculation
    feature_cols = [
        'TimeStamp',
        'Meta_R_Index_Distal_GLOBAL_X',
        'Meta_R_Index_Distal_GLOBAL_Y',
        'Meta_R_Index_Distal_GLOBAL_Z'
    ]
    label_col = 'KeyPressFlag'
    
    required_cols = feature_cols + [label_col]
    if not all(col in session_df.columns for col in required_cols):
        continue # Skip if a session is missing data
        
    processed_df = session_df[required_cols].copy()
    
    #delta_time = processed_df['TimeStamp'].diff()
    delta_time = processed_df['TimeStamp'].diff().replace(0, np.nan).ffill()
    # Calculate Velocity & Acceleration for all axes
    for axis in ['X', 'Y', 'Z']:
        pos_col = f'Meta_R_Index_Distal_GLOBAL_{axis}'
        vel_col = f'vel_{axis.lower()}'
        accel_col = f'accel_{axis.lower()}'
        
        processed_df[vel_col] = processed_df[pos_col].diff() / delta_time
        processed_df[accel_col] = processed_df[vel_col].diff() / delta_time
    
    processed_df.dropna(inplace=True)
    processed_sessions.append(processed_df)

# --- THE FIX IS HERE ---
    # 1. Replace any infinite values (positive or negative) with NaN
    processed_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # 2. NOW drop all NaN values (which now includes the replaced infinities)
    processed_df.dropna(inplace=True)
    
    processed_sessions.append(processed_df)

# --- Final Step: Combine the CLEANED sessions into one master DataFrame for windowing ---
if processed_sessions:
    master_df = pd.concat(processed_sessions, ignore_index=True)
    print("\n--- Feature Engineering Complete ---")
    print("Shape of the final master DataFrame for training:", master_df.shape)
else:
    print("\nNo data to process after feature engineering.")

Starting Smart Data Concatenation (per participant)...
Found 3 participant folders to process.

--- Processing Participant: flap ---
  -> Success! Created a session DataFrame with shape: (469, 1121)
--- Processing Participant: ptx_02_x2 ---
  -> Success! Created a session DataFrame with shape: (276698, 1121)
--- Processing Participant: while ---
  -> Success! Created a session DataFrame with shape: (412, 1121)

--- Smart Concatenation Complete ---
Processed 3 total sessions.

--- Feature Engineering Complete ---
Shape of the final master DataFrame for training: (555146, 11)


Feature Engineering!!!

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# --- Step 1: Prepare Data for Windowing ---
# --- MODIFIED: We now include Y-axis features alongside Z-axis ---
feature_columns = [
    'Meta_R_Index_Distal_GLOBAL_Y', # Position Y
    'vel_y',                        # Velocity Y
    'accel_y',                      # Acceleration Y
    'Meta_R_Index_Distal_GLOBAL_Z', # Position Z
    'vel_z',                        # Velocity Z
    'accel_z'                       # Acceleration Z
]

# The rest of the script adapts automatically to the new feature count
timeseries_data = master_df[['KeyPressFlag'] + feature_columns].to_numpy()

window_size = 100

# --- Step 2: Create Time-Series Windows ---
def make_timeseries_instances(time_series, window_size):
    """Chops the data into overlapping windows."""
    X = []
    y = []
    for i in range(window_size, time_series.shape[0]):
        X.append(time_series[i-window_size:i, 1:])
        y.append(time_series[i, 0])
    return np.array(X), np.array(y).astype(int)

print("Creating time-series windows...")
X_windowed, y_windowed = make_timeseries_instances(timeseries_data, window_size)
print("Shape of X_windowed (samples, timesteps, features):", X_windowed.shape)
print("Shape of y_windowed:", y_windowed.shape)

# --- Step 3: Split and Scale the Data ---
X_train, X_test, y_train, y_test = train_test_split(
    X_windowed, y_windowed, test_size=0.2, random_state=42, stratify=y_windowed
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

X_train_scaled = X_train_scaled.astype('float32')
X_test_scaled = X_test_scaled.astype('float32')

print("\nData successfully prepared with Y and Z-axis features.")

Creating time-series windows...
Shape of X_windowed (samples, timesteps, features): (555046, 100, 6)
Shape of y_windowed: (555046,)

Data successfully prepared with Y and Z-axis features.


Building model

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# --- Step 4: Build and Train the LSTM Model ---

print("\nBuilding the LSTM model...")
model = Sequential([
    # The LSTM layer processes the sequence. input_shape is (window_size, num_features)
    #LSTM(64, input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2]), unroll=False, name="lstm"),
    LSTM(
    64,
    activation='tanh',
    recurrent_activation='sigmoid',
    use_bias=True,
    unroll=False,
    recurrent_dropout=0.0,
    dropout=0.0,  # only the LSTM's internal dropout, not the separate Dropout layer
    input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])
    ),
    #Dropout(0.5), # Dropout helps prevent overfitting
    # The final Dense layer gives a single output (tap or no-tap)
    Dense(1, activation='sigmoid') 
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# To handle the class imbalance, calculate class weights
# This penalizes the model more for missing the rare 'tap' events
from sklearn.utils import class_weight
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = {i : weights[i] for i in range(len(weights))}

print("\nTraining the LSTM model... (This may take a long time)")
# EarlyStopping will stop training if the model isn't improving
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
tf.debugging.set_log_device_placement(True)


tf.profiler.experimental.start('logdir')
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=10,
    batch_size=4096,
    validation_split=0.2, # Use part of the training data for validation - perhaps 20% is a good start
    class_weight=class_weights,
    callbacks=[early_stopping]
)

tf.profiler.experimental.stop()
# --- Step 5: Evaluate the Final Model ---

print("\nEvaluating the final model on the test set...")
# We predict probabilities and use a threshold of 0.5 to get 0s and 1s
y_pred_probs = model.predict(X_test_scaled)
y_pred = (y_pred_probs > 0.5).astype(int)

print("\nFinal LSTM Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Building the LSTM model...
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 64)                18176     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 18,241
Trainable params: 18,241
Non-trainable params: 0
_________________________________________________________________

Training the LSTM model... (This may take a long time)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Evaluating the final model on the test set...

Final LSTM Model Confusion Matrix:
[[44079 14004]
 [16022 36905]]


Now saving the model and exporting it, before converting to onnx.

In [8]:
model.save('ltsm_tap_6.keras')  # Save the model for later use

In [9]:

# Load trained model
model = tf.keras.models.load_model('ltsm_tap_6.keras')

# Export it to a new directory
model.export('ltsm_tap_6_exported')

AttributeError: 'Sequential' object has no attribute 'export'

In [None]:
# Probably a better way to do this, but this is a quick fix -- unity isn't happy unless i unroll the LSTM layer..
#redunant now, but was needed for my first model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

input_shape = model.input_shape[1:]  
print("Recovered input shape:", input_shape)

new_model = Sequential([
    LSTM(64, input_shape=input_shape, unroll=True, name="lstm"),
    Dropout(0.5, name="dropout"),
    Dense(1, activation='sigmoid', name="dense")
])

new_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# I assume the new model has the same architecture as the old one
# Transfer weights from the old model to the new model
for old_layer, new_layer in zip(model.layers, new_model.layers):
    try:
        new_layer.set_weights(old_layer.get_weights())
        print(f"Transferred weights for {old_layer.name}")
    except ValueError:
        print(f"Skipped {old_layer.name} (shape mismatch)")

In [None]:
export_path = "/data/transient/ahmedszz/Documents/vr_text_entry_models/vr_text_entry/typing_classifier/ltsm_tap_detector_unrolled_25.keras"
new_model.save(export_path)


# Load trained model
model = tf.keras.models.load_model('ltsm_tap_detector_unrolled_25.keras')

# Export it to a new directory
model.export('ltsm_tap_detector_unrolled_25_exported')

In [None]:
# Quick sanity check to see if the model is valid cortesy of chatgpt

import onnx
import onnxruntime as ort
import numpy as np

# 1. Load the ONNX model
onnx_model_path = "/data/transient/ahmedszz/Documents/vr_text_entry_models/vr_text_entry/typing_classifier/ltsm_tap_detector.onnx"
model = onnx.load(onnx_model_path)

# 2. Check model structure
onnx.checker.check_model(model)
print("✅ Model is structurally valid ONNX")

# 3. Create an ONNX Runtime session
session = ort.InferenceSession(onnx_model_path)

# Print model I/O info
print("Inputs:", [(i.name, i.shape, i.type) for i in session.get_inputs()])
print("Outputs:", [(o.name, o.shape, o.type) for o in session.get_outputs()])

# 4. Run a dummy inference
# Example input shape: (1, 100, 3) -> batch of 1, 100 timesteps, 3 features
dummy_input = np.random.rand(1, 100, 3).astype(np.float32)

# Feed into the session
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name

result = session.run([output_name], {input_name: dummy_input})
print("Dummy inference output:", result)

Moving from tap detection to letter detection. First producing a visualisation. 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Make sure master_df is your fully processed dataframe from the previous step

# 1. Create a new dataframe containing ONLY the rows where a real tap occurred
tap_events_df = master_df[master_df['KeyPressFlag'] == 1].copy()

print(f"Found {len(tap_events_df)} tap events to visualize.")

# 2. Create a 2D scatter plot of the tap locations
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=tap_events_df,
    x='Meta_R_Index_Distal_GLOBAL_X',
    y='Meta_R_Index_Distal_GLOBAL_Z',
    hue='Pressed_Letter', # Color each point by the letter that was pressed
    palette='viridis',
    legend=False # Turning off legend for clarity as there will be many letters
)

plt.title('2D Visualization of All Detected Tap Locations')
plt.xlabel('X Coordinate')
plt.ylabel('Z Coordinate')
plt.grid(True)
plt.axis('equal') # Ensure the scaling of X and Z axes is the same
plt.show()

Visualisation 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

print("--- Generating Model Performance Visualizations ---")

# --- 1. Plot Training & Validation Accuracy and Loss ---
# The 'history' object holds the metrics from each epoch during training
history_dict = history.history

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# Plotting Accuracy
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(epochs, acc, 'bo-', label='Training Accuracy')
plt.plot(epochs, val_acc, 'ro-', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

# Plotting Loss
plt.subplot(1, 2, 2)
plt.plot(epochs, loss, 'bo-', label='Training Loss')
plt.plot(epochs, val_loss, 'ro-', label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.suptitle('Model Training History', fontsize=16)
plt.show()


# --- 2. Plot a Detailed Confusion Matrix ---
# 'y_test' is the true labels, 'y_pred' is what our model predicted
cm = confusion_matrix(y_test, y_pred)

# For better visualization, we can show percentages
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# The labels for our matrix
labels = ['True Neg (No Tap)', 'False Pos (No Tap)', 'False Neg (Tap)', 'True Pos (Tap)']
counts = [f'{value:0.0f}' for value in cm.flatten()]
percentages = [f'{value:.2%}' for value in cm_percent.flatten()]
final_labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(labels, counts, percentages)]
final_labels = np.asarray(final_labels).reshape(2,2)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=final_labels, fmt='', cmap='Blues', cbar=False)
plt.title('Confusion Matrix for Tap Detection')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()


# --- 3. Print a Detailed Classification Report ---
# This report gives precision, recall, and f1-score for each class
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=['No Tap (0)', 'Tap (1)']))
