In [None]:


file_path = "training_data/915322_sentences_mocap_matched.csv"

print(os.getcwd())

c:\Users\saad_\OneDrive - University of Bath\BSURE\vr_text_entry_project\typing_classifier


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os

ModuleNotFoundError: No module named 'matplotlib'

In [19]:

def process_typing_file(file_path):
    """
    This function takes a file path, loads the data, calculates velocity,
    and returns a cleaned-up DataFrame.
    """
    # Load a sample of the data to keep it manageable
    df = pd.read_csv(file_path, delimiter='\t', skiprows=2, nrows=50000)
    
    # Select the columns we need
    processed_df = df[['time', 'key_symbol', 'Hands_R_I4_x', 'Hands_R_I4_y', 'Hands_R_I4_z']].copy()
    
    # Calculate velocity features
    delta_time = processed_df['time'].diff()
    processed_df['vel_x'] = processed_df['Hands_R_I4_x'].diff() / delta_time
    processed_df['vel_y'] = processed_df['Hands_R_I4_y'].diff() / delta_time
    processed_df['vel_z'] = processed_df['Hands_R_I4_z'].diff() / delta_time
    
    # Create the target label
    processed_df['Is_Tap'] = processed_df['key_symbol'].notna().astype(int)
    
    # Drop rows with NaN values (especially the first row)
    processed_df.dropna(subset=['vel_x', 'vel_y', 'vel_z'], inplace=True)
    
    return processed_df

# --- Now, let's process all files in your 'data' folder ---

data_folder = 'training_data/'
all_files = [os.path.join(data_folder, f) for f in os.listdir(data_folder) if f.endswith('.csv')]

# Create a list to hold all the processed DataFrames
list_of_dfs = []

print(f"Found {len(all_files)} files to process...")

for file in all_files:
    print(f"Processing {file}...")
    processed_df = process_typing_file(file)
    list_of_dfs.append(processed_df)

# Combine all the individual DataFrames into one big master DataFrame
master_df = pd.concat(list_of_dfs, ignore_index=True)

print("\nProcessing complete!")
print("Shape of the final master DataFrame:", master_df.shape)
print(master_df.head())

Found 30 files to process...
Processing training_data/005307_sentences_mocap_matched.csv...
Processing training_data/068349_sentences_mocap_matched.csv...
Processing training_data/096115_sentences_mocap_matched.csv...
Processing training_data/106194_sentences_mocap_matched.csv...
Processing training_data/120778_sentences_mocap_matched.csv...


  df = pd.read_csv(file_path, delimiter='\t', skiprows=2, nrows=50000)


Processing training_data/173269_sentences_mocap_matched.csv...
Processing training_data/216367_sentences_mocap_matched.csv...
Processing training_data/221357_sentences_mocap_matched.csv...
Processing training_data/250694_sentences_mocap_matched.csv...
Processing training_data/267678_sentences_mocap_matched.csv...
Processing training_data/270221_sentences_mocap_matched.csv...
Processing training_data/287951_sentences_mocap_matched.csv...
Processing training_data/301816_sentences_mocap_matched.csv...
Processing training_data/364976_sentences_mocap_matched.csv...
Processing training_data/372392_sentences_mocap_matched.csv...
Processing training_data/376930_sentences_mocap_matched.csv...
Processing training_data/379044_sentences_mocap_matched.csv...
Processing training_data/398591_sentences_mocap_matched.csv...
Processing training_data/419655_sentences_mocap_matched.csv...
Processing training_data/431334_sentences_mocap_matched.csv...
Processing training_data/438454_sentences_mocap_matched

In [20]:
# The label (y) is still the 'Is_Tap' column
y = master_df['Is_Tap']

# The features (X) now include the original position AND the new velocities.
# We'll use all three velocity components plus the most important position (Y-axis).
X = master_df[[
    'Hands_R_I4_y',
    'vel_x',
    'vel_y',
    'vel_z'
]]

print("Features (X) and Labels (y) are now defined with velocity.")

Features (X) and Labels (y) are now defined with velocity.


In [21]:
from sklearn.model_selection import train_test_split

# Split the new data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into new training and testing sets.")

Data split into new training and testing sets.


In [22]:
from sklearn.ensemble import RandomForestClassifier

# Create a new instance of the model
velocity_model = RandomForestClassifier(random_state=42)

# Train the new model on the new training data
print("Training the new model with velocity features...")
velocity_model.fit(X_train, y_train)
print("Model training complete!")

Training the new model with velocity features...
Model training complete!


In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Use the new model to make predictions on the test set
y_pred_new = velocity_model.predict(X_test)

# Calculate the new accuracy
accuracy_new = accuracy_score(y_test, y_pred_new)
print(f"New Model Accuracy: {accuracy_new * 100:.2f}%")

# Print the new confusion matrix
print("\nNew Confusion Matrix (with Velocity Features):")
print(confusion_matrix(y_test, y_pred_new))

New Model Accuracy: 98.25%

New Confusion Matrix (with Velocity Features):
[[294743      0]
 [  5251      0]]


In [24]:
# Check the distribution of the 'Is_Tap' column
tap_counts = master_df['Is_Tap'].value_counts()

print("Checking the contents of the master DataFrame:")
print("-" * 40)
print("Number of 'No-Tap' (0) and 'Tap' (1) rows:")
print(tap_counts)
print("-" * 40)

# Also, let's look at it as a percentage
print("Proportion of 'No-Tap' (0) and 'Tap' (1) rows:")
print(master_df['Is_Tap'].value_counts(normalize=True))

Checking the contents of the master DataFrame:
----------------------------------------
Number of 'No-Tap' (0) and 'Tap' (1) rows:
Is_Tap
0    1473673
1      26297
Name: count, dtype: int64
----------------------------------------
Proportion of 'No-Tap' (0) and 'Tap' (1) rows:
Is_Tap
0    0.982468
1    0.017532
Name: proportion, dtype: float64


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# 1. Define your features (X) and labels (y)
y = master_df['Is_Tap']
X = master_df[['Hands_R_I4_y', 'vel_x', 'vel_y', 'vel_z']]

# 2. Split the data using stratification
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

# 3. Create the model with the class_weight fix
# This is the key change
model_balanced = RandomForestClassifier(random_state=42, class_weight='balanced')

# 4. Train the new model
print("Training the balanced model...")
model_balanced.fit(X_train, y_train)
print("Training complete!")

# 5. Evaluate the balanced model
print("-" * 30)
y_pred_balanced = model_balanced.predict(X_test)
accuracy_balanced = accuracy_score(y_test, y_pred_balanced)

print(f"Final Balanced Model Accuracy: {accuracy_balanced * 100:.2f}%")
print("\nFinal Balanced Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_balanced))

Training the balanced model...
Training complete!
------------------------------
Final Balanced Model Accuracy: 98.25%

Final Balanced Confusion Matrix:
[[294734      1]
 [  5259      0]]


In [4]:
# Choose any one of your raw data files
single_file_path = 'quest_training_data/ptx_01/0_Master_ptx_01_0deg_6_boxing_n_13_166.80.csv'

# Load the raw file
df_sample = pd.read_csv(single_file_path, low_memory=False)

# Downsample it just like in your script
df_sample_downsampled = df_sample.iloc[::4, :].copy()

# Calculate the true average time delta on the downsampled data
true_avg_delta = df_sample_downsampled['TimeStamp'].diff().mean()

# Calculate the true window duration
window_size = 100
true_window_duration = true_avg_delta * window_size

print(f"Correct average time between frames (after downsampling): {true_avg_delta:.4f} seconds")
print(f"Correct estimated window duration: {true_window_duration:.2f} seconds")

Correct average time between frames (after downsampling): 0.0222 seconds
Correct estimated window duration: 2.22 seconds


In [1]:
model.save('ltsm_tap_detector')

NameError: name 'model' is not defined