# Verify Preprocessed Data

This notebook loads preprocessed tensors and reconstructs DataFrames to verify the structure.

In [7]:
import torch
import pandas as pd
import numpy as np
import json
from pathlib import Path

DATA_DIR = '../DATA/processed'
print("✓ Libraries loaded")

✓ Libraries loaded


## 1. Load Preprocessed Data

In [8]:
# Load tensors
train_data = torch.load(f'{DATA_DIR}/train.pt', weights_only=False)
val_data = torch.load(f'{DATA_DIR}/val.pt', weights_only=False)
test_data = torch.load(f'{DATA_DIR}/test.pt', weights_only=False)

# Load metadata
with open(f'{DATA_DIR}/metadata.json', 'r') as f:
    metadata = json.load(f)

with open(f'{DATA_DIR}/scaler_params.json', 'r') as f:
    scaler_params = json.load(f)

print("="*80)
print("DATA LOADED")
print("="*80)
print(f"\nMetadata: {metadata}")
print(f"\nScaler params: {scaler_params}")
print(f"\nTensor keys: {list(train_data.keys())}")

DATA LOADED

Metadata: {'sequence_length': 500, 'num_train': 13855, 'num_val': 3539, 'num_test': 3581, 'random_seed': 42, 'version': 'streaming', 'formats': ['pt', 'h5']}

Scaler params: {'speed_mean': 11.185626394859764, 'speed_std': 3.090054858423608, 'altitude_mean': 168.22054253312723, 'altitude_std': 489.72540129286784}

Tensor keys: ['speed', 'altitude', 'heart_rate', 'timestamps', 'gender', 'userId', 'original_lengths']


## 2. Check Tensor Shapes

In [9]:
print("TENSOR SHAPES:\n")
for split_name, data in [("Train", train_data), ("Val", val_data), ("Test", test_data)]:
    print(f"{split_name} Set:")
    for key, tensor in data.items():
        print(f"  {key:20s}: {str(tensor.shape):30s} dtype={tensor.dtype}")
    print()

TENSOR SHAPES:

Train Set:
  speed               : torch.Size([13855, 500, 1])    dtype=torch.float32
  altitude            : torch.Size([13855, 500, 1])    dtype=torch.float32
  heart_rate          : torch.Size([13855, 500, 1])    dtype=torch.float32
  timestamps          : torch.Size([13855, 500, 1])    dtype=torch.float32
  gender              : torch.Size([13855, 1])         dtype=torch.float32
  userId              : torch.Size([13855, 1])         dtype=torch.int64
  original_lengths    : torch.Size([13855, 1])         dtype=torch.int64

Val Set:
  speed               : torch.Size([3539, 500, 1])     dtype=torch.float32
  altitude            : torch.Size([3539, 500, 1])     dtype=torch.float32
  heart_rate          : torch.Size([3539, 500, 1])     dtype=torch.float32
  timestamps          : torch.Size([3539, 500, 1])     dtype=torch.float32
  gender              : torch.Size([3539, 1])          dtype=torch.float32
  userId              : torch.Size([3539, 1])          dtype=torch.

## 3. Reconstruct DataFrame from Tensors

In [10]:
def tensors_to_dataframe(data_dict):
    """
    Reconstruct DataFrame from tensor dictionary - EXACTLY what's stored in tensors.
    
    Args:
        data_dict: Dictionary with tensors (speed, altitude, heart_rate, etc.)
    
    Returns:
        DataFrame with one row per workout, showing ONLY tensor data (no computed stats)
    """
    n_samples = len(data_dict['speed'])
    seq_len = data_dict['speed'].shape[1]
    
    records = []
    
    for i in range(n_samples):
        # Extract ONLY what's stored in the tensors
        record = {
            'workout_id': i,
            'userId': int(data_dict['userId'][i, 0]),
            'gender': 'male' if data_dict['gender'][i, 0] == 1.0 else 'female',
            'original_length': int(data_dict['original_lengths'][i, 0]),
            'seq_length': seq_len,
            # Store the actual sequences as lists (for inspection)
            'speed_seq': data_dict['speed'][i, :, 0].numpy().tolist(),
            'altitude_seq': data_dict['altitude'][i, :, 0].numpy().tolist(),
            'heart_rate_seq': data_dict['heart_rate'][i, :, 0].numpy().tolist(),
            'timestamp_seq': data_dict['timestamps'][i, :, 0].numpy().tolist()
        }
        records.append(record)
    
    return pd.DataFrame(records)
# Reconstruct DataFrames - exactly what's in the tensors
train_df = tensors_to_dataframe(train_data)
val_df = tensors_to_dataframe(val_data)
test_df = tensors_to_dataframe(test_data)
print(f"Train DataFrame: {train_df.shape}")
print(f"Val DataFrame:   {val_df.shape}")
print(f"Test DataFrame:  {test_df.shape}")
print(f"\nColumns: {list(train_df.columns)}")

Train DataFrame: (13855, 9)
Val DataFrame:   (3539, 9)
Test DataFrame:  (3581, 9)

Columns: ['workout_id', 'userId', 'gender', 'original_length', 'seq_length', 'speed_seq', 'altitude_seq', 'heart_rate_seq', 'timestamp_seq']


## 4. Inspect Reconstructed DataFrame

In [5]:
print("\nTRAIN DATAFRAME SAMPLE:")
print(train_df.head(10))

print("\n" + "="*80)
print("SUMMARY STATISTICS:")
print("="*80)
print(train_df.describe())


TRAIN DATAFRAME SAMPLE:


   workout_id  userId gender  original_length  seq_length  \
0           0  196571   male              500         500   
1           1  196571   male              500         500   
2           2  196571   male              500         500   
3           3  196571   male              378         500   
4           4  196571   male               83         500   
5           5  196571   male              107         500   
6           6  196571   male              130         500   
7           7  196571   male              168         500   
8           8  196571   male              107         500   
9           9  196571   male              108         500   

                                           speed_seq  \
0  [-2.2363219261169434, -1.8553011417388916, -1....   
1  [-1.8416489362716675, 0.28313133120536804, 0.6...   
2  [-2.2313573360443115, -1.710091233253479, -1.0...   
3  [-1.3253471851348877, -0.6352900266647339, -0....   
4  [-2.1953651905059814, -2.169301986694336, -1.

## 5. Verify Data Quality

In [6]:
print("DATA QUALITY CHECKS:\n")

# Check 1: Gender distribution
print("Gender distribution:")
print(train_df['gender'].value_counts())

# Check 2: Sequence lengths
print(f"\nOriginal lengths: min={train_df['original_length'].min()}, "
      f"max={train_df['original_length'].max()}, median={train_df['original_length'].median():.0f}")

# Check 3: HR range
print(f"\nHeart rate range: {train_df['hr_min'].min():.0f} - {train_df['hr_max'].max():.0f} BPM")

# Check 4: Speed range
print(f"Speed range: {train_df['speed_min'].min():.1f} - {train_df['speed_max'].max():.1f} km/h")

# Check 5: Correlations
print(f"\nSpeed-HR correlation: mean={train_df['speed_hr_corr'].mean():.3f}, "
      f"std={train_df['speed_hr_corr'].std():.3f}")
print(f"Altitude-HR correlation: mean={train_df['altitude_hr_corr'].mean():.3f}, "
      f"std={train_df['altitude_hr_corr'].std():.3f}")

print("\n✓ Data quality verification complete!")

DATA QUALITY CHECKS:

Gender distribution:
gender
male      678
female      1
Name: count, dtype: int64

Original lengths: min=50, max=500, median=477


KeyError: 'hr_min'

## 6. Verify No Data Leakage

In [None]:
# Check that users don't overlap between splits
train_users = set(train_df['userId'].unique())
val_users = set(val_df['userId'].unique())
test_users = set(test_df['userId'].unique())

print(f"Unique users - Train: {len(train_users)}, Val: {len(val_users)}, Test: {len(test_users)}")

train_val_overlap = train_users & val_users
train_test_overlap = train_users & test_users
val_test_overlap = val_users & test_users

if len(train_val_overlap) == 0 and len(train_test_overlap) == 0 and len(val_test_overlap) == 0:
    print("\n✅ NO DATA LEAKAGE: Users are properly split between train/val/test")
else:
    print(f"\n❌ WARNING: Data leakage detected!")
    print(f"  Train-Val overlap: {len(train_val_overlap)} users")
    print(f"  Train-Test overlap: {len(train_test_overlap)} users")
    print(f"  Val-Test overlap: {len(val_test_overlap)} users")

Unique users - Train: 18, Val: 4, Test: 5

✅ NO DATA LEAKAGE: Users are properly split between train/val/test
