In [2]:
# ============================================================
# HUMAN ACTIVITY RECOGNITION - CMPE 287
# Team: SANNS
# Members: Salma Ibrahim, Sana Al Hamimidi, Akmal Shaikh, Nicholas Faylor, Noah Scheuerman
# Date: 11/14/2025
# ============================================================

"""
This project classifies 18 different human activities using smartphone 
sensor data (accelerometer and gyroscope) from the KU-HAR dataset.

We use two machine learning models:
1. Random Forest Classifier
2. Support Vector Machine (SVM)

Dataset: 20,750 samples from 90 participants
Activities: Walking, Running, Sitting, Standing, Jumping, etc.
"""

# ============================================================
# SECTION 1: SETUP & IMPORTS
# ============================================================

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning - Model Selection
from sklearn.model_selection import train_test_split

# Machine Learning - Preprocessing
from sklearn.preprocessing import StandardScaler

# Machine Learning - Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Machine Learning - Evaluation Metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

# Utilities
import warnings
warnings.filterwarnings('ignore')

# Single seed reused by NumPy and every scikit-learn call in this notebook
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot defaults (matplotlib style first, then the seaborn colour palette)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Collect the status report and emit it in one go
status_lines = [
    "✓ All libraries imported successfully!",
    f"✓ Random state set to: {RANDOM_STATE}",
    f"✓ NumPy version: {np.__version__}",
    f"✓ Pandas version: {pd.__version__}",
    "\nReady to load data!",
]
print("\n".join(status_lines))

✓ All libraries imported successfully!
✓ Random state set to: 42
✓ NumPy version: 2.3.4
✓ Pandas version: 2.3.3

Ready to load data!


In [4]:
# ============================================================
# SECTION 2: LOAD DATASET
# ============================================================

# Activity labels keyed by class ID: the position of a name in the tuple
# below is its class ID (0-17).
activity_names = dict(enumerate((
    'Stand',
    'Sit',
    'Talk-sit',
    'Talk-stand',
    'Stand-sit',
    'Lay',
    'Lay-stand',
    'Pick',
    'Jump',
    'Push-up',
    'Sit-up',
    'Walk',
    'Walk-backward',
    'Walk-circle',
    'Run',
    'Stair-up',
    'Stair-down',
    'Table-tennis',
)))

# Load the header-less KU-HAR CSV, pull out the class-ID column as `labels`,
# and print a summary of what was loaded.
print("Loading KU-HAR Time Domain Subsamples dataset...")
print("=" * 60)

# Position of the class-ID column (the 1,800 sensor columns come first).
LABEL_COLUMN = 1800

# Load the dataset (no header in CSV)
# NOTE: Update the path to match where you saved the CSV file
df = pd.read_csv('data/time_domain_subsamples.csv', header=None)

# Fail early with a readable message instead of an IndexError further down
# if the file does not have the expected layout.
if df.shape[1] <= LABEL_COLUMN:
    raise ValueError(
        f"Expected at least {LABEL_COLUMN + 1} columns "
        f"(sensor readings + class ID), found {df.shape[1]}"
    )

print("✓ Dataset loaded successfully!")
print(f"\nDataset Shape: {df.shape}")
print(f"  - Total samples: {df.shape[0]:,}")
print(f"  - Total columns: {df.shape[1]:,}")

# Extract labels (column 1800 contains class IDs 0-17)
labels = df.iloc[:, LABEL_COLUMN]

print("\nActivity Distribution:")
print("=" * 60)
class_distribution = labels.value_counts().sort_index()

for class_id, count in class_distribution.items():
    # int() keeps the ':2d' format working even if pandas inferred a float
    # column; .get() reports unexpected IDs instead of raising a KeyError.
    class_id = int(class_id)
    activity_name = activity_names.get(class_id, f"Unknown ({class_id})")
    percentage = (count / len(labels)) * 100
    print(f"  {class_id:2d}. {activity_name:15s}: {count:4d} samples ({percentage:5.2f}%)")

print("\nDataset Summary:")
print("=" * 60)
print("  - Sensor readings per sample: 1,800 (300 points × 6 axes)")
print("  - Sampling rate: 100 Hz")
print("  - Window duration: 3 seconds")
# Counted from the data so the summary cannot disagree with the table above
print(f"  - Number of activities: {labels.nunique()}")
print("  - Total participants: 90")

# Verify data structure
print("\nData Structure Verification:")
print("=" * 60)
print("  - Columns 0-299:      Accelerometer X")
print("  - Columns 300-599:    Accelerometer Y")
print("  - Columns 600-899:    Accelerometer Z")
print("  - Columns 900-1199:   Gyroscope X")
print("  - Columns 1200-1499:  Gyroscope Y")
print("  - Columns 1500-1799:  Gyroscope Z")
print("  - Column 1800:        Class ID (0-17)")
print("  - Column 1801:        Data length")
print("  - Column 1802:        Serial number")

print("\n✓ Ready for feature extraction!")

Loading KU-HAR Time Domain Subsamples dataset...
✓ Dataset loaded successfully!

Dataset Shape: (20750, 1803)
  - Total samples: 20,750
  - Total columns: 1,803

Activity Distribution:
   0. Stand          : 1886 samples ( 9.09%)
   1. Sit            : 1874 samples ( 9.03%)
   2. Talk-sit       : 1797 samples ( 8.66%)
   3. Talk-stand     : 1866 samples ( 8.99%)
   4. Stand-sit      : 2178 samples (10.50%)
   5. Lay            : 1813 samples ( 8.74%)
   6. Lay-stand      : 1762 samples ( 8.49%)
   7. Pick           : 1333 samples ( 6.42%)
   8. Jump           :  666 samples ( 3.21%)
   9. Push-up        :  480 samples ( 2.31%)
  10. Sit-up         : 1005 samples ( 4.84%)
  11. Walk           :  882 samples ( 4.25%)
  12. Walk-backward  :  317 samples ( 1.53%)
  13. Walk-circle    :  259 samples ( 1.25%)
  14. Run            :  595 samples ( 2.87%)
  15. Stair-up       :  798 samples ( 3.85%)
  16. Stair-down     :  781 samples ( 3.76%)
  17. Table-tennis   :  458 samples ( 2.21%)

Data

In [5]:
# ============================================================
# SECTION 3: FEATURE EXTRACTION
# ============================================================

# Section banner, followed by one blank line
print(
    "Starting feature extraction...",
    "=" * 60,
    "Converting 1,800 raw sensor values per sample into 24 statistical features",
    "",
    sep="\n",
)

def extract_features(row, samples_per_axis=300):
    """
    Extract statistical features from one sample of time-series sensor data.

    The sample is laid out as six consecutive windows of ``samples_per_axis``
    readings each, in the order Accel X/Y/Z, Gyro X/Y/Z. For each window we
    compute:
    - Mean: Average value over the window
    - Std: Standard deviation (population, i.e. NumPy's default ddof=0)
    - Max: Maximum value
    - Min: Minimum value

    Total: 6 axes × 4 statistics = 24 features

    Parameters
    ----------
    row : pandas.Series, numpy.ndarray or sequence
        One sample. Only the first ``6 * samples_per_axis`` values are used;
        anything after them (class ID, data length, serial number) is ignored.
    samples_per_axis : int, default 300
        Number of readings per sensor axis.

    Returns
    -------
    dict
        Keys ``'<axis>_<stat>'`` (e.g. ``'accel_x_mean'``), ordered axis by
        axis with mean, std, max, min for each.

    Raises
    ------
    ValueError
        If ``row`` is not one-dimensional or holds fewer than
        ``6 * samples_per_axis`` values.
    """
    axis_names = ('accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z')

    # Slice the underlying array by position so the result never depends on
    # the Series' index labels, and plain arrays/lists work as well.
    values = np.asarray(row)
    required = len(axis_names) * samples_per_axis
    if values.ndim != 1 or values.shape[0] < required:
        raise ValueError(
            f"Expected a 1-D sample with at least {required} sensor values, "
            f"got shape {values.shape}"
        )

    features = {}
    for axis_pos, axis in enumerate(axis_names):
        start = axis_pos * samples_per_axis
        window = values[start:start + samples_per_axis]
        features[f'{axis}_mean'] = np.mean(window)
        features[f'{axis}_std'] = np.std(window)
        features[f'{axis}_max'] = np.max(window)
        features[f'{axis}_min'] = np.min(window)

    return features

# Run extract_features over every row of `df`, collect the results into
# `features_df`, and report what was produced.
total_samples = len(df)
PROGRESS_EVERY = 5000  # print a progress line every this many samples

print(f"Extracting features from all {total_samples:,} samples...")
print("This may take a minute or two...\n")

feature_list = []

# Count rows with enumerate rather than relying on the DataFrame index label:
# the label only equals the row position for a default RangeIndex.
for n_done, (_, row) in enumerate(df.iterrows(), start=1):
    feature_list.append(extract_features(row))

    if n_done % PROGRESS_EVERY == 0:
        progress = (n_done / total_samples) * 100
        print(f"  Progress: {n_done:,}/{total_samples:,} samples ({progress:.1f}%)")

# Skip the closing line when the loop already reported 100%
if total_samples % PROGRESS_EVERY != 0:
    print(f"  Progress: {total_samples:,}/{total_samples:,} samples (100.0%)")

# Create feature DataFrame (one row per sample, one column per feature)
features_df = pd.DataFrame(feature_list)
assert len(features_df) == total_samples, "feature rows do not match input rows"

print("\n✓ Feature extraction complete!")
print("=" * 60)
print(f"Feature Matrix Shape: {features_df.shape}")
print(f"  - Samples: {features_df.shape[0]:,}")
print(f"  - Features per sample: {features_df.shape[1]}")

print("\nExtracted Features:")
print("-" * 60)
for i, col in enumerate(features_df.columns, 1):
    print(f"  {i:2d}. {col}")

print("\nFeature Statistics (first 5 features):")
print("-" * 60)
print(features_df.iloc[:, :5].describe())

print("\n✓ Ready for train-test split!")

Starting feature extraction...
Converting 1,800 raw sensor values per sample into 24 statistical features

Extracting features from all 20,750 samples...
This may take a minute or two...

  Progress: 5,000/20,750 samples (24.1%)
  Progress: 10,000/20,750 samples (48.2%)
  Progress: 15,000/20,750 samples (72.3%)
  Progress: 20,000/20,750 samples (96.4%)
  Progress: 20,750/20,750 samples (100.0%)

✓ Feature extraction complete!
Feature Matrix Shape: (20750, 24)
  - Samples: 20,750
  - Features per sample: 24

Extracted Features:
------------------------------------------------------------
   1. accel_x_mean
   2. accel_x_std
   3. accel_x_max
   4. accel_x_min
   5. accel_y_mean
   6. accel_y_std
   7. accel_y_max
   8. accel_y_min
   9. accel_z_mean
  10. accel_z_std
  11. accel_z_max
  12. accel_z_min
  13. gyro_x_mean
  14. gyro_x_std
  15. gyro_x_max
  16. gyro_x_min
  17. gyro_y_mean
  18. gyro_y_std
  19. gyro_y_max
  20. gyro_y_min
  21. gyro_z_mean
  22. gyro_z_std
  23. gyro_z_m

In [None]:
# ============================================================
# SECTION 4: TRAIN-TEST SPLIT
# ============================================================

# Split features/labels 80/20 with stratification, then print the per-class
# train/test distribution and check that the two actually match.
print("Splitting data into training and testing sets...")
print("=" * 60)

# Split: 80% training, 20% testing
# Stratify ensures each activity is proportionally represented in both sets
X_train, X_test, y_train, y_test = train_test_split(
    features_df,           # Feature matrix (24 features)
    labels,                # Class labels (0-17)
    test_size=0.2,         # 20% for testing
    stratify=labels,       # Maintain class distribution
    random_state=RANDOM_STATE  # Reproducibility
)

print("✓ Data split complete!")
print()

# Display split information
print("Dataset Split Summary:")
print("-" * 60)
print(f"  Training samples:   {len(X_train):,} ({len(X_train)/len(features_df)*100:.1f}%)")
print(f"  Testing samples:    {len(X_test):,} ({len(X_test)/len(features_df)*100:.1f}%)")
print(f"  Total samples:      {len(features_df):,}")
print()
print(f"  Features per sample: {X_train.shape[1]}")

# Verify stratification worked - show class distribution in train/test
print("\nClass Distribution Verification:")
print("=" * 60)
print(f"{'Activity':<20} {'Train Count':<15} {'Test Count':<15} {'Train %':<10} {'Test %'}")
print("-" * 60)

train_dist = y_train.value_counts().sort_index()
test_dist = y_test.value_counts().sort_index()

# Largest train-vs-test share difference (in percentage points) that still
# counts as "proportional".
MAX_PCT_GAP = 1.0
largest_gap = 0.0

# Iterate the label mapping itself so the table cannot fall out of sync with
# the number of defined activities.
for class_id, activity in activity_names.items():
    train_count = train_dist.get(class_id, 0)
    test_count = test_dist.get(class_id, 0)
    train_pct = (train_count / len(y_train)) * 100
    test_pct = (test_count / len(y_test)) * 100
    largest_gap = max(largest_gap, abs(train_pct - test_pct))

    print(f"{activity:<20} {train_count:<15} {test_count:<15} {train_pct:<10.2f} {test_pct:.2f}")

print()
# Only claim success when the distributions were actually compared
if largest_gap <= MAX_PCT_GAP:
    print("✓ Stratification successful - distributions are proportional!")
else:
    print(f"⚠ Stratification check failed - largest train/test gap is {largest_gap:.2f} percentage points")
print("\n✓ Ready for feature scaling!")


Splitting data into training and testing sets...
✓ Data split complete!

Dataset Split Summary:
------------------------------------------------------------
  Training samples:   16,600 (80.0%)
  Testing samples:    4,150 (20.0%)
  Total samples:      20,750

  Features per sample: 24

Class Distribution Verification:
Activity             Train Count     Test Count      Train %    Test %
------------------------------------------------------------
Stand                1509            377             9.09       9.08
Sit                  1499            375             9.03       9.04
Talk-sit             1438            359             8.66       8.65
Talk-stand           1493            373             8.99       8.99
Stand-sit            1742            436             10.49      10.51
Lay                  1450            363             8.73       8.75
Lay-stand            1410            352             8.49       8.48
Pick                 1066            267             6.42       

''

In [7]:
# ============================================================
# SECTION 5: FEATURE SCALING
# ============================================================

# Standardize the features: statistics are learned from the training split
# only and then applied to both splits.
print(
    "Scaling features to standardized range...",
    "=" * 60,
    "Why scaling? SVM performs better when features are on similar scales.",
    "Random Forest doesn't require scaling, but we'll use it for fair comparison.\n",
    sep="\n",
)

# Learn mean/variance on the training split, then apply to train and test
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Feature scaling complete!")
print()

# Illustrate the effect using the first feature column
print("Scaling Effect (example: accel_x_mean):")
print("-" * 60)

first_feature_raw = X_train.iloc[:, 0]
first_feature_scaled = X_train_scaled[:, 0]
for heading, column in (
    ("Before scaling:", first_feature_raw),
    ("After scaling:", first_feature_scaled),
):
    print(heading)
    print(f"  Training mean: {column.mean():.4f}")
    print(f"  Training std:  {column.std():.4f}")
    print()

print("All features now have mean ≈ 0 and standard deviation ≈ 1")

print("\n✓ Ready to train models!")

Scaling features to standardized range...
Why scaling? SVM performs better when features are on similar scales.
Random Forest doesn't require scaling, but we'll use it for fair comparison.

✓ Feature scaling complete!

Scaling Effect (example: accel_x_mean):
------------------------------------------------------------
Before scaling:
  Training mean: -41.6513
  Training std:  4215.4587

After scaling:
  Training mean: 0.0000
  Training std:  1.0000

All features now have mean ≈ 0 and standard deviation ≈ 1

✓ Ready to train models!


In [8]:
# ============================================================
# SECTION 6: TRAIN RANDOM FOREST MODEL
# ============================================================

# Train a Random Forest on the scaled training split, predict on the scaled
# test split, and print a quick accuracy preview.
import time  # imported here so this cell runs on its own

print("Training Random Forest Classifier...")
print("=" * 60)

# Initialize Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100,        # 100 decision trees
    max_depth=None,          # No limit on tree depth
    min_samples_split=2,     # Minimum samples to split a node
    random_state=RANDOM_STATE,
    n_jobs=-1                # Use all CPU cores
)

# Echo the configuration from the model object so the printout cannot drift
# from the arguments actually passed above.
print("Model configuration:")
print(f"  - Number of trees: {rf_model.n_estimators}")
print(f"  - Max depth: {'Unlimited' if rf_model.max_depth is None else rf_model.max_depth}")
print(f"  - Random state: {RANDOM_STATE}")
print()

# Train the model
print("Training in progress...")
# perf_counter is monotonic, so the elapsed time is unaffected by system
# clock adjustments (unlike time.time()).
start_time = time.perf_counter()

rf_model.fit(X_train_scaled, y_train)

training_time = time.perf_counter() - start_time
print(f"✓ Training complete! Time: {training_time:.2f} seconds")
print()

# Make predictions (hard labels and per-class probabilities)
print("Making predictions on test set...")
y_pred_rf = rf_model.predict(X_test_scaled)
y_pred_proba_rf = rf_model.predict_proba(X_test_scaled)

print("✓ Predictions complete!")
print()

# Quick accuracy preview
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("=" * 60)
print("RANDOM FOREST - QUICK PREVIEW")
print("=" * 60)
print(f"Test Set Accuracy: {rf_accuracy*100:.2f}%")
print()
print("(Detailed evaluation in next sections)")

print("\n✓ Random Forest model trained successfully!")
print("✓ Ready to train SVM model!")

Training Random Forest Classifier...
Model configuration:
  - Number of trees: 100
  - Max depth: Unlimited
  - Random state: 42

Training in progress...
✓ Training complete! Time: 1.30 seconds

Making predictions on test set...
✓ Predictions complete!

RANDOM FOREST - QUICK PREVIEW
Test Set Accuracy: 93.64%

(Detailed evaluation in next sections)

✓ Random Forest model trained successfully!
✓ Ready to train SVM model!
