In [95]:
# Ensure the repository root is on sys.path so `from src...` works
import sys
from pathlib import Path

# Start from the current working directory; when it has no `src` folder
# (e.g. the kernel was started inside `notebooks`), use its parent instead.
p = Path().resolve()
if not (p / 'src').exists():
    p = p.parent

# Keep this cell idempotent: drop any existing occurrence of the root before
# inserting it, so re-running the cell never accumulates duplicate entries.
# The root still ends up first, i.e. it is searched before other locations.
repo_root_str = str(p)
sys.path[:] = [entry for entry in sys.path if entry != repo_root_str]
sys.path.insert(0, repo_root_str)
print('Added to sys.path:', repo_root_str)


Added to sys.path: C:\Users\rosar\physio-data-pipeline


In [96]:
# Ensure repo root is on sys.path
import sys
from pathlib import Path

# Locate the repo root: the working directory if it contains `src`,
# otherwise its parent (kernel started inside `notebooks`).
p = Path().resolve()
if not (p / 'src').exists():
    p = p.parent
# Insert only when missing so repeated runs do not add duplicate entries.
if str(p) not in sys.path:
    sys.path.insert(0, str(p))

# numpy is imported here with the other shared libraries because the plotting
# cells call np.linspace; without it a fresh-kernel run fails with NameError.
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend; must be selected before pyplot is imported
import matplotlib.pyplot as plt

from src.pipeline import add_metabolic_columns  # reuse your existing function

In [97]:
# Load the synthetic raw metabolic data
import os
from pathlib import Path

# Locate the project root from the kernel's working directory. The root is
# recognised by its `src` folder (the same marker the sys.path setup cells
# use) instead of a hard-coded folder name, so a checkout under a different
# directory name still resolves correctly.
# NOTE: despite its name, `notebook_dir` ends up holding the project root.
notebook_dir = Path(os.getcwd())
if not (notebook_dir / 'src').exists():
    # If we're in notebooks/, go up to the project root
    notebook_dir = notebook_dir.parent

data_path = notebook_dir / "data" / "sample_raw_metabolic_data.csv"
print(f"Looking for data at: {data_path}")
df = pd.read_csv(data_path)

# Add metabolic power (W and W/kg) using your pipeline function
df = add_metabolic_columns(df)

# Focus on running phase only to model running speed.
# .copy() gives an independent frame, so later edits never touch `df`.
df_run = df[df["phase"] == "run"].copy()

df_run.head()

Looking for data at: c:\Users\rosar\physio-data-pipeline\data\sample_raw_metabolic_data.csv


Unnamed: 0,subject_id,time_s,phase,VO2_ml_min,VCO2_ml_min,body_mass_kg,speed_m_per_s,metabolic_power_W,metabolic_power_W_kg
5,P01,150,run,1887,1642,71.2,3.597991,645.654732,9.068184
6,P01,180,run,1949,1717,71.2,3.597991,668.493168,9.388949
7,P01,210,run,2012,1758,71.2,3.597991,688.982599,9.676722
8,P01,240,run,2080,1809,71.2,3.597991,711.618813,9.994646
9,P01,270,run,2098,1838,71.2,3.597991,718.80699,10.095604


In [98]:
# Scatter plot by participant with color coding
import os
from pathlib import Path

import matplotlib
import numpy as np  # np.linspace is used below; import here so the cell runs on a fresh kernel

# Get repo root and outputs directory. The root is identified by its `src`
# folder rather than by a hard-coded directory name.
repo_root = Path(os.getcwd()).resolve()
if not (repo_root / 'src').exists():
    repo_root = repo_root.parent
output_dir = repo_root / "outputs"
output_dir.mkdir(exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 7))

# Define colors and participants: one evenly spaced tab20 color per subject
participants_list = sorted(df_run["subject_id"].unique())
colors = plt.cm.tab20(np.linspace(0, 1, len(participants_list)))

# One scatter series per participant so each gets its own legend entry
for idx, participant in enumerate(participants_list):
    p_data = df_run[df_run["subject_id"] == participant]
    ax.scatter(p_data["metabolic_power_W_kg"], p_data["speed_m_per_s"], 
               label=participant, s=80, alpha=0.7, color=colors[idx])

ax.set_xlabel("Metabolic power (W/kg)", fontsize=12, fontweight='bold')
ax.set_ylabel("Speed (m/s)", fontsize=12, fontweight='bold')
ax.set_title("Speed vs Metabolic Power by Participant (Run Phase)", fontsize=13, fontweight='bold')
# Legend is anchored outside the axes (to the right) so it does not cover points
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, fontsize=9)
ax.grid(True, alpha=0.3)

# Save the plot
plot1_path = output_dir / "speed_vs_metabolic_power_scatter.png"
plt.tight_layout()
plt.savefig(str(plot1_path), dpi=300, bbox_inches='tight')
print(f"✓ Saved scatter plot to: {plot1_path}")

# Under the non-interactive Agg backend plt.show() cannot display anything and
# only emits a warning, so release the figure instead; with any other backend
# keep the normal display call.
if matplotlib.get_backend().lower() == "agg":
    plt.close(fig)
else:
    plt.show()

✓ Saved scatter plot to: C:\Users\rosar\physio-data-pipeline\outputs\speed_vs_metabolic_power_scatter.png


  plt.show()


In [99]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Design matrix (one feature column, kept 2D) and target vector, both cast to float
X = df_run.loc[:, ["metabolic_power_W_kg"]].to_numpy(dtype=float)
y = df_run.loc[:, "speed_m_per_s"].to_numpy(dtype=float)

# Fit speed as a linear function of mass-normalised metabolic power
model = LinearRegression().fit(X, y)

# In-sample predictions and R² (scored on the same rows used for fitting)
y_pred = model.predict(X)
r2 = model.score(X, y)

# Single-feature model: the slope is the only entry of coef_
coef, intercept = model.coef_[0], model.intercept_

report_lines = [
    f"Coefficient (slope): {coef:.4f} m/s per W/kg",
    f"Intercept: {intercept:.4f} m/s",
    f"R²: {r2:.4f}",
]
print("\n".join(report_lines))

Coefficient (slope): 0.1304 m/s per W/kg
Intercept: 2.0165 m/s
R²: 0.1590


In [100]:
# Regression plot: per-participant scatter with the fitted line overlaid
import os
from pathlib import Path

import matplotlib

# Get repo root and outputs directory. The root is identified by its `src`
# folder rather than by a hard-coded directory name.
repo_root = Path(os.getcwd()).resolve()
if not (repo_root / 'src').exists():
    repo_root = repo_root.parent
output_dir = repo_root / "outputs"
output_dir.mkdir(exist_ok=True)

# Sort by X for a clean regression line (points are drawn left to right)
sort_idx = np.argsort(X[:, 0])
X_sorted = X[sort_idx]
y_pred_sorted = y_pred[sort_idx]

fig, ax = plt.subplots(figsize=(12, 7))

# Define colors and participants: one evenly spaced tab20 color per subject
participants_list = sorted(df_run["subject_id"].unique())
colors = plt.cm.tab20(np.linspace(0, 1, len(participants_list)))

# Plot each participant with different color
for idx, participant in enumerate(participants_list):
    p_data = df_run[df_run["subject_id"] == participant]
    ax.scatter(p_data["metabolic_power_W_kg"], p_data["speed_m_per_s"], 
               label=participant, s=80, alpha=0.7, color=colors[idx])

# Add regression line
ax.plot(X_sorted, y_pred_sorted, 'k-', label='Regression line', linewidth=2.5)

ax.set_xlabel("Metabolic power (W/kg)", fontsize=12, fontweight='bold')
ax.set_ylabel("Speed (m/s)", fontsize=12, fontweight='bold')
ax.set_title(f"Linear Regression: Speed ~ Metabolic Power (R² = {r2:.3f})", fontsize=13, fontweight='bold')
# Legend is anchored outside the axes (to the right) so it does not cover points
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, fontsize=9)
ax.grid(True, alpha=0.3)

# Save the plot
plot2_path = output_dir / "speed_vs_metabolic_power_regression.png"
plt.tight_layout()
plt.savefig(str(plot2_path), dpi=300, bbox_inches='tight')
print(f"✓ Saved regression plot to: {plot2_path}")

# Under the non-interactive Agg backend plt.show() cannot display anything and
# only emits a warning, so release the figure instead; with any other backend
# keep the normal display call.
if matplotlib.get_backend().lower() == "agg":
    plt.close(fig)
else:
    plt.show()

✓ Saved regression plot to: C:\Users\rosar\physio-data-pipeline\outputs\speed_vs_metabolic_power_regression.png


  plt.show()


In [101]:
# Per-participant means over the run phase: one output row per subject_id
summary_spec = {
    "mean_power_Wkg": ("metabolic_power_W_kg", "mean"),
    "mean_speed": ("speed_m_per_s", "mean"),
}
per_subject = df_run.groupby("subject_id")
group_summary = per_subject.agg(**summary_spec).reset_index()

# Persist the table next to the figures (output_dir is set by the plotting cells)
summary_path = output_dir / "speed_metabolic_power_summary.csv"
group_summary.to_csv(summary_path, index=False)
print(f"Saved summary to: {summary_path}\n")

group_summary

Saved summary to: C:\Users\rosar\physio-data-pipeline\outputs\speed_metabolic_power_summary.csv



Unnamed: 0,subject_id,mean_power_Wkg,mean_speed
0,P01,9.778257,3.597991
1,P02,8.552408,3.388622
2,P03,8.797922,3.39685
3,P04,9.93452,3.794655
4,P05,8.478738,3.115574
5,P06,8.789396,2.679798
6,P07,9.166296,2.856456
7,P08,8.171584,2.886912
8,P09,12.068444,3.323101
9,P10,8.610935,2.921159


In [102]:
# Generate expanded dataset with 15 participants
import os

# Sampling schedule: one row every 30 s from 0 to 300 s.
# The first 5 samples are labelled rest, the remaining 6 are labelled run.
time_points = list(range(0, 301, 30))
n_rest_samples = 5
phases = ['rest'] * n_rest_samples + ['run'] * (len(time_points) - n_rest_samples)

np.random.seed(42)

participants_data = []
for pid in range(1, 16):
    subject_id = f"P{pid:02d}"

    # Per-participant draws. The order of these random calls is part of the
    # seeded sequence, so it must not be rearranged.
    body_mass = np.random.uniform(60, 90)           # kg
    rest_vo2_base = np.random.uniform(300, 400)     # baseline VO2 at rest
    run_speed = np.random.uniform(2.5, 4.0)         # constant speed during the run
    run_vo2_base = np.random.uniform(1500, 2200)    # VO2 at the start of the run

    for t_idx, (time_s, phase) in enumerate(zip(time_points, phases)):
        if phase != 'rest':
            # Running phase: VO2 drifts upward with time into the run, plus noise
            time_into_run = (t_idx - n_rest_samples) * 30  # seconds into run
            vo2 = run_vo2_base + (time_into_run / 180) * 300 + np.random.uniform(-50, 50)
            vco2 = vo2 * 0.88 + np.random.uniform(-30, 30)
            speed = run_speed
        else:
            # Resting phase: small variations around the baseline, no movement
            vo2 = rest_vo2_base + np.random.uniform(-30, 30)
            vco2 = vo2 * 0.85 + np.random.uniform(-20, 20)
            speed = 0

        record = {
            'subject_id': subject_id,
            'time_s': time_s,
            'phase': phase,
            'VO2_ml_min': int(round(vo2)),
            'VCO2_ml_min': int(round(vco2)),
            'body_mass_kg': round(body_mass, 1),
            'speed_m_per_s': speed
        }
        participants_data.append(record)

# Build the frame once from the collected records and write it to the data folder.
# NOTE: this is the same file the loading cell reads, so it is overwritten here.
expanded_df = pd.DataFrame(participants_data)
data_save_path = notebook_dir / "data" / "sample_raw_metabolic_data.csv"
expanded_df.to_csv(data_save_path, index=False)

subject_ids = sorted(expanded_df['subject_id'].unique())
print(f"✓ Created expanded dataset with {len(expanded_df)} rows")
print(f"✓ Participants: {subject_ids}")
print(f"✓ Saved to: {data_save_path}\n")
print("First 15 rows:")
print(expanded_df.head(15))

✓ Created expanded dataset with 165 rows
✓ Participants: ['P01', 'P02', 'P03', 'P04', 'P05', 'P06', 'P07', 'P08', 'P09', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15']
✓ Saved to: c:\Users\rosar\physio-data-pipeline\data\sample_raw_metabolic_data.csv

First 15 rows:
   subject_id  time_s phase  VO2_ml_min  VCO2_ml_min  body_mass_kg  \
0         P01       0  rest         374          305          71.2   
1         P01      30  rest         369          328          71.2   
2         P01      60  rest         401          349          71.2   
3         P01      90  rest         366          330          71.2   
4         P01     120  rest         415          341          71.2   
5         P01     150   run        1887         1642          71.2   
6         P01     180   run        1949         1717          71.2   
7         P01     210   run        2012         1758          71.2   
8         P01     240   run        2080         1809          71.2   
9         P01     270   run        209