# Data Analysis
This notebook is used to analyze the data collected from live runs. It includes various visualizations and statistical analyses to understand the performance of the runner

## TOC:

In [60]:
from utilis.helper import extract_global_json, extract_json
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from statsmodels.api import OLS, add_constant
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

## Try running time series cross-validation with external regressors

In [62]:
def external_factor_importance(external_parameters: list, constant_parameters: list, target_columns: list):

    output_folder = extract_global_json('output_folder')
    # Prepare features including external factors
    all_features = []
    all_targets = []
    
    # loop through each dataset
    for folder_name in os.listdir(output_folder):
        csv_file_path = os.path.join(output_folder, folder_name, f"{folder_name}_streams.csv")
        json_file_path = os.path.join(output_folder, folder_name, f"{folder_name}_overall.json")
        # get the csv_file into a DataFrame
        df = pd.read_csv(csv_file_path)
        overall_data = extract_json(json_file_path)

        # remove rows with NaN values in the target column
        df = df.dropna()
        features = df[external_parameters].values
        targets = df[target_columns].values

        # Flatten weather dict if present
        flat = overall_data.copy()
        if "weather" in flat and isinstance(flat["weather"], dict):
            for k, v in flat["weather"].items():
                flat[f"{k}"] = v
            del flat["weather"]

        external_factors = np.tile(
            [flat[param] for param in constant_parameters],
            (len(df), 1)
        )
        # external_factors = np.full((len(df), 2), [overall_data['weather']['temp'], overall_data['weather']['humidity']])
        
        all_features.append(np.hstack([features, external_factors]))
        all_targets.append(targets)
    
    x = np.vstack(all_features)
    y = np.vstack(all_targets)
    # y = np.hstack(all_targets)
    
    # Feature names
    feature_names = external_parameters + constant_parameters

    # make them into a pandas DataFrame
    df_x = pd.DataFrame(x, columns=feature_names)
    df_y = pd.DataFrame(y, columns=target_columns)


    return df_x, df_y

df_x, df_y = external_factor_importance(
    external_parameters=["distance_m", "pace_efficiency", "heartrate_bpm", "velocity_mps", "altitude_m", "diff_altitude_m", "headwind_mps", "crosswind_mps", "grade_percent"],
    constant_parameters=["temp", "humidity", "pressure", "uvindex", "calories"],
    # target_column=["diff_heartrate_bpm"])
    # target_column=["diff_heartrate_shift_bpm"])
    # target_column=["acceleration_mps2"])
    # target_column=["acceleration_shift_mps2"])
    target_columns=["diff_heartrate_bpm", "acceleration_mps2"])


### Random Forest Regressor

In [63]:
# Create multi-output regressor
rf_multi = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
rf_multi.fit(df_x, df_y)

# Extract feature importance for each target
importance_results = {}

for i, target in enumerate(df_y.columns):
    importance_df = pd.DataFrame({
        'feature': df_x.columns,
        'importance': rf_multi.estimators_[i].feature_importances_
    }).sort_values('importance', ascending=False)
    
    importance_results[target] = importance_df
    
    print(f"\nFeature importance for {target}:")
    print(importance_df.head(10))


Feature importance for diff_heartrate_bpm:
            feature  importance
0        distance_m    0.276095
3      velocity_mps    0.204026
2     heartrate_bpm    0.155571
1   pace_efficiency    0.071164
7     crosswind_mps    0.065932
6      headwind_mps    0.058138
8     grade_percent    0.055910
4        altitude_m    0.048129
5   diff_altitude_m    0.033502
10         humidity    0.008851

Feature importance for acceleration_mps2:
            feature  importance
3      velocity_mps    0.197300
0        distance_m    0.132961
1   pace_efficiency    0.120041
6      headwind_mps    0.106561
2     heartrate_bpm    0.096597
7     crosswind_mps    0.091653
8     grade_percent    0.085426
4        altitude_m    0.067166
5   diff_altitude_m    0.053421
13         calories    0.015261


### Multivariate Regression

In [66]:
for i, target in enumerate(df_y.columns):
    print(f"{'='*10} OLS Regression Analysis for: {target} {'='*10} \n")
    
    # # Extract target variable
    # y_target = y[:, i]
    y = df_y[target]
    # Add constant for intercept
    df_x_const = add_constant(df_x)
    # feature_names_with_const = ['const'] + feature_names
    
    # Fit OLS model
    model = OLS(y, df_x_const).fit()

    # Print summary
    print(model.summary())


                            OLS Regression Results                            
Dep. Variable:     diff_heartrate_bpm   R-squared:                       0.328
Model:                            OLS   Adj. R-squared:                  0.327
Method:                 Least Squares   F-statistic:                     228.3
Date:                Fri, 08 Aug 2025   Prob (F-statistic):               0.00
Time:                        11:39:26   Log-Likelihood:                -5010.6
No. Observations:                6557   AIC:                         1.005e+04
Df Residuals:                    6542   BIC:                         1.015e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -29.1256      4.729    