# Access Link:
https://drive.google.com/file/d/1hXFk8uDAmY6dfweGgB6mLwdN8anpSKY5/view?usp=sharing 

# Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Train

## Data Collection

### Kaggle Dataset

In [2]:
# Build the Kaggle cycling VO2 frame: one CSV per (subject, protocol), stacked
# into `data`, then renamed to the notebook's common column names.

# Common URL prefix of the per-subject CSV files
KAGGLE_BASE_URL = "https://raw.githubusercontent.com/amaye15/Data/main/data_int/sbj_"

# (file suffix, protocol label), in the order the files are stacked per subject
KAGGLE_PROTOCOLS = [("I", "protocol 1"),
                    ("II", "protocol 2"),
                    ("Wingate", "wingate"),
                    ("incremental", "incremental")]

# Columns the combined frame always exposes, in this order
kaggle_columns = ["time", "Power", "Oxygen", "Cadence", "HR", "RF", "subject", "protocol"]

# Subjects 1-7 (the upper bound of range() is exclusive)
# NOTE(review): the original comment said "subject 1-8" - confirm whether a
# subject 8 exists and should be loaded.
frames = []
for i in range(1, 8):
  for suffix, label in KAGGLE_PROTOCOLS:
    # Only subject 7 contributes Wingate data. The original cell downloaded the
    # Wingate file for the other subjects but never appended it, so the
    # download is skipped here; the resulting data is the same.
    # NOTE(review): confirm that leaving Wingate out for subjects 1-6 is intended.
    if label == "wingate" and i != 7:
      continue

    # Get data, add subject number and protocol label
    df = pd.read_csv(KAGGLE_BASE_URL + str(i) + "_" + suffix + ".csv")
    df["subject"] = i
    df["protocol"] = label
    frames.append(df)

# Stack once instead of DataFrame.append (removed in pandas 2.0).
# Row indexes are kept as read from each file (no ignore_index): the pivot in
# the preprocessing cell uses the existing index.
data = pd.concat(frames)

# Keep the expected columns first (created as NaN if a file lacks one),
# followed by any extra columns found in the files
data = data.reindex(columns = kaggle_columns + [col for col in data.columns if col not in kaggle_columns])

# Rename Columns
data = data.rename(columns = {"Power": "power",
                              "Oxygen": "oxygen",
                              "Cadence": "cadence",
                              "HR": "heart_rate",
                              "RF": "breathing_rate",
                              "protocol": "method"})

# Add dataset ID
data["dataset"] = "kaggle_cycling_vo2"


### University of Malaga Dataset

In [3]:
# Load the University of Malaga data, align its column names with the frame
# built so far, and stack it under `data`.
# Note: re-running this cell without re-running the previous one stacks the
# Malaga rows a second time.

# Get data
df = pd.read_csv("https://raw.githubusercontent.com/redbackoperations/data-analysis/main/Algorithm%20Prototype%20Development/LSTM%20research/Data/Malaga_Data.csv")

# Clean method label
  # Remove every "<digit>_" occurrence from ID_test and prefix with "method "
df["ID_test"] = "method " + df["ID_test"].str.replace(r"\d_", "", regex=True)

# Rename Columns
df = df.rename(columns = {"Speed": "speed",
                          "HR": "heart_rate",
                          "VO2": "oxygen",
                          "VCO2": "carbon_dioxide",
                          "RR": "breathing_rate",
                          "ID_test": "method",
                          "ID": "subject",
                          "VE": "pulmonary_ventilation"})

# Add dataset ID
df["dataset"] = "university_of_malaga"

# Stack under the existing data
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
  # Row indexes are kept as-is (no ignore_index), as append did
data = pd.concat([data, df])

# Delete df to avoid ram issues
del df

## Preprocessing

In [4]:
# Turn the stacked frame into model inputs:
#   x_v4             - [standardised heart rate, standardised time index] per sample
#   y_v4             - standardised oxygen per sample
#   tracking_indexes - position of the source series (in the filtered list) per sample

# Keep only the necessary columns
  # Drop rows with a missing oxygen value
data = data[["subject", "method", "heart_rate", "oxygen"]].dropna(subset = ["oxygen"])

# Pivot heart rate values
  # One column per (subject, method); rows follow the existing index
x_df = data[["subject", "method", "heart_rate"]].pivot(columns = ["subject", "method"], values = "heart_rate")
# Pivot oxygen values
  # One column per (subject, method); rows follow the existing index
y_df = data[["subject", "method", "oxygen"]].pivot(columns = ["subject", "method"], values = "oxygen")

# For each (subject, method) column
  # Drop the missing values
    # x - remaining heart rate values
    # y - remaining oxygen values
x = [x_df[idx].dropna().values for idx in x_df.columns]
y = [y_df[idx].dropna().values for idx in y_df.columns]

# Delete x_df & y_df to avoid ram issues
del x_df, y_df

# Note:
#   - Some series still have a different number of heart rate and oxygen values
#     (rows are only dropped on missing oxygen, so missing heart rates remain).
#   - Those series, and series with no values at all, are filtered out here.

# Keep a series only if both lists have the same length AND are not empty
  # The previous condition used `or len(x_list) == 0`, which kept empty series
  # StandardScaler raises on an array with 0 samples
kept_pairs = [(x_list, y_list) for x_list, y_list in zip(x, y) if len(x_list) == len(y_list) and len(x_list) > 0]
x_v2 = [pair[0] for pair in kept_pairs]
y_v2 = [pair[1] for pair in kept_pairs]

# Generate time indexes (0, 1, 2, ...) for each series
indexes = [np.arange(len(list_)) for list_ in x_v2]

# Standardise time indexes per series
  # StandardScaler: zero mean, unit variance (this is not Min-Max / 0-1 scaling)
indexes_scaled = [StandardScaler().fit_transform(idx_array.reshape(-1, 1)).flatten() for idx_array in indexes]

# Standardise heart rate values per series
x_v3 = [StandardScaler().fit_transform(x_array.reshape(-1, 1)).flatten() for x_array in x_v2]

# Standardise oxygen values per series
y_v3 = [StandardScaler().fit_transform(y_array.reshape(-1, 1)).flatten() for y_array in y_v2]

# Transform into list of lists
  # Each inner list contains two values
    # First value: standardised heart rate
    # Second value: standardised time index
x_v4 = [[x_val, idx_val] for x_list, idx_list in zip(x_v3, indexes_scaled) for x_val, idx_val in zip(x_list, idx_list)]

# Flatten the per-series oxygen values into one list
y_v4 = [y_val for y_list in y_v3 for y_val in y_list]

# For every sample, the position of its series in y_v3
  # This is a position in the filtered list, not the original subject number
tracking_indexes = [idx for idx, y_list in enumerate(y_v3) for y_val in y_list]

## Model Training

In [5]:
# Hold out 30% of the samples for testing (shuffled, fixed seed)
  # The series positions are split alongside the features and targets
    # so each test sample can be traced back to its series when plotting
split_parts = train_test_split(x_v4,
                               y_v4,
                               tracking_indexes,
                               test_size = 0.3,
                               shuffle = True,
                               random_state = 42)
x_train, x_test, y_train, y_test, idx_train, idx_test = split_parts

# Random forest with 100 trees, all cores, fixed seed
  # fit() returns the fitted estimator, so build and train in one statement
RFR = RandomForestRegressor(n_estimators = 100,
                            random_state = 42,
                            n_jobs = -1).fit(x_train, y_train)

# R2 on the held-out samples (shown as the cell output)
RFR.score(x_test, y_test)

0.9484252664757481

## Grid search - Optional

In [None]:
# Optional hyper-parameter search over the random forest
  # Switch the flag to True to run it; it is skipped by default
run_grid_search = False

if run_grid_search:
  # Candidate values per parameter
    # Each list currently holds one value, so a single combination is evaluated
  parameter_grid = {"max_depth":    [None],
                    "n_estimators": [100],
                    "random_state": [42],
                    "max_features": ["sqrt"],
                    "bootstrap":[True]}

  GS = GridSearchCV(RFR, parameter_grid, verbose = 3)

  GS.fit(x_train, y_train)

  # An expression inside an `if` body is not echoed by the notebook,
  # so keep the score in a variable and print it
  grid_search_score = GS.score(x_test, y_test)
  print(f"Grid search R2 on test data: {grid_search_score}")

## Visualisation - Top Ten

In [24]:
# Series positions with the most samples in the test split (at most ten)
  # value_counts() sorts by count, largest first
series_counts = pd.Series(idx_test).value_counts()
ten_ts = series_counts.head(10).index.tolist()

# Second feature of every test sample is its scaled time index
time = [features[1] for features in x_test]

# Model predictions for the test samples
predicted = RFR.predict(x_test)

# One row per test sample: series position, time, actual and predicted oxygen
x_test_df = pd.DataFrame({"idx": idx_test,
                          "time": time,
                          "y": y_test,
                          "y_hat": predicted})

In [43]:
# Plot actual vs predicted oxygen for the series listed in ten_ts,
# one subplot per series on a rows x columns grid.

# Define rows & columns
rows = 5
columns = 2

# Define outline of subplots
  # Titles use the series ids in ten_ts
    # These are positions in the filtered series list, not raw subject numbers
fig = make_subplots(rows = rows,
                    cols = columns,
                    subplot_titles = [f"Subject {subject}" for subject in ten_ts])

# Position in ten_ts of the series drawn in the current subplot
subject_idx = 0

# Loop over rows
  # Loop over columns
for idx_1 in range(1, rows + 1):
  for idx_2 in range(1, columns + 1):
    # Stop once every series in ten_ts has been drawn
      # ten_ts can hold fewer than rows * columns entries
    if subject_idx >= len(ten_ts):
      break

    # Plot the series named in this subplot's title
      # Previously the loop plotted series 0, 1, 2, ... regardless of ten_ts
    series_id = ten_ts[subject_idx]

    # Show the legend entries only for the first subplot
      # `and` is required: `==` binds looser than `&`, so the old
      # `idx_1 == 1 & idx_2 == 1` was compared against (1 & idx_2)
    show_legend = (idx_1 == 1) and (idx_2 == 1)

    subject_df = x_test_df[x_test_df["idx"] == series_id].sort_values("time").copy()
    fig.add_trace(go.Scatter(x = subject_df.time.values,
                             y = subject_df.y.values,
                             name = 'Oxygen',
                             line = dict(color='blue'),
                             legendgroup = 'Oxygen',
                             showlegend = show_legend),
                  row=idx_1,
                  col=idx_2)
    fig.add_trace(go.Scatter(x = subject_df.time.values,
                             y = subject_df.y_hat.values,
                             name = 'Predicted Oxygen',
                             line = dict(color='red'),
                             legendgroup = 'Predicted Oxygen',
                             showlegend = show_legend),
                  row=idx_1,
                  col=idx_2)

    # Move on to the next series
    subject_idx += 1

fig.update_layout(height = 600,
                  width = 1200,
                  title_text = f"Oxygen Consumption: R2 = {round(RFR.score(x_test, y_test), 4)}",
                  title_x = 0.5)
fig.show()

## Visualisation - Random Forest (Not Working)

In [18]:
# Render the first tree of the fitted random forest as a graph.
# Needs the `graphviz` Python package and the Graphviz binaries on the machine.
import graphviz
from IPython import display
from sklearn import tree

# Number of tree levels to draw
  # The forest is trained without a depth limit, so a full tree can be very large
  # Set to None to export the whole tree
TREE_PLOT_MAX_DEPTH = 3

# Export the tree in DOT format
  # Feature order matches the training rows: [heart rate, time index]
dot_data = tree.export_graphviz(RFR.estimators_[0],
                                out_file=None,
                                max_depth=TREE_PLOT_MAX_DEPTH,
                                feature_names=["heart_rate", "time"],
                                class_names=None,
                                filled=True,
                                rounded=True,
                                special_characters=True)

# Wrap the DOT text so the notebook renders a graph
  # Displaying dot_data directly only shows the raw DOT string
graph = graphviz.Source(dot_data)
display.display(graph)

# Production (to be completed)