In [None]:
import os
import kagglehub

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')

from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Load the heart-rate CSV; the 'Time' column is parsed into datetimes while reading.
# NOTE(review): the path is relative to the working directory — confirm the file is there
# (kagglehub is imported above but not used to fetch it).
df = pd.read_csv('heartrate_seconds_merged.csv', parse_dates=['Time'])

In [None]:
# Summary statistics of the raw frame, as a quick sanity check after loading.
df.describe()

In [None]:
# Number of distinct values in the 'Id' column.
df['Id'].nunique()

In [None]:
# Count number of entries for each Id (value_counts orders the result by count, largest first)
id_counts = df['Id'].value_counts()
print(id_counts)

# For Id=4020332650

In [None]:
# Keep only the 'Time' and 'Value' columns of the rows whose Id is 4020332650.
# NOTE(review): the name df_max_id suggests this is the Id with the most rows in id_counts — confirm.
df_max_id = df[df['Id'] == 4020332650][['Time', 'Value']]
df_max_id.head()

In [None]:
# Index the single-Id frame by timestamp and put it in chronological order.
# NOTE: this rebinds `df`; the frame loaded from the CSV is no longer reachable under that name.
df = df_max_id.set_index('Time')
df.index = pd.to_datetime(df.index)
df = df.sort_index()

In [None]:
# Summary statistics of 'Value' for the selected Id only.
df.describe()

In [None]:
# Resample to 1-minute intervals, taking the mean value for each minute.
# 'min' is the supported spelling of the minute frequency; the 'T' alias is
# deprecated in recent pandas releases.
df_min = df.resample('min').mean(numeric_only=True)

# Check the result
df_min.head()

In [None]:
# Plot the per-minute means as points over time; minutes without readings are NaN
# after resampling and therefore appear as gaps.
df_min.plot(style='.',
        figsize=(15, 5),
        color=color_pal[0],
        title='HR of first user over time')
plt.show()

In [None]:
# Fill missing values by interpolation (pandas' default interpolation method)
df_min_filled = df_min['Value'].interpolate()

# Or, to simply drop missing values (may affect regularity)
# df_min_filled = df_min['Value'].dropna()

# Additive decomposition with a period of 60*24 rows, i.e. one day of 1-minute samples.
result = seasonal_decompose(df_min_filled, model='additive', period=60*24)
result.plot()
plt.show()

# Filling missing values with the average value for that minute across all days.

In [None]:
# Make sure 'Time' is the index, is datetime type, and is in chronological order
df_sorted = df_max_id.set_index('Time')
df_sorted.index = pd.to_datetime(df_sorted.index)
df_sorted = df_sorted.sort_index()

# Resample to 1-minute intervals (creates missing minutes as NaN).
# 'min' replaces the deprecated 'T' alias.
df_min = df_sorted.resample('min').mean(numeric_only=True)

# Extract minute of day (0..1439) for each timestamp
df_min['minute_of_day'] = df_min.index.hour * 60 + df_min.index.minute

# Compute average value for each minute of the day (across all days)
minute_avg = df_min.groupby('minute_of_day')['Value'].mean()

# Fill missing values with the average for that minute of the day.
# Vectorised: look up each row's typical value once and let fillna use it only where
# 'Value' is NaN, instead of calling a Python lambda for every row.
typical_value = df_min['minute_of_day'].map(minute_avg)
df_min['Value'] = df_min['Value'].fillna(typical_value)

# df_min['Value'] is now filled wherever that minute of the day was observed on at least
# one day; a minute that is never observed stays NaN.

In [None]:
# Plot the 'Value' column of df_min as points over time, with labelled axes.
df_min['Value'].plot(
    style='.',
    figsize=(15, 5),
    color=color_pal[0],
    title='HR of first user over time'
)
plt.xlabel('Time')
plt.ylabel('Heart Rate')
plt.show()

In [None]:
# Fill missing values by interpolation; this only affects NaNs still present in df_min['Value'].
df_min_filled = df_min['Value'].interpolate()

# Or, to simply drop missing values (may affect regularity)
# df_min_filled = df_min['Value'].dropna()

# Additive decomposition with a period of 60*12 rows (12 hours of 1-minute samples).
# NOTE(review): the earlier decomposition used 60*24 — confirm the 12-hour period is intended.
result = seasonal_decompose(df_min_filled, model='additive', period=60*12)
result.plot()
plt.show()

In [None]:
# First rows of the filled per-minute series.
df_min_filled.head()

# Train/Test Split

In [None]:
# Single source of truth for the split point, so the frames and the marker line cannot drift apart.
SPLIT_DATE = '2016-04-10'

df = df_min_filled.to_frame()
train = df.loc[df.index < SPLIT_DATE]    # everything strictly before the split date
test = df.loc[df.index >= SPLIT_DATE]    # the split date and everything after it

# No plt.clf() here: with no figure open it would only create an extra, empty figure.
fig, ax = plt.subplots(figsize=(15, 5))
train.plot(ax=ax, label='Training Set', title='Data Train/Test Split')
test.plot(ax=ax, label='Test Set')
# Mark the actual split point (previously drawn at an unrelated 2015 date).
ax.axvline(SPLIT_DATE, color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()

## **Feature Engineering**

In [None]:
def create_features(df):
    """
    Create calendar, lag and rolling-mean features from a time-indexed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex and a 'Value' column. Lags and windows are
        expressed in rows, so they correspond to the minute counts named in the
        comments only when consecutive rows are exactly one minute apart.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the feature columns added; the input is not modified.

    Notes
    -----
    The trailing rolling means are computed on the series shifted by one row, so
    the feature of a row never contains that row's own 'Value' (the prediction
    target). Without the shift the target leaks into its own features, and the
    features are NaN for any row whose 'Value' is still unknown.
    """
    df = df.copy()

    # Basic time-based features
    df['minute'] = df.index.minute
    df['hour'] = df.index.hour
    df['day'] = df.index.day
    df['dayofweek'] = df.index.dayofweek
    df['month'] = df.index.month

    # Lag features
    df['lag_1minute'] = df['Value'].shift(1)  # 1 minute lag
    df['lag_1h'] = df['Value'].shift(60)   # 1 hour lag
    df['lag_1d'] = df['Value'].shift(1440)  # 1 day lag
    df['lag_1w'] = df['Value'].shift(10080) # 1 week lag

    # Rolling statistics features, built strictly from rows before the current one
    past_values = df['Value'].shift(1)
    df['rolling_mean_30minutes'] = past_values.rolling(window=30).mean()  # Previous 30 minutes
    df['rolling_mean_3hours'] = past_values.rolling(window=180).mean()  # Previous 3 hours
    df['rolling_mean_3days'] = past_values.rolling(window=4320).mean()  # Previous 3 days
    # 30-row window ending exactly one day before the current row
    df['rolling_mean_same_hour_last_day'] = df['Value'].shift(1440).rolling(window=30).mean()
    # 7-row window ending exactly one week before the current row
    # NOTE(review): the window is 7 rows here but 30 rows for the previous-day feature — confirm.
    df['rolling_mean_same_hour_last_week'] = df['Value'].shift(10080).rolling(window=7).mean()

    return df

# Build the feature frame for the whole filled series; this rebinds `df` to the featured frame.
df = create_features(df_min_filled.to_frame())

In [None]:
# Call head() — the bare attribute `df.head` only displays the bound method, not the rows.
df.head()

## **Visualization Feature/Target Relationship**

In [None]:
# Distribution of heart rate for each hour of the day.
# No plt.clf() before plt.subplots(): with no open figure it would just create an empty extra figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, x='hour', y='Value', ax=ax)
ax.set_title('Heart Rate by Hour')
plt.show()

In [None]:
# Distribution of heart rate for each day of the week (pandas dayofweek: 0 = Monday).
# No plt.clf() before plt.subplots(): with no open figure it would just create an empty extra figure.
fig, ax = plt.subplots(figsize=(10, 8))
# Pass ax explicitly, as the hour plot does, instead of relying on the implicit current axes.
sns.boxplot(data=df, x='dayofweek', y='Value', palette='Blues', ax=ax)
ax.set_title('Heart Rate by Day of Week')
plt.show()

## **Preparing Data For Modelling**

In [None]:
TARGET = 'Value'


FEATURES_XGB = [
    'hour', 'dayofweek', 'month', 'minute', 'day','lag_1minute', 'lag_1h', 'lag_1d', 'lag_1w',
    'rolling_mean_30minutes', 'rolling_mean_3hours', 'rolling_mean_3days', 'rolling_mean_same_hour_last_day', 'rolling_mean_same_hour_last_week'
]

# Build the features once on the re-joined series and then split again on the original
# indices. Calling create_features on train and test separately makes every lag/rolling
# feature at the start of the test period NaN, because the history before the split is
# not visible to the test frame. Only past values are used, so this does not leak the
# test targets into training.
train_index, test_index = train.index, test.index
full_series = pd.concat([train[[TARGET]], test[[TARGET]]]).sort_index()
full_features = create_features(full_series)

train = full_features.loc[train_index]
test = full_features.loc[test_index]

In [None]:
# XGBoost inputs: feature matrix and target vector for each side of the split.
X_train_xgb, y_train_xgb = train[FEATURES_XGB], train[TARGET]
X_test_xgb, y_test_xgb = test[FEATURES_XGB], test[TARGET]

# **XGBoost**

In [None]:
# Wrap both splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train_xgb, label=y_train_xgb)
dtest = xgb.DMatrix(X_test_xgb, label=y_test_xgb)

# Booster hyper-parameters
params = dict(
    objective='reg:squarederror',  # squared-error regression
    eval_metric='rmse',            # metric reported for the watchlist
    max_depth=3,                   # depth of each tree
    learning_rate=0.01,            # shrinkage per boosting round
    colsample_bytree=0.8,          # fraction of features sampled per tree
    subsample=0.8,                 # fraction of rows sampled per tree
)

# Training schedule
num_round = 1000             # upper bound on boosting rounds
early_stopping_rounds = 50   # rounds without improvement before stopping

# Sets whose metric is printed during training.
# NOTE(review): early stopping watches the last entry, which is the test set here —
# consider holding out a separate validation slice so the test score stays unbiased.
watchlist = [(dtrain, 'train'), (dtest, 'eval')]

# Train the model, logging every 100 rounds
reg = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=100,
)

# Predictions on the test set (scored in the evaluation cell further down)
y_pred_xgb = reg.predict(dtest)

In [None]:
# Attach the test-set predictions to the full frame exactly once; the membership check
# keeps a re-run of this cell from joining a second 'prediction' column.
if 'prediction' not in df.columns:
    test['prediction'] = reg.predict(dtest)
    df = df.join(test[['prediction']], how='left')

# Actual series as a line, predictions as orange points on the same axes
ax = df[['Value']].plot(figsize=(15, 5))
df['prediction'].plot(ax=ax, style='.', color='orange')

# Title, axis labels and legend
ax.set_title('Actual Data vs Predictions')
ax.set_xlabel('Time')
ax.set_ylabel('Heart Rate')
ax.legend(['Actual Data', 'Predictions'])

plt.show()

In [None]:
# @title **Evaluating XGBoost**
# Score the test-set predictions: RMSE is the square root of the mean squared error.
mae_xgb = mean_absolute_error(y_test_xgb, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test_xgb, y_pred_xgb))

for metric_name, metric_value in (("RMSE", rmse_xgb), ("MAE", mae_xgb)):
    print(f"XGBoost - {metric_name}: {metric_value}")

# Making a prediction

In [None]:
# 1. Define the target time
target_time = pd.to_datetime('2016-04-13 01:03:00')

# 2. Build a regular 1-minute index that reaches the target time.
#    Re-indexing (rather than concatenating a new row) cannot create a duplicate
#    timestamp when target_time already exists in df, and it keeps rows one minute
#    apart so the row-based lags in create_features keep their meaning.
full_index = pd.date_range(df.index.min(), max(df.index.max(), target_time), freq='min')
# The union only adds a row when target_time is not on the minute grid.
full_index = full_index.union(pd.DatetimeIndex([target_time]))

# 3. Keep only the raw series; every feature is recomputed from it in step 4.
#    Minutes that are not in df (including a future target_time) become NaN.
df_with_row = df[['Value']].reindex(full_index)

# 4. Create features for every row, including the target row (uses previous data for lags/rolling)
df_with_row = create_features(df_with_row)

# 5. Select the row to predict
X_row = df_with_row.loc[[target_time], FEATURES_XGB]

# 6. Predict using the trained model
drow = xgb.DMatrix(X_row)
predicted_hr = reg.predict(drow)[0]

print(f"Predicted HR at {target_time}: {predicted_hr:.2f}")

# Feed-forward neural network (dense layers)


In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# 1. Prepare data
# The lag/rolling features are NaN in the first rows of each frame. Unlike XGBoost, a
# dense network cannot handle NaN inputs: one NaN turns the loss and every weight into
# NaN. Impute with the training-set column means (0.0 where a column is entirely NaN);
# the test set reuses the training statistics so nothing is learned from test data.
nn_fill_values = train[FEATURES_XGB].mean().fillna(0.0)
X_train = train[FEATURES_XGB].fillna(nn_fill_values).values
y_train = train[TARGET].values
X_test = test[FEATURES_XGB].fillna(nn_fill_values).values
y_test = test[TARGET].values

# 2. Build the model: two hidden ReLU layers and a single linear output
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# 3. Train the model (validation_split takes the last 20% of the training rows)
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2)

# 4. Predict
y_pred_nn = model.predict(X_test).flatten()

# 5. Evaluate (the metric functions are imported in the first cell)
rmse_nn = np.sqrt(mean_squared_error(y_test, y_pred_nn))
mae_nn = mean_absolute_error(y_test, y_pred_nn)
print(f"Neural Network - RMSE: {rmse_nn}")
print(f"Neural Network - MAE: {mae_nn}")

# LSTM

In [None]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

# Length of each input sequence, in rows (the last 60 minutes)
window_size = 60

# NOTE: this rebinds the notebook-level FEATURES_XGB to the calendar features only.
# The cells below read the reduced list; re-running the XGBoost cells after this one
# would also pick it up.
FEATURES_XGB = [
    'hour',
    'dayofweek',
    'month',
    'minute',
    'day',
]
features = FEATURES_XGB  # feature list used to build the LSTM windows

def create_lstm_dataset(df, features, window_size, target=None):
    """
    Turn a frame into (samples, window_size, n_features) sequences for an LSTM.

    Sample ``k`` contains the feature rows ``k .. k + window_size - 1`` and is paired
    with the target value of row ``k + window_size``, i.e. the row right after the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    features : list of str
        Columns used as model inputs.
    window_size : int
        Number of consecutive rows in each input sequence.
    target : str, optional
        Name of the target column. Defaults to the notebook-level ``TARGET``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape (n_samples, window_size, len(features)) and ``y`` with shape
        (n_samples,). When ``df`` has no more than ``window_size`` rows both arrays
        are empty but keep those ranks, so downstream shape checks still work.
    """
    target_col = TARGET if target is None else target
    values = df[features].values
    targets = df[target_col].values
    n_rows = len(df)

    # Too short to form a single window: return correctly shaped empty arrays
    # (a plain np.array([]) would have shape (0,) and lose the window/feature axes).
    if n_rows <= window_size:
        return (
            np.empty((0, window_size, len(features)), dtype=values.dtype),
            np.empty((0,), dtype=targets.dtype),
        )

    X = np.stack([values[end - window_size:end] for end in range(window_size, n_rows)])
    y = targets[window_size:].copy()
    return X, y

# Windowed datasets for both sides of the split
X_train_lstm, y_train_lstm = create_lstm_dataset(train, features, window_size)
X_test_lstm, y_test_lstm = create_lstm_dataset(test, features, window_size)

# 2. Build the LSTM model layer by layer.
#    NOTE: this rebinds `model`, replacing the dense network defined earlier.
model = keras.Sequential()
# The first LSTM returns the full sequence so the second LSTM can consume it
model.add(keras.layers.LSTM(units=128, input_shape=(window_size, len(features)), return_sequences=True))
model.add(keras.layers.LSTM(units=64))
model.add(keras.layers.Dense(units=1))  # single regression output
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# 3. Train the model
model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=64, validation_split=0.2)

# 4. Predict: collapse the (n, 1) output to a flat vector
y_pred_lstm = model.predict(X_test_lstm).flatten()

# 5. Evaluate (the metric functions are imported in the first cell)
mae_lstm = mean_absolute_error(y_test_lstm, y_pred_lstm)
rmse_lstm = np.sqrt(mean_squared_error(y_test_lstm, y_pred_lstm))
print(f"LSTM - RMSE: {rmse_lstm}")
print(f"LSTM - MAE: {mae_lstm}")

In [None]:
# Check if all timestamps of the window that ends one minute before target_time are present.
# 'min' replaces the deprecated 'T' frequency alias.
expected_index = pd.date_range(start=target_time - pd.Timedelta(minutes=window_size),
                               end=target_time - pd.Timedelta(minutes=1), freq='min')
missing = set(expected_index) - set(df.index)
print("Missing timestamps:", missing)

In [None]:
# Define the window bounds here: this cell previously relied on input_start/input_end
# from a cell further down, which raises NameError on a fresh top-to-bottom run.
input_end = target_time - pd.Timedelta(minutes=1)
input_start = input_end - pd.Timedelta(minutes=window_size - 1)

# Count missing values per feature inside the prediction window
input_seq = df.loc[input_start:input_end, FEATURES_XGB]
print(input_seq.isnull().sum())

In [None]:
# Ensure regular time index ('min' replaces the deprecated 'T' alias)
df = df.asfreq('min')  # Fills in missing minutes with NaN
df[FEATURES_XGB] = df[FEATURES_XGB].interpolate()  # Interpolate missing feature values

In [None]:
# 1. Target timestamp and sequence length (must match the LSTM training window)
target_time = pd.to_datetime('2016-04-13 01:03:00')
window_size = 60

# 2. The window covers the window_size minutes that end one minute before the target
input_end = target_time - pd.Timedelta(minutes=1)
input_start = input_end - pd.Timedelta(minutes=window_size-1)
input_seq = df.loc[input_start:input_end, FEATURES_XGB].values

# 3. Predict only when the window is complete
if input_seq.shape[0] != window_size:
    print("Not enough data to create input sequence for prediction.")
else:
    # Add the leading batch axis: (window_size, n_features) -> (1, window_size, n_features)
    input_seq = input_seq[np.newaxis, :, :]
    # 4. Predict
    predicted_hr = model.predict(input_seq)[0, 0]
    print(f"LSTM predicted HR at {target_time}: {predicted_hr:.2f}")