<a href="https://colab.research.google.com/github/stsan9/EndoMondoResearchERSP/blob/master/Time_Interval_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import the necessary libraries
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import pandas as pd
import math
import os
import time
from sklearn.preprocessing import MinMaxScaler
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.layers import Input, Dense, LSTM, Embedding, Dropout, GRU
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from tensorflow.python.keras.models import load_model

TensorFlow 2.x selected.


In [2]:
# Mount the google drive file system
from google.colab import drive
drive.mount('/content/gdrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [0]:
# Load in the data file and store it in a list; data in shared drive
properPath = '/content/gdrive/My Drive/endomondoHR_proper.json' # this may be personalized

data = []
with open(properPath) as f:
    for l in f:
        data.append(eval(l))

In [0]:
# convert to pandas dataframe and drop the unsused columns  
dataframe = pd.DataFrame.from_dict(data)  

In [0]:
# function to extract first element of each list "l"
def begin(l):
  if isinstance(l, list):
      return l[0]

# function to get the mean of only the middle 300 / 500 timestamps in one workout
def mean(l):
  if isinstance(l, list):
    return np.mean(l[100:-100])

In [0]:
# get average heart rate and starting timestamp of all workouts
dataframe['heart_rate'] = dataframe['heart_rate'].apply(mean)
dataframe['timestamp'] = dataframe['timestamp'].apply(begin)

In [0]:
dataframe = dataframe.drop(columns = ["longitude", "altitude", "latitude", "speed", "url", "id", "gender", "sport"])

In [0]:
# filtering out suspicious users based on heart rate
bad_users = dataframe[dataframe['heart_rate'] > 185]
bad_users = dataframe[dataframe['heart_rate'] < 40]
dataframe = dataframe[~dataframe.userId.isin(bad_users['userId'].unique())]

In [0]:
NUM_WORKOUTS = 20
# dataframe now only has users who have more than specified workouts
dataframe = dataframe.groupby("userId").filter(lambda x : len(x) >= NUM_WORKOUTS)

In [0]:
all_users=dataframe["userId"].unique()

In [0]:
for user in all_users:
  user_data = dataframe.loc[dataframe["userId"] == user].sort_values("timestamp")
  arr = np.diff(user_data["timestamp"])
  arr = np.append(arr,[0])
  user_data["timestamp"] = arr.tolist()
  user_data = user_data[0:NUM_WORKOUTS]
  indexNames = dataframe[dataframe["userId"] == user].index
  dataframe.drop(indexNames,inplace=True)
  dataframe = dataframe.append(user_data,ignore_index = True)

dataframe = dataframe.rename( columns= { "timestamp" : "hrs_to_next" } )

In [0]:
def sec_to_hours(secs):
  return math.floor((secs / 60 / 60))

In [0]:
dataframe['hrs_to_next'] = dataframe['hrs_to_next'].apply(sec_to_hours)

In [0]:
# drop users that have > 900 hrs to next workout
bad_users = dataframe.loc[dataframe["hrs_to_next"] > 900]["userId"].unique()
for user in bad_users:
  indexNames = dataframe[dataframe["userId"] == user].index
  dataframe.drop(indexNames, inplace=True)

dataframe = dataframe.reset_index(drop=True)

# drop users that have < 5 hrs to next workout
bad_users = dataframe.loc[dataframe["hrs_to_next"] < 5]["userId"].unique()
for user in bad_users:
  indexNames = dataframe[dataframe["userId"] == user].index
  dataframe.drop(indexNames, inplace=True)

dataframe = dataframe.reset_index(drop=True)

In [0]:
# columns - including all the columns except userId
num_columns = len(dataframe.columns) - 1

In [16]:
# number of unique users
len(dataframe["userId"].unique())

305

In [0]:
# Create an object from the Normalizer class
min_scaler = MinMaxScaler()

In [0]:
# the users in our training data
userids = dataframe['userId'].unique()

In [28]:
dataframe

Unnamed: 0,heart_rate,hrs_to_next,userId
0,152.546667,72,4007546
1,155.236667,24,4007546
2,147.550000,24,4007546
3,153.350000,46,4007546
4,157.343333,71,4007546
...,...,...,...
6095,123.020000,62,1875839
6096,134.740000,97,1875839
6097,121.393333,95,1875839
6098,124.550000,50,1875839


In [0]:
def rolling_window(a, window_size):
    shape = (a.shape[0] - window_size + 1, window_size) + a.shape[1:]
    strides = (a.strides[0],) + a.strides
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

In [20]:
# baseline for the rnn, the mse error if the predicted values is the mean of previous workouts
error_sq = [] # list of error values per user

# adds the squared difference of the average per user
for user in dataframe['userId'].unique():
  user_x = dataframe.loc[dataframe["userId"] == user]

  for i in range(11): # implement sliding window
    avg_hr = np.average(user_x.iloc[0+i: 9+i]['hrs_to_next'])
    error_sq += [(np.log(user_x.iloc[9+i]['hrs_to_next']) - np.log(avg_hr)) ** 2]

dummy_mse = np.average(error_sq) # the final MSE value
print('Baseline MSE for hours: ' + str(dummy_mse))

Baseline MSE for hours: 0.7654247881055768


In [0]:
# build the model (1 LSTM layer and 1 output layer)
model = Sequential()

model.add(GRU(units = 32, return_sequences=False, input_shape = (None, num_columns,)))
model.add(Dropout(0.1))
#model.add(Dense(8, activation = 'relu'))
model.add(Dense(1, activation = 'relu'))

#optimizer = tf.keras.optimizers.RMSprop(lr=1e-3) #low learning rate, could change this as well

model.compile(loss='mean_squared_error', optimizer='adam')  # using mse loss function

In [24]:
epochs = 200
batch_size = 10
num_users = len(userids)
window_size = 10
sequence_length = 20

for e in range(epochs):
  np.random.shuffle(userids)

  # initialize batches
  x_shape = (num_users*10, window_size, num_columns)
  y_shape = (num_users*10, 1)
  x_batch = np.zeros(shape=x_shape, dtype=np.float16)
  y_batch = np.zeros(shape=y_shape, dtype=np.float16)

  for b in range(num_users):
    # grab 20 workouts from the user
    x = dataframe[b * sequence_length : (b + 1) * sequence_length - 1]
    x = x.drop(columns=['heart_rate', "userId"])
    x = x.values

    # creates sliding windows for those workouts
    sliding_window_x = rolling_window(x, window_size)

    for i in range(sliding_window_x.shape[0]):  # put each window into batch
      y = dataframe.loc[[(b+1)*window_size+i]]
      y = y.hrs_to_next
      y = np.log(y, where = y > 0)
      x_batch[sliding_window_x.shape[0]*b+i] = sliding_window_x[i]
      y_batch[sliding_window_x.shape[0]*b+i] = y

  model.fit(x_batch, y_batch, batch_size=batch_size)

Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3050 samples
Train on 3

In [27]:
# predictions vs actual
for b in range(num_users):
  for i in range(11):
    x = dataframe[b * window_size + i : (b + 1) * window_size - 1 + i]
    x = x.drop(columns=['heart_rate', "userId"])
    x = x.values
    test_shape = (1, window_size - 1, num_columns)
    test_input = np.zeros(shape=test_shape, dtype=np.float16)
    test_input[0] = x
    p = model.predict(test_input) # the prediction
    print ("pred : " + str(np.exp(p[0][0])) + " | y: " + str(dataframe.loc[[(b + 1) * window_size - 1 + i]]["hrs_to_next"].iloc(0)[0]))

pred : 58.804913 | y: 26
pred : 64.00369 | y: 29
pred : 65.23304 | y: 39
pred : 60.68793 | y: 97
pred : 60.21818 | y: 25
pred : 64.20127 | y: 45
pred : 63.609837 | y: 72
pred : 56.720005 | y: 24
pred : 63.726234 | y: 70
pred : 61.45412 | y: 72
pred : 61.872528 | y: 26
pred : 61.872528 | y: 26
pred : 62.977345 | y: 56
pred : 61.988525 | y: 111
pred : 61.612682 | y: 168
pred : 57.593 | y: 295
pred : 60.515156 | y: 65
pred : 59.72526 | y: 144
pred : 56.724304 | y: 312
pred : 59.65837 | y: 337
pred : 58.328403 | y: 167
pred : 50.83742 | y: 117
pred : 50.83742 | y: 117
pred : 50.762753 | y: 120
pred : 50.913597 | y: 24
pred : 58.29437 | y: 74
pred : 54.683403 | y: 817
pred : 58.090363 | y: 43
pred : 63.789288 | y: 339
pred : 58.1342 | y: 314
pred : 58.134117 | y: 186
pred : 58.29056 | y: 169
pred : 59.679768 | y: 197
pred : 59.679768 | y: 197
pred : 58.102386 | y: 167
pred : 57.97779 | y: 27
pred : 61.424503 | y: 118
pred : 58.175438 | y: 357
pred : 58.181152 | y: 335
pred : 58.181152 | y: 

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x7ff229e399d8>
Traceback (most recent call last):
  File "/usr/lib/python3.6/weakref.py", line 356, in remove
    def remove(k, selfref=ref(self)):
KeyboardInterrupt


pred : 62.429295 | y: 21
pred : 66.27295 | y: 25
pred : 67.43698 | y: 46
pred : 59.877647 | y: 30
pred : 66.544174 | y: 44
pred : 66.544174 | y: 44
pred : 63.787098 | y: 27
pred : 56.292667 | y: 63
pred : 52.328316 | y: 23
pred : 56.590843 | y: 77
pred : 58.193607 | y: 26
pred : 64.002014 | y: 40
pred : 62.067528 | y: 46
pred : 62.791164 | y: 24
pred : 63.40688 | y: 45
pred : 62.174877 | y: 51
pred : 62.174877 | y: 51
pred : 60.87246 | y: 135
pred : 59.626976 | y: 192
pred : 60.232883 | y: 95
pred : 61.157757 | y: 550
pred : 59.234993 | y: 337
pred : 59.59545 | y: 191
pred : 59.99188 | y: 227
pred : 56.277718 | y: 115
pred : 39.1001 | y: 52
pred : 55.373283 | y: 48
pred : 55.373283 | y: 48
pred : 60.002808 | y: 335
pred : 56.426636 | y: 227
pred : 56.426636 | y: 108
pred : 56.46849 | y: 120
pred : 56.48896 | y: 47
pred : 61.24236 | y: 119
pred : 57.84858 | y: 167
pred : 58.021767 | y: 167
pred : 56.27737 | y: 47
pred : 61.364838 | y: 119
pred : 61.364838 | y: 119
pred : 56.685097 | y: 

KeyboardInterrupt: ignored

Still need to:
- Add callbacks (save the model after training) - SRAVYA 
- Extract a validation set from the current training set - SRAVYA
- Extract a training set and testing set from the current set - ANDRES
- Evaluate the model and experiment with adding back in other contextual variables
- Modify the data and RNN to output a timestamp as well
- Visualize our RNN's predictions