<a href="https://colab.research.google.com/github/ssuppe/bloogsugarml/blob/main/1a)_Blood_Sugar_ML_Investigation_Linear_Regression_Model_adding_IOB_stats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary

* Part one of the Google Machine Learning Crash Course, but applied to my own blood sugar data (for a practical exercise)
* This is similar to the original (1) colab, but I've also added IOB (insulin-on-board) data

# Findings
* Adding in past IOBs seems to /slightly/ improve the model (from 29 to ~26, but now a learning rate 100x smaller (.0001) and 1000 epochs.
* This is all theoretical:
  1. I am training/evaluating on the same data. I am likely overfitting.
  2. The difference between 29 vs 26 is minor and likely not statsig
  3. Again, my range is very tight for this date range (90% TIR) with low variance, so this isn't very different probably from random selection ~60-90
  minutes later.



In [1]:
import json
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Data prep

In [84]:
# First, basic data ingestion and date parsing
with open('/content/drive/MyDrive/src/blood_sugar_ml/nightscout_entries_21-26 July2024.json', "r") as f:
  js = json.load(f)

data = []
# From JSON, create dataset
# display(js[0])
for j in js:
  # display(j['date']['$numberLong'])
  data.append([j['date']['$numberLong'], j['utcOffset'],j['sgv'],j['direction'] ])

data = pd.DataFrame(data)
data.columns = ["timestamp", "utcOffset", "sgv", "direction"]
data['date'] = pd.to_datetime(data['timestamp'],unit='ms')
# Get rid of seconds precision, and round to nearest 5 minutes
data['date'] = data['date'].dt.floor('min').dt.round("5min")
data.head()

  data['date'] = pd.to_datetime(data['timestamp'],unit='ms')


Unnamed: 0,timestamp,utcOffset,sgv,direction,date
0,1721556338000,60,101,FortyFiveUp,2024-07-21 10:05:00
1,1721556637000,60,110,FortyFiveUp,2024-07-21 10:10:00
2,1721556938000,60,118,FortyFiveUp,2024-07-21 10:15:00
3,1721557237000,60,125,FortyFiveUp,2024-07-21 10:20:00
4,1721557537000,60,127,FortyFiveUp,2024-07-21 10:25:00


In [None]:
# Useful for inspection, but slow
#data.to_csv('/content/drive/MyDrive/src/blood_sugar_ml/nightscout_entries_21-26 July2024.csv')

In [85]:
# Device status entries have IOB data
# First, basic data ingestion and date parsing
with open('/content/drive/MyDrive/src/blood_sugar_ml/nightscout_devicestatus_21-26 July2024.json', "r") as f:
  js = json.load(f)

dsdata = []
# From JSON, create dataset
# display(js[0])
for j in js:
  # display(j)
  # break
  if 'openaps' not in j:
    continue

  iob = None
  if 'iob' in j['openaps']:
    iob = j['openaps']['iob']['iob']*10

    basaliob = None
    if 'basaliob' in j['openaps']['iob']:
      basaliob = j['openaps']['iob']['basaliob']*100

    activity = None
    if 'activity' in j['openaps']['iob']:
      activity = j['openaps']['iob']['activity']*1000
    dsdata.append([j['date']['$numberLong'], iob, basaliob, activity ])

dsdata = pd.DataFrame(dsdata)
dsdata.columns = ["timestamp", "iob", "basaliob", "activity"]
dsdata['date'] = pd.to_datetime(dsdata['timestamp'],unit='ms')
# Get rid of seconds precision, and round to nearest 5 minutes
dsdata['date'] = dsdata['date'].dt.floor('min').dt.round("5min")
# dsdata[dsdata['date'] > '2024-07-21 10'].head()
display(dsdata.head())
dsdata.describe()


  dsdata['date'] = pd.to_datetime(dsdata['timestamp'],unit='ms')


Unnamed: 0,timestamp,iob,basaliob,activity,date
0,1721520075226,27.91,-88.4,15.5,2024-07-21 00:00:00
1,1721520375910,26.31,-90.0,16.2,2024-07-21 00:05:00
2,1721520965925,23.71,-85.4,16.9,2024-07-21 00:15:00
3,1721520979687,23.83,-86.3,16.9,2024-07-21 00:15:00
4,1721521000349,23.76,-85.8,17.0,2024-07-21 00:15:00


Unnamed: 0,iob,basaliob,activity,date
count,1929.0,1929.0,1929.0,1929
mean,15.027797,-139.745101,11.799533,2024-07-23 23:55:41.990668544
min,-6.31,-333.5,-2.4,2024-07-21 00:00:00
25%,2.75,-231.3,3.0,2024-07-22 11:40:00
50%,9.63,-160.8,9.7,2024-07-24 00:30:00
75%,24.58,-39.9,18.7,2024-07-25 12:00:00
max,74.76,84.3,42.4,2024-07-26 23:55:00
std,16.001815,110.462717,10.162545,


In [86]:
#@title Merge dataframes

display(len(data))
display(len(dsdata))
data = pd.merge(data, dsdata, left_on='date', right_on='date', how='inner')


1416

1929

In [87]:
del data['timestamp_x']
del data['timestamp_y']

In [88]:
# Next: creation of features
# Previous absolute values
data["hour"] = data["date"].apply(lambda x: x.hour)

# This code triggers a noise performance warning. I should probably fix it but it's not
# a problem yet so suppressing
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
sgv_features = []
delta = 10
for min in range(60,181,delta):
  data[f"sgv-{min}"] = data["sgv"].shift(int(min/5))
  # According to MLCC, the scale of the parameters should all be roughly the same.
  # Since the delta is a %, I will multiple by 1000 (so, for example, .05 becomes 50)
  data[f"sgv-{min}delta"] = ((data["sgv"].shift(int(min/5)) - data["sgv"].shift(int(min/5)+delta))/data["sgv"].shift(int(min/5)))*1000
  data[f"iob-{min}"] = data["iob"].shift(int(min/5))
  data[f"basaliob-{min}"] = data["basaliob"].shift(int(min/5))
  data[f"activity-{min}"] = data["activity"].shift(int(min/5))

  sgv_features.append(f"sgv-{min}")
  sgv_features.append(f"sgv-{min}delta")
  sgv_features.append(f"iob-{min}")
  sgv_features.append(f"basaliob-{min}")
  sgv_features.append(f"activity-{min}")
data = data.dropna()
data = data.copy()
display(data.head(10))



Unnamed: 0,utcOffset,sgv,direction,date,iob,basaliob,activity,hour,sgv-60,sgv-60delta,...,sgv-170,sgv-170delta,iob-170,basaliob-170,activity-170,sgv-180,sgv-180delta,iob-180,basaliob-180,activity-180
46,60,98,Flat,2024-07-21 13:05:00,34.47,-271.9,30.9,13,130.0,-346.153846,...,180.0,388.888889,53.99,-186.2,25.4,171.0,409.356725,47.02,-174.6,23.1
47,60,89,Flat,2024-07-21 13:10:00,31.39,-275.0,30.2,13,120.0,-458.333333,...,184.0,358.695652,51.49,-196.6,27.0,180.0,438.888889,50.2,-186.9,25.1
48,60,81,FortyFiveDown,2024-07-21 13:15:00,28.75,-280.6,29.6,13,107.0,-635.514019,...,185.0,324.324324,47.77,-212.1,28.4,180.0,388.888889,53.99,-186.2,25.4
49,60,82,Flat,2024-07-21 13:25:00,25.26,-284.8,28.4,13,107.0,-598.130841,...,185.0,313.513514,47.89,-212.4,28.4,184.0,358.695652,51.49,-196.6,27.0
50,60,84,Flat,2024-07-21 13:30:00,22.06,-275.5,26.5,13,94.0,-744.680851,...,183.0,306.010929,45.32,-219.4,29.2,185.0,324.324324,47.77,-212.1,28.4
51,60,84,Flat,2024-07-21 13:30:00,21.84,-273.7,26.3,13,83.0,-975.903614,...,183.0,289.617486,44.82,-222.2,29.3,185.0,313.513514,47.89,-212.4,28.4
52,60,86,Flat,2024-07-21 13:35:00,19.33,-274.5,25.1,13,77.0,-1025.974026,...,179.0,234.636872,42.34,-229.6,29.7,183.0,306.010929,45.32,-219.4,29.2
53,60,87,Flat,2024-07-21 13:40:00,17.32,-274.6,23.9,13,76.0,-973.684211,...,173.0,109.82659,39.35,-239.1,29.9,183.0,289.617486,44.82,-222.2,29.3
54,60,88,Flat,2024-07-21 13:45:00,14.54,-276.1,22.5,13,80.0,-800.0,...,171.0,0.0,36.33,-248.0,29.8,179.0,234.636872,42.34,-229.6,29.7
55,60,88,Flat,2024-07-21 13:50:00,12.78,-277.6,21.4,13,91.0,-505.494505,...,171.0,-52.631579,34.55,-244.4,29.5,173.0,109.82659,39.35,-239.1,29.9


# MLCC Linear regression exercise
Taken from [here](https://colab.corp.google.com/google_src/files/head/depot/google3/engedu/ml/mlcc/next-gen-colabs/linear-regression-exercise.ipynb?mlpp=0)

In [89]:
import io

# data
import numpy as np
import pandas as pd

# machine learning
import keras

# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns

In [90]:
# Describe the data and get to know it
data.describe(include='all')

Unnamed: 0,utcOffset,sgv,direction,date,iob,basaliob,activity,hour,sgv-60,sgv-60delta,...,sgv-170,sgv-170delta,iob-170,basaliob-170,activity-170,sgv-180,sgv-180delta,iob-180,basaliob-180,activity-180
count,1547.0,1547.0,1547,1547,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,...,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0
unique,,,8,,,,,,,,...,,,,,,,,,,
top,,,Flat,,,,,,,,...,,,,,,,,,,
freq,,,1303,,,,,,,,...,,,,,,,,,,
mean,60.0,112.02521,,2024-07-23 23:48:06.748545280,14.978694,-141.614609,11.737104,11.324499,111.97479,-37.73934,...,113.020685,-34.155786,15.229942,-144.867227,12.015255,113.094376,-33.492936,15.289405,-145.134648,12.044279
min,60.0,40.0,,2024-07-21 13:05:00,-6.31,-333.5,-2.4,0.0,40.0,-2358.490566,...,40.0,-2358.490566,-6.31,-333.5,-2.4,40.0,-2358.490566,-6.31,-333.5,-2.4
25%,60.0,93.0,,2024-07-22 18:02:30,2.275,-228.1,2.5,5.0,93.0,-155.538755,...,94.0,-148.863636,2.31,-233.1,2.7,94.0,-148.863636,2.345,-233.1,2.7
50%,60.0,110.0,,2024-07-24 01:00:00,9.32,-164.5,9.8,11.0,110.0,0.0,...,111.0,0.0,9.63,-170.3,10.0,111.0,0.0,9.67,-170.9,10.1
75%,60.0,130.0,,2024-07-25 05:02:30,24.95,-46.15,19.1,17.0,130.0,130.53684,...,130.0,133.333333,25.175,-52.1,19.7,130.0,133.595801,25.33,-52.45,19.75
max,60.0,216.0,,2024-07-26 10:00:00,74.76,84.3,42.4,23.0,216.0,701.492537,...,216.0,701.492537,74.76,84.3,42.4,216.0,701.492537,74.76,84.3,42.4


In [91]:
# Correlation matrix
# First, create a df matrix of the features I want to use
numeric_values = ["sgv", "hour"] + sgv_features
df = data[numeric_values]
df.corr(numeric_only = True)

Unnamed: 0,sgv,hour,sgv-60,sgv-60delta,iob-60,basaliob-60,activity-60,sgv-70,sgv-70delta,iob-70,...,sgv-170,sgv-170delta,iob-170,basaliob-170,activity-170,sgv-180,sgv-180delta,iob-180,basaliob-180,activity-180
sgv,1.000000,-0.033630,0.367515,0.305680,0.147423,0.179290,0.088006,0.280569,0.268259,0.134028,...,0.016946,-0.053732,-0.031568,0.323647,-0.123177,0.032297,-0.031799,-0.047676,0.326230,-0.140859
hour,-0.033630,1.000000,-0.027271,-0.043673,0.401109,-0.626115,0.484367,-0.024510,-0.039568,0.409818,...,-0.036587,-0.003669,0.335752,-0.516363,0.306195,-0.043439,-0.004777,0.316309,-0.503918,0.283449
sgv-60,0.367515,-0.027271,1.000000,0.501848,0.301843,0.136827,0.230952,0.949865,0.545467,0.273484,...,0.039969,0.049794,0.058763,0.275343,-0.025820,0.003295,-0.014515,0.038213,0.289204,-0.044735
sgv-60delta,0.305680,-0.043673,0.501848,1.000000,0.011508,0.128529,-0.120491,0.344580,0.872751,-0.047572,...,-0.292372,-0.227882,-0.091588,0.095651,-0.102896,-0.253298,-0.250696,-0.095125,0.091249,-0.098367
iob-60,0.147423,0.401109,0.301843,0.011508,1.000000,-0.352249,0.820428,0.318647,0.060403,0.928752,...,0.108163,0.043797,0.202335,-0.100148,0.139044,0.088278,0.018281,0.168498,-0.088945,0.117622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sgv-180,0.032297,-0.043439,0.003295,-0.253298,0.088278,-0.099151,0.211187,0.043194,-0.293349,0.108317,...,0.952887,0.339274,0.357746,0.095392,0.303464,1.000000,0.494697,0.346185,0.111878,0.273431
sgv-180delta,-0.031799,-0.004777,-0.014515,-0.250696,0.018281,0.062644,0.057542,0.051733,-0.228875,0.044537,...,0.539314,0.873152,0.084610,0.139636,-0.072974,0.494697,1.000000,0.034774,0.133979,-0.110399
iob-180,-0.047676,0.316309,0.038213,-0.095125,0.168498,-0.611746,0.485223,0.060530,-0.093111,0.203691,...,0.316464,-0.031017,0.932735,-0.410409,0.892076,0.346185,0.034774,1.000000,-0.362295,0.825825
basaliob-180,0.326230,-0.503918,0.289204,0.091249,-0.088945,0.761869,-0.221331,0.277045,0.096026,-0.102586,...,0.122680,0.119679,-0.323320,0.994390,-0.571530,0.111878,0.133979,-0.362295,1.000000,-0.608011


Here are some observation
* BasalIOB seems highest (-180 is .32), and somewhat high (activity-180 is .14)

In [None]:
# Pairplot
sns.pairplot(df, x_vars=numeric_values, y_vars=numeric_values)

# Part 3 - Train Model


In [82]:
#@title Define plotting functions

def make_plots(df, feature_names, label_name, model_output, sample_size=200):

  random_sample = df.sample(n=sample_size).copy()
  random_sample.reset_index()
  weights, bias, epochs, rmse = model_output

  is_2d_plot = len(feature_names) == 1
  model_plot_type = "scatter" if is_2d_plot else "surface"
  fig = make_subplots(rows=1, cols=2,
                      subplot_titles=("Loss Curve", "Model Plot"),
                      specs=[[{"type": "scatter"}, {"type": model_plot_type}]])

  plot_data(random_sample, feature_names, label_name, fig)
  plot_model(random_sample, feature_names, weights, bias, fig)
  plot_loss_curve(epochs, rmse, fig)

  fig.show()
  return

def plot_loss_curve(epochs, rmse, fig):
  curve = px.line(x=epochs, y=rmse)
  curve.update_traces(line_color='#ff0000', line_width=3)

  fig.append_trace(curve.data[0], row=1, col=1)
  fig.update_xaxes(title_text="Epoch", row=1, col=1)
  fig.update_yaxes(title_text="Root Mean Squared Error", row=1, col=1, range=[rmse.min()*0.8, rmse.max()])

  return

def plot_data(df, features, label, fig):
  if len(features) == 1:
    scatter = px.scatter(df, x=features[0], y=label)
  else:
    scatter = px.scatter_3d(df, x=features[0], y=features[1], z=label)

  fig.append_trace(scatter.data[0], row=1, col=2)
  if len(features) == 1:
    fig.update_xaxes(title_text=features[0], row=1, col=2)
    fig.update_yaxes(title_text=label, row=1, col=2)
  else:
    fig.update_layout(scene1=dict(xaxis_title=features[0], yaxis_title=features[1], zaxis_title=label))

  return

def plot_model(df, features, weights, bias, fig):
  df['FARE_PREDICTED'] = bias[0]

  for index, feature in enumerate(features):
    df['FARE_PREDICTED'] = df['FARE_PREDICTED'] + weights[index][0] * df[feature]

  if len(features) == 1:
    model = px.line(df, x=features[0], y='FARE_PREDICTED')
    model.update_traces(line_color='#ff0000', line_width=3)
  else:
    z_name, y_name = "FARE_PREDICTED", features[1]
    z = [df[z_name].min(), (df[z_name].max() - df[z_name].min()) / 2, df[z_name].max()]
    y = [df[y_name].min(), (df[y_name].max() - df[y_name].min()) / 2, df[y_name].max()]
    x = []
    for i in range(len(y)):
      x.append((z[i] - weights[1][0] * y[i] - bias[0]) / weights[0][0])

    plane=pd.DataFrame({'x':x, 'y':y, 'z':[z] * 3})

    light_yellow = [[0, '#89CFF0'], [1, '#FFDB58']]
    model = go.Figure(data=go.Surface(x=plane['x'], y=plane['y'], z=plane['z'],
                                      colorscale=light_yellow))

  fig.add_trace(model.data[0], row=1, col=2)

  return

def model_info(feature_names, label_name, model_output):
  weights = model_output[0]
  bias = model_output[1]

  nl = "\n"
  header = "-" * 80
  banner = header + nl + "|" + "MODEL INFO".center(78) + "|" + nl + header

  info = ""
  equation = label_name + " = "

  for index, feature in enumerate(feature_names):
    info = info + "Weight for feature[{}]: {:.3f}\n".format(feature, weights[index][0])
    equation = equation + "{:.3f} * {} + ".format(weights[index][0], feature)

  info = info + "Bias: {:.3f}\n".format(bias[0])
  equation = equation + "{:.3f}\n".format(bias[0])

  return banner + nl + info + nl + equation

print("SUCCESS: defining plotting functions complete.")

SUCCESS: defining plotting functions complete.


In [83]:
#@title Code - Define ML functions

def build_model(my_learning_rate, num_features):
  """Create and compile a simple linear regression model."""
  # Most simple keras models are sequential.
  model = keras.models.Sequential()

  # Describe the topography of the model.
  # The topography of a simple linear regression model
  # is a single node in a single layer.
  model.add(keras.layers.Dense(units=1,
                                  input_shape=(num_features,)))

  # Compile the model topography into code that TensorFlow can efficiently
  # execute. Configure training to minimize the model's mean squared error.
  model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[keras.metrics.RootMeanSquaredError()])

  return model


def train_model(model, df, features, label, epochs, batch_size):
  """Train the model by feeding it data."""

  # Feed the model the feature and the label.
  # The model will train for the specified number of epochs.
  # input_x = df.iloc[:,1:3].values
  # df[feature]
  history = model.fit(x=features,
                      y=label,
                      batch_size=batch_size,
                      epochs=epochs)

  # Gather the trained model's weight and bias.
  trained_weight = model.get_weights()[0]
  trained_bias = model.get_weights()[1]

  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch

  # Isolate the error for each epoch.
  hist = pd.DataFrame(history.history)

  # To track the progression of training, we're going to take a snapshot
  # of the model's root mean squared error at each epoch.
  rmse = hist["root_mean_squared_error"]

  return trained_weight, trained_bias, epochs, rmse


def run_experiment(df, feature_names, label_name, learning_rate, epochs, batch_size):

  print('INFO: starting training experiment with features={} and label={}\n'.format(feature_names, label_name))

  num_features = len(feature_names)

  features = df.loc[:, feature_names].values
  label = df[label_name].values
  # display(features)
  # display(label)
  model = build_model(learning_rate, num_features)
  model_output = train_model(model, df, features, label, epochs, batch_size)

  print('\nSUCCESS: training experiment complete\n')
  print('{}'.format(model_info(feature_names, label_name, model_output)))
  make_plots(df, feature_names, label_name, model_output)

  return model

print("SUCCESS: defining linear regression functions complete.")

SUCCESS: defining linear regression functions complete.


Observations are pretty intuitive.
60 minutes before, within 29 mg/dl (doesn't seem very good). The mean sgv-60 is -38, so I think this is a little better than choosing at random on average (but not very much)
80 minutes before, within about 29 (also not very good)




Adding hour seemed to 'help' (28 instead of 30) but not practically better

Adding in delta doesn't seem to matter. What if we look further into the past?

In [99]:
#@title Code - Experiment 3 - multidimensional (sgv-?, and hour)

# The following variables are the hyperparameters.
learning_rate = 0.0001
epochs = 1000
batch_size = 50

features_all = []
for min in range(60, 121, 10):
  features_all.append(f"sgv-{min}")
  features_all.append(f"sgv-{min}delta")
  features_all.append(f"iob-{min}")
  features_all.append(f"basaliob-{min}")
  features_all.append(f"activity-{min}")
display(features_all)
# features_all = ['sgv-60', 'sgv-60delta','sgv-70', 'sgv-70delta','sgv-80', 'sgv-80delta','sgv-90', 'sgv-90delta','hour']
label = 'sgv'

model_all = run_experiment(df.dropna(), features_all, label, learning_rate, epochs, batch_size)

['sgv-60',
 'sgv-60delta',
 'iob-60',
 'basaliob-60',
 'activity-60',
 'sgv-70',
 'sgv-70delta',
 'iob-70',
 'basaliob-70',
 'activity-70',
 'sgv-80',
 'sgv-80delta',
 'iob-80',
 'basaliob-80',
 'activity-80',
 'sgv-90',
 'sgv-90delta',
 'iob-90',
 'basaliob-90',
 'activity-90',
 'sgv-100',
 'sgv-100delta',
 'iob-100',
 'basaliob-100',
 'activity-100',
 'sgv-110',
 'sgv-110delta',
 'iob-110',
 'basaliob-110',
 'activity-110',
 'sgv-120',
 'sgv-120delta',
 'iob-120',
 'basaliob-120',
 'activity-120']

INFO: starting training experiment with features=['sgv-60', 'sgv-60delta', 'iob-60', 'basaliob-60', 'activity-60', 'sgv-70', 'sgv-70delta', 'iob-70', 'basaliob-70', 'activity-70', 'sgv-80', 'sgv-80delta', 'iob-80', 'basaliob-80', 'activity-80', 'sgv-90', 'sgv-90delta', 'iob-90', 'basaliob-90', 'activity-90', 'sgv-100', 'sgv-100delta', 'iob-100', 'basaliob-100', 'activity-100', 'sgv-110', 'sgv-110delta', 'iob-110', 'basaliob-110', 'activity-110', 'sgv-120', 'sgv-120delta', 'iob-120', 'basaliob-120', 'activity-120'] and label=sgv

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/100

Adding in more history of sgvs and deltas doesn't help loss. We still end up with RMSE of ~29.

# Part 4 - Validate Model


---


## Use the model to make predictions

Now that you have a trained model, you can use the model to make predictions. In practice, you should make predictions on examples that are not used during training. However, for this exercise, you'll just work with a subset of the same training dataset. In another Colab exercise you will explore ways to make predictions on examples not used in training.

In [100]:
#@title Code - Define functions to make predictions
def format_currency(x):
  return "${:.2f}".format(x)

def build_batch(df, batch_size):
  batch = df.sample(n=batch_size).copy()
  batch.set_index(np.arange(batch_size), inplace=True)
  return batch

def predict_fare(model, df, features, label, batch_size=50):
  batch = build_batch(df, batch_size)
  predicted_values = model.predict_on_batch(x=batch.loc[:, features].values)

  data = {"PREDICTED_SGV": [], "OBSERVED_SGV": [], "L1_LOSS": [],
          features[0]: [], features[1]: []}
  for i in range(batch_size):
    predicted = predicted_values[i][0]
    observed = batch.at[i, label]
    data["PREDICTED_SGV"].append(predicted)
    data["OBSERVED_SGV"].append(observed)
    data["L1_LOSS"].append(abs(observed - predicted))
    data[features[0]].append(batch.at[i, features[0]])
    data[features[1]].append("{:.2f}".format(batch.at[i, features[1]]))

  output_df = pd.DataFrame(data)
  return output_df

def show_predictions(output):
  header = "-" * 80
  banner = header + "\n" + "|" + "PREDICTIONS".center(78) + "|" + "\n" + header
  print(banner)
  print(output)
  return

In [101]:
#@title Code - Make predictions

output = predict_fare(model_all, df, features_all, label)
show_predictions(output)

--------------------------------------------------------------------------------
|                                 PREDICTIONS                                  |
--------------------------------------------------------------------------------
    PREDICTED_SGV  OBSERVED_SGV    L1_LOSS  sgv-60 sgv-60delta
0       82.690659           120  37.309341    72.0     -263.89
1       82.730995           112  29.269005    82.0     -402.44
2      121.121063            95  26.121063   130.0       69.23
3      130.830215           128   2.830215   130.0     -146.15
4      123.888092           125   1.111908    45.0     -688.89
5      106.018288           110   3.981712   121.0       74.38
6      144.315430           173  28.684570   103.0     -427.18
7      108.733032           127  18.266968    79.0     -164.56
8       96.583191           101   4.416809    90.0      100.00
9      129.435028           136   6.564972   142.0      147.89
10     100.700096           106   5.299904    99.0      -90.91
1