In [59]:
import pandas as pd
import plotly.express as px
import numpy as np
import requests
import re
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
# For visualizing
import plotly.express as px
# For model building
import torch
import torch.nn as nn
import torch.nn.functional as F
# Import helpers
# import nnhelpers as nnh
from d2l import torch as d2l


In [60]:
#!pip install --force-reinstall pandas
!pip install d2l



In [61]:
df_raw = pd.read_csv(r"/content/drive/MyDrive/2012_2019_PlatteRiverWeir_features_merged_all.csv", parse_dates=["SensorTime"])
df_raw = df_raw.sort_values(by="SensorTime").reset_index(drop=True)
# remove space in front of column names using strip() function
# (there's a space before the column names in the original CSV file)
df_raw.rename(columns=lambda x: x.strip(), inplace=True)
df_raw.head()



Unnamed: 0,SensorTime,CaptureTime,Filename,Agency,SiteNumber,TimeZone,Stage,Discharge,CalcTimestamp,width,...,WeirPt2X,WeirPt2Y,WwRawLineMin,WwRawLineMax,WwRawLineMean,WwRawLineSigma,WwCurveLineMin,WwCurveLineMax,WwCurveLineMean,WwCurveLineSigma
0,2012-06-09 13:15:00,2012-06-09T13:09:07,StateLineWeir_20120609_Farrell_001.jpg,USGS,6674500,MDT,2.99,916.0,2020-03-11T16:58:28,4288,...,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2012-06-09 13:15:00,2012-06-09T13:10:29,StateLineWeir_20120609_Farrell_002.jpg,USGS,6674500,MDT,2.99,916.0,2020-03-11T16:58:33,4288,...,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2012-06-09 13:45:00,2012-06-09T13:44:01,StateLineWeir_20120609_Farrell_003.jpg,USGS,6674500,MDT,2.96,873.0,2020-03-11T16:58:40,4288,...,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2012-06-09 14:45:00,2012-06-09T14:44:30,StateLineWeir_20120609_Farrell_004.jpg,USGS,6674500,MDT,2.94,846.0,2020-03-11T16:58:47,4288,...,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2012-06-09 15:45:00,2012-06-09T15:44:59,StateLineWeir_20120609_Farrell_005.jpg,USGS,6674500,MDT,2.94,846.0,2020-03-11T16:58:55,4288,...,-1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
#Create daily mean values
df_raw['SensorTime']=pd.to_datetime(df_raw['SensorTime'])
df = df_raw.groupby(df_raw['SensorTime']).mean()
# Drop problematic and/or unnecessary columns
df = df.drop(labels=["Stage"], axis=1)
# ,"SiteNumber","areaFeatCount","WwCurveLineMin", "width", "height"], axis=1)
#Write CSV that holds daily means, for visual inspection
#df.to_csv("/content/NPRW_daily_means.csv", index = False)
#df.head(5)
df_new=df['Discharge']



The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [63]:
# train-test split for time series
train_size = int(len(df_new) * 0.9)
test_size = len(df_new) - train_size
train, test = df_new[:train_size], df_new[train_size:]

In [64]:
def create_dataset(dataset, lookback):
  X, y = [], []
  for i in range(len(dataset)-lookback):
    feature = dataset[i:i+lookback]
    target = dataset[i+1:i+lookback+1]
    X.append(feature)
    y.append(target)
  return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [65]:
lookback = 1
X_train, y_train = create_dataset(train, lookback=lookback)
X_test, y_test = create_dataset(test, lookback=lookback)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


torch.Size([36828, 1]) torch.Size([36828, 1])
torch.Size([4092, 1]) torch.Size([4092, 1])


In [66]:
class TempLSTM(nn.Module):
  def __init__(self):
    super().__init__()
    self.lstm = nn.LSTM(input_size=lookback, hidden_size=50, num_layers=1, batch_first=True)
    self.linear = nn.LazyLinear(1)
  def forward(self, x):
    x, _ = self.lstm(x)
    x = self.linear(x)
    return x

In [67]:
model = TempLSTM()
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.MSELoss()
loader = DataLoader(TensorDataset(X_train, y_train), shuffle=True, batch_size=32)
n_epochs = 200
for epoch in range(n_epochs):
  model.train()
  for X_batch, y_batch in loader:
    y_pred = model(X_batch)
    loss = loss_fn(y_pred, y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Validation
  if epoch % 10 != 0:
    continue
  model.eval()
  with torch.no_grad():
    y_pred = model(X_train)
    train_rmse = np.sqrt(loss_fn(y_pred, y_train))
    y_pred = model(X_test)
    test_rmse = np.sqrt(loss_fn(y_pred, y_test))
  print("Epoch %d: train RMSE %.4f, test RMSE %.4f" % (epoch, train_rmse, test_rmse))



Lazy modules are a new feature under heavy development so changes to the API or functionality can happen at any moment.



Epoch 0: train RMSE 1519.6163, test RMSE 1490.7883
Epoch 10: train RMSE 1267.3221, test RMSE 1163.2166
Epoch 20: train RMSE 1067.2773, test RMSE 863.7462
Epoch 30: train RMSE 924.1876, test RMSE 601.9641
Epoch 40: train RMSE 813.3911, test RMSE 378.2211
Epoch 50: train RMSE 717.9736, test RMSE 199.0095
Epoch 60: train RMSE 630.3966, test RMSE 78.1822
Epoch 70: train RMSE 618.1379, test RMSE 87.5200
Epoch 80: train RMSE 596.7820, test RMSE 41.8633
Epoch 90: train RMSE 562.1257, test RMSE 34.2842
Epoch 100: train RMSE 514.5970, test RMSE 37.7336
Epoch 110: train RMSE 494.2160, test RMSE 55.3973
Epoch 120: train RMSE 537.5453, test RMSE 78.5030
Epoch 130: train RMSE 531.2503, test RMSE 33.1950
Epoch 140: train RMSE 534.2621, test RMSE 37.2851
Epoch 150: train RMSE 512.4603, test RMSE 34.2429
Epoch 160: train RMSE 472.3960, test RMSE 56.7140
Epoch 170: train RMSE 508.5801, test RMSE 49.6762
Epoch 180: train RMSE 512.1698, test RMSE 37.3032
Epoch 190: train RMSE 476.0816, test RMSE 38.2652


In [71]:
with torch.no_grad():
  # shift train predictions for plotting
  train_plot = np.ones_like(df_new) * np.nan
  y_pred = model(X_train)
  y_pred = y_pred[:, -1]
  train_plot[lookback:train_size] = model(X_train)[:, -1]
  # shift test predictions for plotting
  test_plot = np.ones_like(df_new) * np.nan
  test_plot[train_size+lookback:len(df_new)] = model(X_test)[:, -1]


In [79]:
# Build plotting data

df_new=df_new.reset_index(drop=True)
plot_data = pd.DataFrame([df_new, test_plot])
plot_data=plot_data.T
plot_data.columns = ['truth','forecast']
px.line(plot_data, y = ['truth','forecast'],width=800, height=400)



In [75]:
plot_data

Unnamed: 0_level_0,truth,forecast
SensorTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-06-09 13:15:00,916.0,
2012-06-09 13:45:00,873.0,
2012-06-09 14:45:00,846.0,
2012-06-09 15:45:00,846.0,
2012-06-09 16:45:00,846.0,
...,...,...
2019-10-11 09:00:00,434.0,
2019-10-11 10:00:00,434.0,
2019-10-11 11:00:00,434.0,
2019-10-11 12:00:00,434.0,
