### Predicting the number of bikes shared without using the count from the previous time step as a feature

In [None]:
# For interactive plots
%matplotlib notebook
# Requires javascript for jupyter lab
#import matplotlib.ipympl
#%matplotlib widget
# Fallback
#%matplotlib inline

# Imports
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os

# Imports 
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout , LSTM , Bidirectional 
from keras.regularizers import L1L2
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from matplotlib.gridspec import GridSpec

# Directory that holds the dataset when the notebook runs on Kaggle
os.chdir("/kaggle/input/london-bike-sharing-dataset")

# Load the CSV; the 'timestamp' column is parsed as dates and becomes the index
df = pd.read_csv(
    "london_merged.csv", #for use with kaggle and built in dataset
    #"LSTMdataset.csv", #for use with LSTM dataset
    index_col="timestamp",
    parse_dates=['timestamp'],
)

# Preview of the first rows (not displayed unless it is the last expression of a cell)
df.head()

# Break the timestamp index out into separate calendar columns:
# (new column name, attribute of the DatetimeIndex it is read from)
for column_name, index_attribute in (
    ('hour', 'hour'),
    ('day_of_month', 'day'),
    ('day_of_week', 'dayofweek'),
    ('month', 'month'),
):
    df[column_name] = getattr(df.index, index_attribute)

In [None]:
# Pearson correlation between every pair of columns, to look for linear relationships
corr_mat = df.corr()
# Styler.set_precision() was deprecated in pandas 1.3 and removed in pandas 2.0.
# An explicit format string renders the same two-decimal cells on old and new versions.
corr_mat.style.background_gradient(cmap='coolwarm').format("{:.2f}")

In [None]:
# 3x3 grid of exploratory plots, filled row by row.
# Each entry is (seaborn plotting function, x column, y column).
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
panels = [
    (sns.regplot, "t1", "hum"),
    (sns.regplot, "t1", "cnt"),
    (sns.regplot, "t2", "cnt"),
    (sns.regplot, "hum", "cnt"),
    (sns.boxplot, "weather_code", "cnt"),
    (sns.boxplot, "is_holiday", "cnt"),
    (sns.boxplot, "is_weekend", "cnt"),
    (sns.regplot, "wind_speed", "cnt"),
    (sns.regplot, "hour", "cnt"),
]
# axes.flat walks the grid in row-major order, matching the order of the list above
for ax, (plot_func, x_col, y_col) in zip(axes.flat, panels):
    plot_func(ax=ax, x=x_col, y=y_col, data=df)

In [None]:
# 3x3 grid, filled row by row: seven point plots of 'cnt' against one column each,
# followed by two box plots in the last two cells.
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
flat_axes = axes.ravel()

# (x column, line colour) for each point plot
point_panels = [
    ('t1', "red"),
    ('t2', "green"),
    ("hour", "blue"),
    ("weather_code", "cyan"),
    ("day_of_week", "yellow"),
    ("hum", "magenta"),
    ("wind_speed", "black"),
]
for ax, (x_col, colour) in zip(flat_axes, point_panels):
    sns.pointplot(ax=ax, x=x_col, y='cnt', data=df, color=colour)

# The remaining cells show the distribution of 'cnt' per flag value
box_panels = ["is_weekend", "is_holiday"]
for ax, x_col in zip(flat_axes[len(point_panels):], box_panels):
    sns.boxplot(ax=ax, x=x_col, y="cnt", data=df)

In [None]:
# Rank the features by importance with an extra-trees ensemble.
# If not enough memory, reduce max_depth
from sklearn.ensemble import ExtraTreesRegressor

# 'cnt' is a numeric count, so a regressor is the matching estimator; a classifier
# would treat every distinct count value as a separate class.
# Features are selected by name rather than by position, so this does not depend
# on 'cnt' being the first column.
features = df.drop(columns='cnt')

plt.figure(figsize=(12,12))
model = ExtraTreesRegressor(max_depth=12, bootstrap=True)
model.fit(features, df['cnt'])
print(model.feature_importances_) # importances exposed by the fitted tree ensemble

# Horizontal bar chart of the (up to) 12 most important features
feat_importances = pd.Series(model.feature_importances_, index=features.columns)
feat_importances.nlargest(12).plot(kind='barh')

plt.show()

In [None]:
# Choose the columns the model works with.
# 'cnt' is kept as the first column because it is the prediction target; the aim is
# to predict it without using 'cnt' itself as an input feature.
target_column = 'cnt'

# This feature set seems to work well, with low validation error.
feature_columns = ['t1', 't2', 'hum', 'wind_speed', 'weather_code', 'is_weekend', 'is_holiday', 'hour']

columns = [target_column] + feature_columns

# Keep only the chosen columns, in this order
df = df[columns]

In [None]:
# Build model and predict.

# Input to model
hours_to_predict = 24   # number of consecutive hours to predict in the test prediction
window_in = 5           # rows per input window
batch_in = 64           # batch size for the windowed datasets
epoch_in = 500          # training epochs
print("Predicting number of bike shared for ",hours_to_predict," hours. \n")
print("Please enter starting hour to predict from: \n")
startar = input()
try:
    startar = int(startar)
except ValueError:
    raise ValueError(f"Starting hour must be a whole number, got {startar!r}") from None

# Split data into train set (first 80%) and validation set (last 20%)
train_df_size = int(len(df) * 0.8)
validate_df_size = int((len(df) - train_df_size))
train_df = df.iloc[:train_df_size]
validate_df = df.iloc[train_df_size:]

# Fail early on a start hour the training set cannot serve. The test prediction reads
# hours_to_predict windows of window_in rows each, starting at row `startar` of the
# training features, which hold one row fewer than train_df.
max_start = train_df_size - hours_to_predict - window_in
if not 0 <= startar <= max_start:
    raise ValueError(
        f"Starting hour must be between 0 and {max_start}, got {startar}"
    )

print(train_df.iloc[(startar+1):(startar+1+hours_to_predict),:])  # Inspect the dataframe

# Turn each data frame into (features, target) arrays shifted by one step:
# the features of row i are paired with the 'cnt' value (column 0) of row i + 1.
def _next_step_xy(frame):
    """Return (x, y): x drops the last row and column 0, y is column 0 without the first row."""
    features = frame.iloc[:-1, 1:].values
    next_counts = frame.iloc[1:, 0].values
    return features, next_counts


x_train, y_train = _next_step_xy(train_df)
x_validate, y_validate = _next_step_xy(validate_df)

# Separate scalers for features and target; pick one pair.
# MinMaxScaler (active)
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()
# StandardScaler
#scaler_x = StandardScaler()
#scaler_y = StandardScaler()
# RobustScaler
#scaler_x = RobustScaler()
#scaler_y = RobustScaler()

# Fit each scaler on the training split only and reuse it for the validation split.
# The target is reshaped to a single column because the scalers expect 2-D input.
y_train = scaler_y.fit_transform(y_train.reshape(-1, 1))
x_train = scaler_x.fit_transform(x_train)
y_validate = scaler_y.transform(y_validate.reshape(-1, 1))
x_validate = scaler_x.transform(x_validate)

# Moving-window batch datasets
window_size = window_in


def _windowed_dataset(features, targets):
    """Build an unshuffled dataset of stride-1 windows of window_size rows, in batches of batch_in."""
    # NOTE(review): per the Keras docs the window starting at row i is paired with targets[i],
    # i.e. with the 'cnt' of the window's second row here - confirm this alignment is intended.
    return tf.keras.preprocessing.timeseries_dataset_from_array(
        data=features,
        targets=targets,
        sequence_length=window_size,
        sequence_stride=1,
        shuffle=False,
        batch_size=batch_in)


train_batch = _windowed_dataset(x_train, y_train)
validate_batch = _windowed_dataset(x_validate, y_validate)

# Regularization parameters
l1 = 0.0001
l2 = 0.0001
dropout_param = 0.5

# Define the LSTM model: one bidirectional LSTM layer, dropout, and a single-unit output.
# Every layer and the regularizer come from tf.keras, so the model never mixes objects
# from the standalone `keras` package with tf.keras ones.
# (To stack a second recurrent layer, put another Bidirectional LSTM with
# return_sequences=True followed by Dropout in front of the existing one.)
lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(
            64,
            activation='relu',
            kernel_regularizer=tf.keras.regularizers.L1L2(l1=l1, l2=l2),
        ),
        merge_mode='concat',
    ),
    tf.keras.layers.Dropout(dropout_param),
    tf.keras.layers.Dense(units=1),
])

lstm_model.compile(loss='mean_squared_error', optimizer='adam')

# Train the LSTM model.
# train_batch is already batched, so batch_size is not passed to fit(); the Keras docs
# say not to specify it for dataset inputs.
history = lstm_model.fit(
    train_batch,
    epochs=epoch_in,
    validation_data=validate_batch,
    shuffle=False,
)

# Windowed validation features only (no targets), used as input for prediction
predict_batch = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=x_validate,
    targets=None,
    sequence_length=window_size,
    sequence_stride=1,
    shuffle=False,
    batch_size=batch_in)

# Model output for every validation window (still in scaled units)
y_predicted = lstm_model.predict(predict_batch)

# One figure: loss curves in the first quarter, the series in the remaining three quarters
plt.figure(figsize=(12, 8))
gs = GridSpec(1, 4)

# Left panel: training and validation loss per epoch
plt.subplot(gs[0, 0])
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()

# Right panel: scaled training targets, then validation targets and predictions,
# with the validation part placed right after the training part on the x axis
plt.subplot(gs[0, 1:])
train_length = len(y_train)
plt.plot(np.arange(0, train_length), y_train, color='r', label="history")
plt.plot(train_length + np.arange(len(y_validate)), y_validate, color='b', marker='.',
         label="true validate set")
plt.plot(train_length + np.arange(len(y_predicted)), y_predicted, color='g', marker='.',
         label="prediction of validation set")
plt.ylabel('Bike Count')
plt.xlabel('Time Step')
plt.legend()
plt.show()

# DT069A Laboration 2 TestPrediction

# The prediction windows are cut from the scaled training features
x_predict = x_train

# hours_to_predict consecutive stride-1 windows of window_size rows span
# hours_to_predict + window_size - 1 rows in total, starting at row `startar`.
window_size = window_in
rows_needed = hours_to_predict + window_size - 1
window_rows = x_predict[startar:startar + rows_needed, :]
if startar < 0 or len(window_rows) < rows_needed:
    raise ValueError(
        f"Starting hour {startar} does not leave {rows_needed} rows "
        f"in the {len(x_predict)} available rows"
    )

# Build all windows in one dataset and predict them in a single call instead of
# creating a dataset and calling predict() once per hour.
predict_batch_part = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=window_rows,
    targets=None,
    sequence_length=window_size,
    sequence_stride=1,
    shuffle=False,
    batch_size=batch_in)

# Predict using model
y_predicted_part = lstm_model.predict(predict_batch_part)

# Inverse transform back to the original 'cnt' scale
y_predicted_part = scaler_y.inverse_transform(y_predicted_part)

# Flat array with one prediction per hour
listan = np.asarray(y_predicted_part, dtype=float).ravel()

# Interpret the results
print("Estimated number of bike shared for next ",hours_to_predict," hours: ", listan.astype('int'))
# Observed 'cnt' values for the same hours: each window is compared against the row
# one step after its first row
df_out = train_df.iloc[(startar+1):(startar+1+hours_to_predict),0]
print("(Should be around: ",df_out.to_list()," for the example data)")


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# inverse_transform returns new arrays, so the results must be kept. Separate names
# are used so that y_validate / y_predicted stay in scaled units for other cells.
y_validate_counts = scaler_y.inverse_transform(y_validate)
y_predicted_counts = scaler_y.inverse_transform(y_predicted)

# There are fewer predictions than targets (one per complete window), so compare
# against the first len(predictions) targets instead of hard-coding the difference.
rmse = sqrt(mean_squared_error(y_validate_counts[:len(y_predicted_counts)], y_predicted_counts))

# Collect one RMSE per scaler type across re-runs of the notebook with different
# scalers. Computing it on the original 'cnt' scale keeps the values comparable.
rmse_by_scaler = globals().get('rmse_by_scaler', {})
rmse_by_scaler[type(scaler_y).__name__] = rmse

for scaler_name, scaler_rmse in rmse_by_scaler.items():
    print(scaler_name, scaler_rmse)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# inverse_transform returns new arrays, so the results must be kept. Separate names
# are used so that y_validate / y_predicted stay in scaled units for other cells.
y_validate_counts = scaler_y.inverse_transform(y_validate)
y_predicted_counts = scaler_y.inverse_transform(y_predicted)

# There are fewer predictions than targets (one per complete window), so compare
# against the first len(predictions) targets instead of hard-coding the difference.
rmse = sqrt(mean_squared_error(y_validate_counts[:len(y_predicted_counts)], y_predicted_counts))

# Collect one RMSE per batch size across re-runs of the notebook with different
# batch_in values (earlier note: 32 best so far, have not tried 64 yet).
rmse_by_batch_size = globals().get('rmse_by_batch_size', {})
rmse_by_batch_size[batch_in] = rmse

for batch_size_used, batch_rmse in sorted(rmse_by_batch_size.items()):
    print(batch_size_used, batch_rmse)

