In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import mean_absolute_percentage_error, r2_score

In [None]:
def rmse(y, t):
    """Return the root-mean-square error between ``y`` and ``t``.

    Both arguments may be any array-like (list, tuple, NumPy array, pandas
    Series). They are converted to float arrays and compared element by
    element, by position.

    Parameters
    ----------
    y, t : array-like
        Values to compare; their shapes must be broadcastable.

    Returns
    -------
    float
        ``sqrt(mean((y - t) ** 2))``.
    """
    # np.asarray lets plain Python lists through; a bare ``y - t`` raises
    # TypeError for them.
    diff = np.asarray(y, dtype=float) - np.asarray(t, dtype=float)
    return np.sqrt(np.mean(diff ** 2))

def mape(y_true, y_pred):
    """Mean absolute percentage error of ``y_pred`` measured against ``y_true``.

    Delegates to ``mean_absolute_percentage_error`` and returns its result
    unchanged.
    """
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score

def mse(y, t):
    """Return the mean squared error between ``y`` and ``t``.

    Both arguments may be any array-like (list, tuple, NumPy array, pandas
    Series). They are converted to float arrays and compared element by
    element, by position.

    Parameters
    ----------
    y, t : array-like
        Values to compare; their shapes must be broadcastable.

    Returns
    -------
    float
        ``mean((y - t) ** 2)``.
    """
    # np.asarray lets plain Python lists through; a bare ``y - t`` raises
    # TypeError for them.
    diff = np.asarray(y, dtype=float) - np.asarray(t, dtype=float)
    return np.mean(diff ** 2)

In [None]:
# Read the dataset from disk and report its (rows, columns) size.
csv_path = 'Soil_20min_3M.csv'
df = pd.read_csv(csv_path)
print(df.shape)

In [None]:
# Parse the 'Date_time' column into datetimes, kept separately from the features.
date_column = df['Date_time']
train_data_dates = pd.to_datetime(date_column)

In [None]:
# Feature columns for the model: positions 1 through 10 of the frame
# (position 0 is skipped).
cols_rqd = df.columns[1:11].tolist()
print(cols_rqd)

In [None]:
# Restrict the frame to the selected feature columns and cast them to float.
df_training = df.loc[:, cols_rqd].astype(float)

In [None]:
# Standardise every feature column before it reaches the LSTM, whose
# sigmoid/tanh gates are sensitive to the magnitude of their inputs.
# NOTE(review): the scaler is fitted on all rows, including those later used
# for evaluation -- confirm this is intended.
scaler = StandardScaler().fit(df_training)
df_training_scaled = scaler.transform(df_training)

In [None]:
# An LSTM expects input shaped (n_samples, timesteps, n_features).
# The two sizes below are counted in rows of the scaled feature matrix.
n_past = 108   # Rows of history that make up one input window.
n_future = 72  # The target lies this many rows after the last row of the window.

# Containers for the formatted training windows and their targets.
trainX, trainY = [], []

In [None]:
# Build the supervised-learning windows from the scaled feature matrix.
#   trainX[k] -> the n_past rows preceding row i, all feature columns
#   trainY[k] -> the single value of the target column n_future rows after
#                the last row of that window (row i + n_future - 1)
#
# The arrays are built from scratch here rather than appended to lists
# created elsewhere, so this cell can be re-run safely: appending to names
# that are then rebound to NumPy arrays fails on the second execution.
target_col = 2  # position of the predicted column within df_training_scaled

window_ends = range(n_past, len(df_training_scaled) - n_future + 1)
if len(window_ends) == 0:
    # Without this guard the shape lookups further down fail with a far less
    # helpful IndexError.
    raise ValueError(
        'Not enough rows ({}) to build a window with n_past={} and n_future={}.'.format(
            len(df_training_scaled), n_past, n_future))

trainX = np.array([df_training_scaled[i - n_past:i, 0:df_training.shape[1]]
                   for i in window_ends])
# The one-element slice (rather than a scalar index) keeps trainY two-dimensional.
trainY = np.array([df_training_scaled[i + n_future - 1:i + n_future, target_col]
                   for i in window_ends])

print('trainX shape == {}.'.format(trainX.shape))
print('trainY shape == {}.'.format(trainY.shape))

In [None]:
# Debugging aid (disabled): prints each scaled feature column from row 24 onward.
# print(df_training_scaled[24:,0])
# print(df_training_scaled[24:,1])
# print(df_training_scaled[24:,2])
# print(df_training_scaled[24:,3])
# print(df_training_scaled[24:,4])
# print(df_training_scaled[24:,5])
# print(df_training_scaled[24:,6])
# print(df_training_scaled[24:,7])
# print(df_training_scaled[24:,8])
# print(df_training_scaled[24:,9])

In [None]:
# Keep only the last 5240 windows; inputs and targets are cut with the same
# slice so they stay paired.
recent = slice(-5240, None)
trainX, trainY = trainX[recent], trainY[recent]

In [None]:
# Report the shapes of the input and target arrays.
for shape_line in ('trainX shape == {}.'.format(trainX.shape),
                   'trainY shape == {}.'.format(trainY.shape)):
    print(shape_line)

In [None]:
# Inspect the scaled target array.
print(trainY)

In [None]:
# Sizes the network is built from, one per line:
# timesteps per window, features per timestep, outputs per sample.
print(trainX.shape[1], trainX.shape[2], trainY.shape[1], sep='\n')

In [None]:
# Array layouts used in this notebook:
#   df      -> (samples, features)
#   trainX  -> (windows, timesteps, features)
#   trainY  -> (windows, outputs)
#
# The network emits trainY.shape[1] value(s) per window but consumes every
# feature of that window. Predicting further ahead than the data allows would
# therefore require forecasting all input features as well.
#
# Model: two stacked LSTM layers, dropout, then a Dense output layer.
window_shape = (trainX.shape[1], trainX.shape[2])
model = Sequential([
    LSTM(64, activation='tanh', input_shape=window_shape, return_sequences=True),
    LSTM(32, activation='tanh', return_sequences=False),
    Dropout(0.2),
    Dense(trainY.shape[1]),
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# Train, holding out a fraction of the samples (validation_split) for validation.
history = model.fit(trainX, trainY, epochs=10, batch_size=10, validation_split=0.1, verbose=1)

# Training and validation loss per epoch.
for curve_key, curve_label in (('loss', 'Training loss'), ('val_loss', 'Validation loss')):
    plt.plot(history.history[curve_key], label=curve_label)
plt.legend()

In [None]:
# Predict the target for the most recent windows and convert it back to the
# original units.
#
# Every window's target is a row that already exists in the data, and the
# final window targets the final row. The last n_prediction windows therefore
# correspond, in order, to the last n_prediction rows of df.

n_past = 108
n_prediction = 72  # number of trailing windows (and hence rows) to predict

# Timestamps of the rows being predicted, taken from the data itself so they
# follow its real sampling interval. A synthetic 1-minute date range anchored
# at row -n_past does not line up with these rows, and the 'T' frequency alias
# is deprecated in recent pandas.
predict_period_dates = train_data_dates.iloc[-n_prediction:].tolist()
print(predict_period_dates)

# Make prediction
prediction = model.predict(trainX[-n_prediction:])  # shape (n_prediction, trainY.shape[1])

# The scaler was fitted on all feature columns, so inverse_transform needs an
# array of the same width. Tile the prediction across every column, invert,
# then keep only the target column (index 2, the column the model was trained on).
prediction_copies = np.repeat(prediction, df_training.shape[1], axis=-1)
y_pred_future = scaler.inverse_transform(prediction_copies)[:,2]

In [None]:
# Compare the predictions with the observed 's_m_5' values and report error metrics.
# NOTE(review): y_pred_future comes from feature column index 2 -- confirm
# that this column is 's_m_5'.

# Keep the full timestamps: reducing them to calendar dates collapses every
# sub-daily sample of a day onto the same midnight value.
forecast_dates = list(predict_period_dates)

df_forecast = pd.DataFrame({'Date_time': pd.to_datetime(forecast_dates), 's_m_5': y_pred_future})

# .assign returns a new frame; writing into the column selection directly
# triggers pandas' SettingWithCopyWarning.
original = df[['Date_time', 's_m_5']].assign(
    Date_time=lambda frame: pd.to_datetime(frame['Date_time']))
original = original.loc[original['Date_time'] >= '2010-1-1']

test1 = df_forecast.drop(columns=['Date_time'])
test2 = original.iloc[-n_prediction:].reset_index(drop=True)

# concat(axis=1) aligns on the index; unequal lengths would silently pad with
# NaN and corrupt the metrics, so fail loudly instead.
if len(test1) != len(test2):
    raise ValueError('Forecast has {} rows but {} observed rows were selected.'.format(
        len(test1), len(test2)))

# Columns by position: 0 = Date_time, 1 = observed s_m_5, 2 = predicted s_m_5.
result = pd.concat([test2, test1], axis=1)
sns.set(rc={'figure.figsize':(15,10)})
plt.plot(result['Date_time'], result.iloc[:,1], label='s_m_5 (Original)')
plt.plot(result['Date_time'], result.iloc[:,2], label='s_m_5 (Forecast)')
plt.legend()  # without this the line labels are never displayed

print("RMSE   :",rmse(result.iloc[:,1],result.iloc[:,2]))
print("MAPE   :",mape(result.iloc[:,1],result.iloc[:,2]))
print("MSE    :",mse(result.iloc[:,1],result.iloc[:,2]))
print("R2     :",r2_score(result.iloc[:,1],result.iloc[:,2]))

In [None]:
# Predict the target for the most recent windows and convert it back to the
# original units.
#
# Every window's target is a row that already exists in the data, and the
# final window targets the final row. The last n_prediction windows therefore
# correspond, in order, to the last n_prediction rows of df.

n_past = 108
n_prediction = 144  # number of trailing windows (and hence rows) to predict

# Timestamps of the rows being predicted, taken from the data itself so they
# follow its real sampling interval. A synthetic 1-minute date range anchored
# at row -n_past does not line up with these rows, and the 'T' frequency alias
# is deprecated in recent pandas.
predict_period_dates = train_data_dates.iloc[-n_prediction:].tolist()
print(predict_period_dates)

# Make prediction
prediction = model.predict(trainX[-n_prediction:])  # shape (n_prediction, trainY.shape[1])

# The scaler was fitted on all feature columns, so inverse_transform needs an
# array of the same width. Tile the prediction across every column, invert,
# then keep only the target column (index 2, the column the model was trained on).
prediction_copies = np.repeat(prediction, df_training.shape[1], axis=-1)
y_pred_future = scaler.inverse_transform(prediction_copies)[:,2]

In [None]:
# Compare the predictions with the observed 's_m_5' values and report error metrics.
# NOTE(review): y_pred_future comes from feature column index 2 -- confirm
# that this column is 's_m_5'.

# Keep the full timestamps: reducing them to calendar dates collapses every
# sub-daily sample of a day onto the same midnight value.
forecast_dates = list(predict_period_dates)

df_forecast = pd.DataFrame({'Date_time': pd.to_datetime(forecast_dates), 's_m_5': y_pred_future})

# .assign returns a new frame; writing into the column selection directly
# triggers pandas' SettingWithCopyWarning.
original = df[['Date_time', 's_m_5']].assign(
    Date_time=lambda frame: pd.to_datetime(frame['Date_time']))
original = original.loc[original['Date_time'] >= '2010-1-1']

test1 = df_forecast.drop(columns=['Date_time'])
test2 = original.iloc[-n_prediction:].reset_index(drop=True)

# concat(axis=1) aligns on the index; unequal lengths would silently pad with
# NaN and corrupt the metrics, so fail loudly instead.
if len(test1) != len(test2):
    raise ValueError('Forecast has {} rows but {} observed rows were selected.'.format(
        len(test1), len(test2)))

# Columns by position: 0 = Date_time, 1 = observed s_m_5, 2 = predicted s_m_5.
result = pd.concat([test2, test1], axis=1)
sns.set(rc={'figure.figsize':(15,10)})
plt.plot(result['Date_time'], result.iloc[:,1], label='s_m_5 (Original)')
plt.plot(result['Date_time'], result.iloc[:,2], label='s_m_5 (Forecast)')
plt.legend()  # without this the line labels are never displayed

print("RMSE   :",rmse(result.iloc[:,1],result.iloc[:,2]))
print("MAPE   :",mape(result.iloc[:,1],result.iloc[:,2]))
print("MSE    :",mse(result.iloc[:,1],result.iloc[:,2]))
print("R2     :",r2_score(result.iloc[:,1],result.iloc[:,2]))

In [None]:
# Predict the target for the most recent windows and convert it back to the
# original units.
#
# Every window's target is a row that already exists in the data, and the
# final window targets the final row. The last n_prediction windows therefore
# correspond, in order, to the last n_prediction rows of df.

n_past = 108
n_prediction = 504  # number of trailing windows (and hence rows) to predict

# Timestamps of the rows being predicted, taken from the data itself so they
# follow its real sampling interval. A synthetic 1-minute date range anchored
# at row -n_past does not line up with these rows, and the 'T' frequency alias
# is deprecated in recent pandas.
predict_period_dates = train_data_dates.iloc[-n_prediction:].tolist()
print(predict_period_dates)

# Make prediction
prediction = model.predict(trainX[-n_prediction:])  # shape (n_prediction, trainY.shape[1])

# The scaler was fitted on all feature columns, so inverse_transform needs an
# array of the same width. Tile the prediction across every column, invert,
# then keep only the target column (index 2, the column the model was trained on).
prediction_copies = np.repeat(prediction, df_training.shape[1], axis=-1)
y_pred_future = scaler.inverse_transform(prediction_copies)[:,2]

In [None]:
# Compare the predictions with the observed 's_m_5' values and report error metrics.
# NOTE(review): y_pred_future comes from feature column index 2 -- confirm
# that this column is 's_m_5'.

# Keep the full timestamps: reducing them to calendar dates collapses every
# sub-daily sample of a day onto the same midnight value.
forecast_dates = list(predict_period_dates)

df_forecast = pd.DataFrame({'Date_time': pd.to_datetime(forecast_dates), 's_m_5': y_pred_future})

# .assign returns a new frame; writing into the column selection directly
# triggers pandas' SettingWithCopyWarning.
original = df[['Date_time', 's_m_5']].assign(
    Date_time=lambda frame: pd.to_datetime(frame['Date_time']))
original = original.loc[original['Date_time'] >= '2010-1-1']

test1 = df_forecast.drop(columns=['Date_time'])
test2 = original.iloc[-n_prediction:].reset_index(drop=True)

# concat(axis=1) aligns on the index; unequal lengths would silently pad with
# NaN and corrupt the metrics, so fail loudly instead.
if len(test1) != len(test2):
    raise ValueError('Forecast has {} rows but {} observed rows were selected.'.format(
        len(test1), len(test2)))

# Columns by position: 0 = Date_time, 1 = observed s_m_5, 2 = predicted s_m_5.
result = pd.concat([test2, test1], axis=1)
sns.set(rc={'figure.figsize':(15,10)})
plt.plot(result['Date_time'], result.iloc[:,1], label='s_m_5 (Original)')
plt.plot(result['Date_time'], result.iloc[:,2], label='s_m_5 (Forecast)')
plt.legend()  # without this the line labels are never displayed

print("RMSE   :",rmse(result.iloc[:,1],result.iloc[:,2]))
print("MAPE   :",mape(result.iloc[:,1],result.iloc[:,2]))
print("MSE    :",mse(result.iloc[:,1],result.iloc[:,2]))
print("R2     :",r2_score(result.iloc[:,1],result.iloc[:,2]))