Installing the SciKit Modules

In [None]:
!pip install -U scikit-learn

Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score

In [None]:
def rmse(y,t):
    """Return the root-mean-squared error between ``y`` and ``t``.

    Both arguments must support elementwise subtraction (for example NumPy
    arrays, pandas Series or plain numbers). The result is
    ``sqrt(mean((y - t) ** 2))``.
    """
    residual = y - t
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)

def mape(y_true,y_pred):
    """Return the mean absolute percentage error of ``y_pred`` vs ``y_true``.

    Thin wrapper around scikit-learn's ``mean_absolute_percentage_error``.
    Per the scikit-learn documentation the value is a fraction rather than
    a percentage (1.0 corresponds to 100 %).
    """
    score = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
    return score

def mse(y,t):
    """Return the mean-squared error between ``y`` and ``t``.

    Both arguments must support elementwise subtraction; the result is
    ``mean((y - t) ** 2)``.
    """
    squared_error = np.square(y - t)
    return np.mean(squared_error)

READING THE DATASET

In [None]:
# Number of trailing rows held out as the test period.
ntest_cases = 72

# Load the series indexed by timestamp, then derive the log of the target
# column and its first difference (the differenced series is what the
# windowed datasets further down are built from).
df = (
    pd.read_csv('Soil_20min_3M.csv', index_col='Date_time', parse_dates=True)
    .assign(
        LogParameter=lambda frame: np.log(frame['s_m_5']),
        DiffLogParameter=lambda frame: frame['LogParameter'].diff(),
    )
)

# Chronological split: every row before the last `ntest_cases` rows is train.
train_df, test_df = df.iloc[:-ntest_cases], df.iloc[-ntest_cases:]

In [None]:
# Plot the raw target column; title and y-label make the figure readable on
# its own (the x-label comes from the index name).
df['s_m_5'].plot(figsize=(15,5), title='s_m_5 (raw series)').set_ylabel('s_m_5');

In [None]:
# Build the one-step dataset: predict the next value from the previous T values.

# Differenced log series as a plain array; the first entry is dropped because
# differencing leaves a NaN there.
series_df = df['DiffLogParameter'].to_numpy()[1:]

# Window length in samples.
# NOTE(review): at 3 samples per hour (20-minute data, judging by the file
# name) 108 samples is 36 hours; a 2-day window would be 3 x 24 x 2 = 144.
# Confirm which one is intended.
T = 108

n_windows = len(series_df) - T
X = np.array(
    [series_df[start:start + T] for start in range(n_windows)]
).reshape(-1, T)
Y = np.array([series_df[start + T] for start in range(n_windows)])
N = len(X)
print ("X.shape",X.shape,"Y.shape",Y.shape)

In [None]:
# Chronological split of the one-step windows: the last `ntest_cases`
# windows are the test set, everything before them is the training set.
split_at = -ntest_cases
X_train, X_test = X[:split_at], X[split_at:]
Y_train, Y_test = Y[:split_at], Y[split_at:]
for part in (X_train, Y_train, X_test, Y_test):
    print(part.shape)

In [None]:
# Boolean row masks over df for the train period and the test period.
last_train_timestamp = train_df.index[-1]
train_idx = df.index <= last_train_timestamp

# Take the test mask BEFORE the warm-up rows are cleared below; otherwise
# those rows would be flagged as test rows.
test_idx = np.logical_not(train_idx)

# The first T+1 rows get no prediction: one row is lost to differencing and
# T rows are consumed by the first input window.
train_idx[:T+1] = False

In [None]:
# Log value of the previous row for every row. Adding a predicted difference
# to it turns the prediction back into an un-differenced (log-scale) value.
prev = df['LogParameter'].shift(periods=1)
df['ShiftLogParameter'] = prev

In [None]:
# Last known log value of the training period; the multi-step forecasts are
# cumulative sums of predicted differences added on top of this anchor.
last_train = train_df['LogParameter'].iloc[-1]

In [None]:
# Multi-output dataset: each input window of Tx values is paired with the Ty
# values that follow it, so a model can forecast the whole horizon at once.
# NOTE: this rebinds X, Y and N, which previously held the one-step dataset.
Tx = T
Ty = ntest_cases

n_samples = len(series_df) - Tx - Ty + 1
X = np.array(
    [series_df[start:start + Tx] for start in range(n_samples)]
).reshape(-1, Tx)
Y = np.array(
    [series_df[start + Tx:start + Tx + Ty] for start in range(n_samples)]
).reshape(-1, Ty)
N = len(X)
print("X.shape",X.shape,"Y.shape",Y.shape)

In [None]:
# The final window is the single multi-output test case: its target is the
# last Ty values of the series, i.e. the held-out test period.
X_test_m, Y_test_m = X[-1:], Y[-1:]

# Every target spans Ty consecutive steps, so the Ty-1 windows just before
# the test window have targets that overlap the test period. Training on
# them (as X[:-1] would) leaks test values into the fit, so drop the last Ty
# windows. This mirrors the one-step split, which also excludes every
# test-period target from training.
X_train_m, Y_train_m = X[:-Ty], Y[:-Ty]

# Report the shapes of the multi-output arrays created above.
print(X_train_m.shape)
print(Y_train_m.shape)
print(X_test_m.shape)
print(Y_test_m.shape)

In [None]:
# Actual log values over the test period; they are exponentiated back to the
# original scale when the forecasts are scored (RMSE, MAPE, MSE, R2).
test_log_pass = df['LogParameter'].iloc[-ntest_cases:]

TEST OTHER MODELS

The models below are fitted on only the most recent 5240 training windows, which is about 80% of the 6551 rows in the dataset.

In [None]:
def one_step_and_multistep(model,name,n_train=5240):
  """Fit ``model`` and evaluate its one-step and recursive multi-step forecasts.

  Relies on notebook-level state defined in earlier cells: ``X_train``,
  ``Y_train``, ``X_test``, ``df``, ``train_idx``, ``test_idx``, ``prev``,
  ``last_train``, ``test_log_pass``, ``ntest_cases`` and the metric helpers
  ``rmse``, ``mape`` and ``mse``.

  Side effects:
    * fits ``model`` in place;
    * writes the log-scale forecasts into the ``df`` columns
      ``{name}_1step_train``, ``{name}_1step_test`` and
      ``{name}_multistep_test``;
    * prints RMSE, MAPE, MSE and R2 on the original (exponentiated) scale
      for the one-step and the multi-step test forecasts;
    * draws two plots (last ``ntest_cases`` rows and last
      ``3 * ntest_cases`` rows).

  Parameters
  ----------
  model : estimator exposing ``fit(X, y)`` and ``predict(X)``
  name : str
      Prefix for the forecast columns stored in ``df``.
  n_train : int or None, default 5240
      Number of most recent training windows used for fitting. ``None``
      fits on all of ``X_train``.

  Raises
  ------
  ValueError
      If ``n_train`` is not ``None`` and is not positive.
  """
  if n_train is None:
    X_fit, Y_fit = X_train, Y_train
  elif n_train > 0:
    X_fit, Y_fit = X_train[-n_train:], Y_train[-n_train:]
  else:
    raise ValueError(f"n_train must be a positive integer or None, got {n_train!r}")

  model.fit(X_fit,Y_fit)
  print("One-step forecast:",name)

  train_col = f'{name}_1step_train'
  test_col = f'{name}_1step_test'
  multistep_col = f'{name}_multistep_test'

  # One-step forecast: previous log value plus the predicted difference.
  df.loc[train_idx,train_col] = prev[train_idx] + model.predict(X_train)
  df.loc[test_idx,test_col] = prev[test_idx] + model.predict(X_test)

  # Recursive multi-step forecast: start from the first test window and feed
  # every prediction back in as the newest input value.
  multistep_predictions = []
  last_x = X_test[0]

  while len(multistep_predictions) < ntest_cases:
    p = model.predict(last_x.reshape(1,-1))[0]
    multistep_predictions.append(p)

    # np.roll returns a new array, so X_test itself is never modified.
    last_x = np.roll(last_x,-1)
    last_x[-1] = p

  # Accumulate the predicted differences on top of the last training value.
  df.loc[test_idx, multistep_col] = last_train + np.cumsum(multistep_predictions)

  actual = np.exp(test_log_pass)

  def _report(column, label):
    # Print all four metrics for one forecast column, on the original scale.
    forecast = np.exp(df.loc[test_idx, column])
    print(f"Test RMSE ({label}):",rmse(actual, forecast))
    print(f"Test MAPE ({label}):",mape(actual, forecast))
    print(f"Test MSE ({label}):",mse(actual, forecast))
    print(f"Test R2 ({label}):",r2_score(actual, forecast))

  _report(test_col, "one-step")
  print("\n")
  _report(multistep_col, "multi-step")

  # Plot the actual series with the forecasts, zoomed to the test period and
  # then to a window three times as long.
  cols = ['LogParameter',train_col,test_col,multistep_col]
  for n_rows in (ntest_cases, 3*ntest_cases):
    np.exp(df[cols][-n_rows:]).plot(figsize=(15,5))

In [None]:
# Baseline: ordinary least-squares regression on the lagged differences.
one_step_and_multistep(model=LinearRegression(), name="LR")

In [None]:
# Support vector regression with scikit-learn's default settings.
one_step_and_multistep(model=SVR(), name="SVR")

In [None]:
# Random forests are stochastic; fix the seed so the reported metrics are
# reproducible across runs.
one_step_and_multistep(RandomForestRegressor(random_state=42),"RF")

MULTI OUTPUT FORECAST MODELS

In [None]:
def multi_output(model,name,n_train=5240):
  """Fit a multi-output ``model`` and compare it with the one-step forecast.

  Relies on notebook-level state defined in earlier cells: ``X_train_m``,
  ``Y_train_m``, ``X_test_m``, ``df``, ``test_idx``, ``last_train``,
  ``test_log_pass``, ``ntest_cases`` and the metric helpers ``rmse``,
  ``mape`` and ``mse``. ``one_step_and_multistep`` must already have been
  run with the same ``name``, because its forecast columns are scored and
  plotted here.

  Side effects:
    * fits ``model`` in place;
    * writes the log-scale forecast into ``df['{name}_multioutput']``;
    * prints RMSE, MAPE, MSE and R2 on the original (exponentiated) scale
      for the multi-output forecast and for the stored one-step forecast;
    * draws two plots (last ``ntest_cases`` rows and last
      ``3 * ntest_cases`` rows).

  Parameters
  ----------
  model : estimator exposing ``fit(X, Y)`` and ``predict(X)``
      It is fitted on a 2-D target, so it must support multi-output
      regression.
  name : str
      Prefix of the forecast columns in ``df``.
  n_train : int or None, default 5240
      Number of most recent training windows used for fitting. ``None``
      fits on all of ``X_train_m``.

  Raises
  ------
  KeyError
      If the columns produced by ``one_step_and_multistep`` for ``name``
      are missing from ``df``.
  ValueError
      If ``n_train`` is not ``None`` and is not positive.
  """
  train_col = f'{name}_1step_train'
  test_col = f'{name}_1step_test'
  multistep_col = f'{name}_multistep_test'
  multioutput_col = f'{name}_multioutput'

  # Fail before the (possibly slow) fit if the earlier forecasts are missing.
  missing = [c for c in (train_col, test_col, multistep_col) if c not in df.columns]
  if missing:
    raise KeyError(
      f"run one_step_and_multistep(model, {name!r}) first; missing columns: {missing}"
    )

  if n_train is None:
    X_fit, Y_fit = X_train_m, Y_train_m
  elif n_train > 0:
    X_fit, Y_fit = X_train_m[-n_train:], Y_train_m[-n_train:]
  else:
    raise ValueError(f"n_train must be a positive integer or None, got {n_train!r}")

  model.fit(X_fit,Y_fit)

  # The model predicts every differenced step of the horizon at once;
  # accumulate them on top of the last training value to get log-scale levels.
  df.loc[test_idx,multioutput_col] = last_train + np.cumsum(model.predict(X_test_m).flatten())

  actual = np.exp(test_log_pass)

  def _report(column, label):
    # Print all four metrics for one forecast column, on the original scale.
    forecast = np.exp(df.loc[test_idx, column])
    print(f"Test RMSE ({label}):",rmse(actual, forecast))
    print(f"Test MAPE ({label}):",mape(actual, forecast))
    print(f"Test MSE ({label}):",mse(actual, forecast))
    # R2 is computed per column, so the one-step line reports the one-step R2.
    print(f"Test R2 ({label}):",r2_score(actual, forecast))

  _report(multioutput_col, "multi-output")
  print("\n")
  _report(test_col, "one-step")

  # Plot the actual series with every stored forecast, zoomed to the test
  # period and then to a window three times as long.
  cols = ['LogParameter',train_col,test_col,multistep_col,multioutput_col]
  for n_rows in (ntest_cases, 3*ntest_cases):
    np.exp(df[cols][-n_rows:]).plot(figsize=(15,5))

In [None]:
# Linear regression handles a 2-D target directly, one fit per output column.
multi_output(model=LinearRegression(), name="LR")

In [None]:
#multi_output(SVR(),"SVR") #SVR cannot handle multi-output targets!
#But: you can still train multiple separate SVRs, one per forecast step

In [None]:
# Random forests are stochastic; fix the seed so the reported metrics are
# reproducible across runs.
multi_output(RandomForestRegressor(random_state=42),"RF")