# Multivariate time series forecasting with FB Prophet

* There are 420k rows X 14 cols of meteorological data : temp, humidity, pressure, wind, etc
* Time steps - one row every 10 minutes
* The task is to forecast the temperature
* The metrics : MAE and RMSE

### Naive model (temp at n+1 = temp at n) RMSE = 0.03
... in the original units that translates to a mean absolute error of about **0.16 °C** ... *This one will be very difficult to beat!*

### Univariate: we look ONLY at temp - disregarding all other info RMSE = 0.7
### Multivariate: we consider ALL variables when predicting the temp RMSE = 0.004


* Excellent explanation on multivariate time series forecasting at https://github.com/walesdata/2Dconv_pub/blob/master/gefcom_multiconv.ipynb

In [None]:
import sys
from os.path import join
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error
from numpy import sqrt
import itertools

import tensorflow as tf
from tensorflow.python.keras.applications.resnet50 import preprocess_input
from tensorflow.python.keras.preprocessing.image import load_img, img_to_array

from fbprophet import Prophet
from fbprophet.diagnostics import performance_metrics, cross_validation

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

# Data

In [None]:
# Load data

# Read the whole CSV as text. The context manager guarantees the file handle
# is closed even if reading raises.
with open('/kaggle/input/weather-archive-jena/jena_climate_2009_2016.csv') as f:
    data = f.read()

# The first line is the header; every following line is one record.
lines = data.split('\n')
header = lines[0].split(',')
# Drop blank lines (e.g. the empty string left by a trailing newline) so the
# later float conversion never sees a record without values.
lines = [line for line in lines[1:] if line.strip()]
print(len(lines))

# Show each column index next to its column name.
for i, j in enumerate(header):
    print(i, j)

In [None]:
# Convert lines list into Numpy array float_data - without the date

# One column per header entry except the first CSV field, which is skipped.
n_features = len(header) - 1
float_data = np.zeros((len(lines), n_features))
for row, record in enumerate(lines):
    fields = record.split(',')[1:]
    float_data[row, :] = list(map(float, fields))

print(float_data.shape)

In [None]:
# Column 1 of float_data is the series used as the temperature target;
# plot it against its sample number over the whole dataset.
temp = float_data[:, 1]
plt.plot(range(temp.shape[0]), temp)

In [None]:
# Zoom in on a 1000-sample window of the series: samples 1000..1999.
first, last = 1000, 2000
plt.plot(range(last - first), temp[first:last])

In [None]:
# Normalize data

print(float_data[0])

# Standardize every column in place: subtract the column mean, then divide by
# the column standard deviation. Operating in place keeps existing views of
# float_data (such as `temp`) pointing at the normalized values.
# NOTE(review): the statistics are computed over all rows, including the rows
# later used as the test split - confirm this is intended.
mean = float_data.mean(axis=0)
np.subtract(float_data, mean, out=float_data)
std = float_data.std(axis=0)
np.divide(float_data, std, out=float_data)

print(float_data[0])

In [None]:
# Re-read column 1 of float_data as the temperature series; float_data was
# normalized in place, so these are the normalized values.
temp = float_data[:, 1] 

In [None]:
# Same 1000-sample window as before (samples 1000..1999), now on the
# normalized series.
first, last = 1000, 2000
plt.plot(range(last - first), temp[first:last])

# Sanity check - Baseline - Naive model ... temp at n+1 = temp at n

In [None]:
# Naive baseline: predict temp[n+1] = temp[n], so the error of every step is
# simply the difference between two consecutive samples.
step_err = np.diff(temp)

# np.diff covers every consecutive pair (including the last one), and np.mean
# divides by the exact number of errors. The previous hand-written loop added
# a dummy zero error for the first sample, never scored the last transition
# and divided by the leftover loop index.
mse = np.mean(step_err ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(step_err))

print('RMSE =',round(rmse,5))
print('MAE =',round(mae,5))
# std[1] is the standard deviation column 1 was divided by during
# normalization, so multiplying undoes that scaling for the MAE.
print('Celsius MAE ', round(mae * std[1],2))


In [None]:
# Naive model prediction on a subset

UlimPred = 200
LlimPred = 100

# For every step in [LlimPred, UlimPred) the naive forecast is the value
# observed one step earlier.
predTemp = np.array([temp[step - 1] for step in range(LlimPred, UlimPred)])
predTemp.shape


In [None]:
# Superimposed: model prediction (blue) vs reality (red)

plt.rcParams["figure.figsize"] = [16,9]

# Actual values for the window [LlimReal, UlimReal)
UlimReal = 200
LlimReal = 100
SampleSizeReal = UlimReal - LlimReal
plt.plot(range(SampleSizeReal), temp[LlimReal:UlimReal], 'r', label="Actual")

# predTemp only holds the window itself, so it is indexed from 0
UlimPred = UlimReal - LlimReal
LlimPred = 0
SampleSizePred = UlimPred - LlimPred
plt.plot(range(SampleSizePred), predTemp[LlimPred:UlimPred], 'b', label="Predicted")

# Single legend call after both lines exist; the entries come from the
# label= arguments above. (The earlier call that passed two plain strings as
# handles/labels was removed.)
plt.legend(loc="upper left")

plt.title('Naive model RMSE = 0.03')
plt.xlabel('Time')
plt.ylabel('Temp normalized')

plt.show()


### Baseline naive model RMSE = 0.029 MAE = 0.019

### This naive model will be *very difficult* to beat...

# FB Prophet univariate

### Prophet REQUIRES a pandas DataFrame in the layout below: the date column must be named `ds` and the value column `y`


In [None]:
# Evenly spaced timestamps, one every 10 minutes, from 1/1/2009 to 1/1/2017.
date_rng = pd.date_range('1/1/2009', '1/1/2017', freq='10T')
date_rng[:420551]

In [None]:
# Two-column frame in the layout Prophet expects: the first 420551 timestamps
# of date_rng in 'ds' and the temperature series in 'y'.
temp4Prophet = pd.DataFrame(
    {'ds': date_rng[:420551], 'y': temp},
    columns=['ds', 'y'],
)
temp4Prophet

### We reduce dataset for Prophet to one sample every 6 steps - one sample per hour instead of one every 10 minutes

In [None]:
# Target column before the dataset is reduced.
plt.plot(temp4Prophet.y)

In [None]:
print(temp4Prophet.shape)
# Keep every 6th row (rows 0, 6, 12, ...), all columns.
temp4Prophet = temp4Prophet.iloc[::6]
temp4Prophet.shape

In [None]:
# Target column after keeping only every 6th row.
plt.plot(temp4Prophet.y)


In [None]:
# Copy of the 'y' column as a one-column frame with a fresh 0..N-1 index
# (the old index is discarded rather than kept as a column).
ds4Naive = temp4Prophet[['y']].reset_index(drop=True)
ds4Naive

In [None]:
# Baseline naive model on the smaller dataset

# Pull the series out once as a NumPy array instead of doing a per-row
# .iloc lookup inside a Python loop.
reduced_series = ds4Naive['y'].values

# Naive forecast = previous value, so each error is the difference between
# consecutive samples. np.diff scores every consecutive pair, including the
# last one, and np.mean divides by the exact number of errors rather than by
# a leftover loop index.
step_err = np.diff(reduced_series)

mse = np.mean(step_err ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(step_err))

print('RMSE =',round(rmse,5))
print('MAE =',round(mae,5))
# std[1] is the standard deviation column 1 was divided by during
# normalization, so multiplying undoes that scaling for the MAE.
print('Celsius MAE ', round(mae * std[1],2))

### Baseline for the reduced dataset RMSE = 0.121 ... MAE = 0.084

In [None]:
#Split into train = 300k/6 and test = 420551/6 - 300k/6

# Training split: the first 50000 rows of the reduced frame.
trainProphet = temp4Prophet.head(50000)
print(trainProphet.shape)
trainProphet.head()

In [None]:
# Test split: every row after the first 50000 rows used for training.
# Starting at row 50000 (not 50001) leaves no gap after the training rows and
# keeps the test rows aligned with forecast['yhat'][50000:], whose first entry
# is the first timestamp following the training data.
testProphet = temp4Prophet.iloc[50000:]
print(testProphet.shape)
testProphet.head()

## Prophet hyper param optimization


### Following got stuck after 2.5 hours...

#Prophet hyper param optimization


param_grid = {  
    'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
    'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
}

#Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []  # Store the RMSEs for each params here

#Use cross validation to evaluate all parameters
for params in all_params:
    m = Prophet(**params).fit(trainProphet)  # Fit model with given params
    df_cv = cross_validation(m, horizon='30 days')
    df_p = performance_metrics(df_cv, rolling_window=1)
    rmses.append(df_p['rmse'].values[0])

#Find the best parameters
tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
print(tuning_results)


https://facebook.github.io/prophet/docs/diagnostics.html#hyperparameter-tuning

**changepoint_prior_scale**: This is probably the most impactful parameter. It determines the flexibility of the trend, and in particular how much the trend changes at the trend changepoints. As described in this documentation, if it is too small, the trend will be underfit and variance that should have been modeled with trend changes will instead end up being handled with the noise term. If it is too large, the trend will overfit and in the most extreme case you can end up with the trend capturing yearly seasonality. **The default of 0.05** works for many time series, but this could be tuned; a range of [0.001, 0.5] would likely be about right. Parameters like this (regularization penalties; this is effectively a lasso penalty) are often tuned on a log scale.

**seasonality_prior_scale**: This parameter controls the flexibility of the seasonality. Similarly, a large value allows the seasonality to fit large fluctuations, a small value shrinks the magnitude of the seasonality. **The default is 10**, which applies basically no regularization. That is because we very rarely see overfitting here (there’s inherent regularization with the fact that it is being modeled with a truncated Fourier series, so it’s essentially low-pass filtered). A reasonable range for tuning it would probably be [0.01, 10]; when set to 0.01 you should find that the magnitude of seasonality is forced to be very small. This likely also makes sense on a log scale, since it is effectively an L2 penalty like in ridge regression.

**seasonality_mode**: Options are ['additive', 'multiplicative']. **Default is 'additive'**, but many business time series will have multiplicative seasonality. This is best identified just from looking at the time series and seeing if the magnitude of seasonal fluctuations grows with the magnitude of the time series (see the documentation here on multiplicative seasonality), but when that isn’t possible, it could be tuned.



### Train / Fit

In [None]:
%%time

# Train / Fit

# Prophet expects a pandas DataFrame whose date column is named 'ds' and
# whose value column is named 'y'.

prophet_kwargs = dict(changepoint_prior_scale=0.001,
                      seasonality_prior_scale=0.01)
model = Prophet(**prophet_kwargs)

# Alternative configurations, kept for reference:
#model = Prophet(seasonality_mode='multiplicative', changepoint_prior_scale=0.5)
#model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True, seasonality_mode='multiplicative')
model.fit(trainProphet)  # fit the model on the training frame
print('Prophet is trained')

### Forecast / Predict

In [None]:
%%time

# Forecast / Predict

# Extend the training timestamps by one hourly step per test row, then
# predict over the whole extended frame.
n_test_rows = len(testProphet)
future = model.make_future_dataframe(periods=n_test_rows, freq='1H')
forecast = model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

In [None]:
# Actual test values, and the forecasts that follow the first 50000 rows of
# the forecast frame.
dataList = list(testProphet.y.values)
yhat_test = forecast['yhat'].iloc[50000:]
print(len(dataList))
print(len(yhat_test))

rmse = sqrt(mean_squared_error(dataList, yhat_test))
print('Prophet RMSE: %.3f' % rmse)

mae = mean_absolute_error(dataList, yhat_test)
print('Prophet MAE: %.3f' % mae)

# Hyper params optimization results

* seasonality_mode='additive' (default)


* changepoint_prior_scale=0.5 , seasonality_prior_scale = 10 ... RMSE = 0.772
* changepoint_prior_scale=0.1 , seasonality_prior_scale = 10 ... RMSE = 0.771
* changepoint_prior_scale=0.05 , seasonality_prior_scale = 10 ... RMSE =  0.755 (default)
* changepoint_prior_scale=0.01 , seasonality_prior_scale = 10 ... RMSE = 0.718
* changepoint_prior_scale=0.001 , seasonality_prior_scale = 10 ... RMSE = 0.707


* changepoint_prior_scale=0.001 , seasonality_prior_scale = 1 ... RMSE = 0.758
* changepoint_prior_scale=0.001 , seasonality_prior_scale = 0.1 ... RMSE = 0.758
* changepoint_prior_scale=0.001 , seasonality_prior_scale = 0.01 ... **RMSE = 0.705**


In [None]:
# Draw the forecast with the model's own plot method; the returned object is
# kept in FigFor.
FigFor = model.plot(forecast)


In [None]:
# Draw the forecast components with the model's own plot_components method;
# the returned object is kept in figComp.
figComp = model.plot_components(forecast)


In [None]:
plt.style.use('seaborn-poster')
plt.figure()
# First 20000 test values against forecast rows 50000..69999.
predicted = forecast['yhat'].values[50000:70000]
plt.plot(dataList[:20000], label='Original')
plt.plot(predicted, ls='--', label="Predicted")
plt.legend(loc='best')
plt.title('FB Prophet univariate RMSE = 0.7')
plt.show()

In [None]:
plt.style.use('seaborn-poster')
plt.figure()
# Close-up: first 500 test values against forecast rows 50000..50499.
predicted = forecast['yhat'].values[50000:50500]
plt.plot(dataList[:500], label='Original')
plt.plot(predicted, ls='--', label="Predicted")
plt.legend(loc='best')
plt.title('FB Prophet model - univariate')
plt.show()

# FB Prophet multivariate

* Add additional features / cols with add_regressor ... https://facebook.github.io/prophet/docs/seasonality,_holiday_effects,_and_regressors.html#additional-regressors

In [None]:
print(float_data.shape)
# Remove column 1 (the column the target `temp` is read from) so that only
# the remaining variables are left, then wrap them in a DataFrame.
float_dataNoTemp = np.delete(float_data, obj=1, axis=1)
NoTemp = pd.DataFrame(data=float_dataNoTemp)
print(NoTemp.shape)
NoTemp.head()

In [None]:
# Rebuild the full-resolution two-column frame: the first 420551 timestamps
# of date_rng in 'ds' and the temperature series in 'y'.
temp4Prophet = pd.DataFrame(
    {'ds': date_rng[:420551], 'y': temp},
    columns=['ds', 'y'],
)
print(temp4Prophet.shape)
temp4Prophet.head()

In [None]:
# Put the target frame and the remaining variables side by side, then name
# the 13 extra columns v0 .. v12.
MultiVar4Prophet = pd.concat([temp4Prophet, NoTemp], axis=1)
regressor_names = ['v%d' % k for k in range(13)]
MultiVar4Prophet.columns = ['ds', 'y'] + regressor_names
print(MultiVar4Prophet.shape)
MultiVar4Prophet

In [None]:
# Reduce dataset by a factor of 6 - one row per hour instead of every 10 minutes

print(MultiVar4Prophet.shape)
# Keep rows 0, 6, 12, ... with all columns.
MultiVar4Prophet = MultiVar4Prophet.iloc[::6]
MultiVar4Prophet.shape

In [None]:
#Split into train = 300k/6 and test = 420551/6 - 300k/6

# Training split: the first 50000 rows of the reduced multivariate frame.
trainProphet = MultiVar4Prophet.head(50000)
print(trainProphet.shape)
trainProphet.head()

In [None]:
# Test split: every row after the first 50000 training rows, re-indexed
# from 0. Starting at row 50000 (not 50001) avoids silently dropping one row
# between the two splits, and reset_index(drop=True) returns a new frame
# instead of modifying a slice of MultiVar4Prophet in place.
testProphet = MultiVar4Prophet.iloc[50000:].reset_index(drop=True)
print(testProphet.shape)
testProphet.head()

In [None]:
%%time

# Train

# Adding the other features / cols = Multivariate

model = Prophet(changepoint_prior_scale=0.001, seasonality_prior_scale=0.01)

# Register the 13 extra columns v0 .. v12, in order, as additional regressors.
for k in range(13):
    model.add_regressor('v%d' % k)

model.fit(trainProphet)  # fit the model on the training frame
print('Prophet is trained')

In [None]:
# Show the first rows of the test frame that is passed to model.predict below.
testProphet.head()

In [None]:
%%time

# Forecast / Predict

# Predict directly on the test frame: it carries 'ds' together with the
# regressor columns v0 .. v12 that were registered with add_regressor.
# A frame built by make_future_dataframe would only contain 'ds', so the
# unused call that created one was removed.
# NOTE(review): the regressor values come from the same rows as the target
# being predicted - confirm this is the intended evaluation setup.
forecast = model.predict(testProphet)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

In [None]:
# Actual test values, plus a check that there is one forecast per value.
dataList = list(testProphet['y'].values)
print(len(dataList))
print(forecast['yhat'].shape[0])

In [None]:
# Score the forecasts against the actual test values.
yhat_test = forecast['yhat']

rmse = sqrt(mean_squared_error(dataList, yhat_test))
print('Prophet RMSE: %.3f' % rmse)

mae = mean_absolute_error(dataList, yhat_test)
print('Prophet MAE: %.3f' % mae)

In [None]:
plt.style.use('seaborn-poster')
plt.figure()
# All test values against all forecasts.
predicted = forecast['yhat'].values
plt.plot(dataList, label='Original')
plt.plot(predicted, ls='--', label="Predicted")
plt.legend(loc='best')
plt.title('FB Prophet multivariate RMSE = 0.004')
plt.show()

In [None]:
plt.style.use('seaborn-poster')
plt.figure()
# Close-up: the first 500 test values against the first 500 forecasts.
predicted = forecast['yhat'].values[:500]
plt.plot(dataList[:500], label='Original')
plt.plot(predicted, ls='--', label="Predicted")
plt.legend(loc='best')
plt.title('FB Prophet model - multivariate')
plt.show()