<a href="https://colab.research.google.com/github/smannan/LIFXBulbAnalysis/blob/main/PGE_ARIMA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the CSV path read later in this notebook lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip3 install pmdarima

In [None]:
import pandas as pd

In [None]:
# Location of the processed PG&E readings CSV on the mounted drive.
filename = '/content/drive/Shared drives/CMPE295B/ProcessedPGEData/pge_data_from_influx_2020-10-10-20_30_2021-10-01-00_00.csv'

# `infer_datetime_format` is intentionally not passed: without `parse_dates` it had no
# effect on the result, and pandas 2.x deprecates the argument.
# 'Unnamed: 0' is presumably a leftover index column from an earlier to_csv export
# (TODO confirm); nothing in this notebook reads it.
data = pd.read_csv(filename).drop(columns=['Unnamed: 0'])

# Convert the timestamp column to datetimes so `.dt` accessors and min()/max() work on it.
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [None]:
# Preview the first rows of the loaded readings.
data.head()

In [None]:
# Report the summed cost together with the first and last timestamps in the data.
total_cost = data['cost'].sum()
first_reading = data['timestamp'].min()
last_reading = data['timestamp'].max()
print("total energy cost = {0}$ from {1} to {2}".format(total_cost, first_reading, last_reading))

In [None]:
# Report the summed usage together with the first and last timestamps in the data.
total_usage = data['usage'].sum()
first_reading = data['timestamp'].min()
last_reading = data['timestamp'].max()
print("total energy usage = {0} kWH from {1} to {2}".format(total_usage, first_reading, last_reading))

In [None]:
# Number of rows in the loaded frame (before any aggregation).
reading_count = len(data)
print("total readings in data frame {0}".format(reading_count))

In [None]:
def drop_hours_from_timestamp(bulb_data):
  """Truncate the 'timestamp' column of `bulb_data` to day precision, in place.

  Every timestamp is replaced by midnight of its calendar date, so readings
  taken on the same day end up with the same value. The resulting column is
  always timezone-naive; for timezone-aware input the calendar date is taken
  from the wall-clock time in that timezone, which is what formatting with
  '%Y-%m-%d' and re-parsing produces.

  Args:
    bulb_data: DataFrame with a datetime-typed 'timestamp' column. The column
      is overwritten; other columns are left untouched.

  Returns:
    None.
  """
  timestamps = bulb_data['timestamp']
  if timestamps.dt.tz is not None:
    # Drop the timezone but keep the wall-clock time, so the date is unchanged.
    timestamps = timestamps.dt.tz_localize(None)
  # normalize() zeroes the time of day directly instead of formatting every
  # value to a string and parsing it back.
  bulb_data['timestamp'] = timestamps.dt.normalize()

In [None]:
# Truncate the timestamps in `data` to the day (modifies `data` in place).
drop_hours_from_timestamp(data)

### Aggregate data daily

In [None]:
# Sum cost and usage over all rows sharing a timestamp, then turn the grouping
# key back into an ordinary 'timestamp' column.
column_totals = {'cost': 'sum', 'usage': 'sum'}
data_aggd = data.groupby(['timestamp']).agg(column_totals).reset_index()

In [None]:
# Number of rows left after grouping by timestamp.
daily_reading_count = len(data_aggd)
print("total readings in data frame aggregated daily {0}".format(daily_reading_count))

In [None]:
# Preview the first rows of the aggregated frame.
data_aggd.head()

### Plot cost time series and view stats

In [None]:
# Summary statistics of the aggregated cost column.
data_aggd['cost'].describe()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Line chart of aggregated cost against timestamp, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots()
ax.plot(data_aggd['timestamp'], data_aggd['cost'])
# Let matplotlib lay out the date tick labels on the x axis.
fig.autofmt_xdate()
ax.set_title('Household electricity daily cost over time')
ax.set_ylabel('Cost in dollars')
ax.set_xlabel('Time')
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
# Autocorrelation of the aggregated cost series for lags 0..60.
acf_figure = plot_acf(data_aggd[['cost']], lags=60)
# Label the axes through the figure plot_acf returned rather than via pyplot's current axes.
acf_axes = acf_figure.axes[0]
acf_axes.set_xlabel('Lag')
acf_axes.set_ylabel('Correlation')
plt.show()

In [None]:
def split_train_test(data, split=0.8):
  """Split the 'cost' column into a leading train part and a trailing test part.

  Rows are split by position and never shuffled, so `data` is expected to be
  in chronological order already.

  Args:
    data: DataFrame with a 'cost' column. It is not modified.
    split: Fraction of rows assigned to the training part (default 0.8). The
      resulting row count is truncated with int().

  Returns:
    A (train, test) tuple of 2-D NumPy arrays, each with a single column.
  """
  # Selecting with a list keeps the result 2-D, which is the shape callers
  # already receive. No timestamp index is set: converting to a NumPy array
  # discards the index, so setting one in place only added work.
  values = data[['cost']].to_numpy()
  size = int(len(values) * split)
  return (values[:size], values[size:])

In [None]:
# Split the aggregated cost series with the default 80/20 ratio and show both sizes.
train, test = split_train_test(data_aggd)
print (len(train), len(test))

In [None]:
from pmdarima.arima import ADFTest
# Run pmdarima's ADFTest on the training series with alpha = 0.05.
# NOTE(review): should_diff presumably reports the p-value and whether differencing is advised — confirm against the pmdarima docs.
ADFTest(alpha = 0.05).should_diff(train)

In [None]:
from pmdarima.arima import auto_arima

# Fit pmdarima's auto_arima on the training series with default settings, then display the model summary.
auto_arima_model = auto_arima(train)
auto_arima_model.summary()

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from math import sqrt

def rolling_predictions(train, test, order, modelType='ARIMA'):
  """Walk-forward validation: refit on all known points, forecast one step, repeat.

  For each element of `test`, a new model is fitted on every observation seen
  so far, a single one-step-ahead forecast is recorded, and the true value is
  then appended to the history before the next iteration.

  Args:
    train: Sequence of observations that seeds the history.
    test: Sequence of held-out observations, visited in order.
    order: Model order, passed through unchanged as the `order` argument.
    modelType: 'ARIMA' (default) or 'SARIMAX'; matched case-insensitively.

  Returns:
    A (predictions, rmse) tuple: the list of forecasts, one per element of
    `test`, and the root mean squared error of those forecasts against `test`.

  Raises:
    ValueError: if `modelType` names neither supported model. Previously any
      value other than exactly 'SARIMAX' silently selected ARIMA, so a typo
      evaluated the wrong model without warning.
  """
  model_name = str(modelType).upper()
  if model_name not in ('ARIMA', 'SARIMAX'):
    raise ValueError(
        "modelType must be 'ARIMA' or 'SARIMAX', got {0!r}".format(modelType))

  # Copy the training points so appending below never touches the caller's data.
  history = list(train)
  predictions = []

  # Iterate positionally over the held-out points.
  for actual in test:
    if model_name == 'SARIMAX':
      # disp=False keeps the optimizer from printing its log on every refit.
      fitted = SARIMAX(history, order=order).fit(disp=False)
    else:
      fitted = ARIMA(history, order=order).fit()

    # forecast() with no arguments yields a single step; keep its only element.
    predictions.append(fitted.forecast()[0])
    # Reveal the true value before the next refit.
    history.append(actual)

  rmse = sqrt(mean_squared_error(test, predictions))
  return (predictions, rmse)

In [None]:
# Walk-forward evaluation of SARIMAX with order (0, 1, 2) over the test split.
# NOTE(review): the order is presumably the one reported by auto_arima — confirm.
predictions_sarimax, rmse_sarimax = rolling_predictions(train, test, order=(0, 1, 2), modelType='SARIMAX')

In [None]:
def plot_preds(test, predictions):
  """Plot held-out observations together with the model's predictions.

  Draws on pyplot's current figure and then shows it.

  Args:
    test: Observed values, drawn in matplotlib's default line color.
    predictions: Predicted values for the same timesteps, drawn in red.
  """
  plt.title('Daily household energy cost predictions')
  plt.xlabel('Timestep (days)')
  plt.ylabel('Cost in Dollars')
  plt.plot(test, label='Actual')
  plt.plot(predictions, color='red', label='Predicted')
  # A legend lets the figure be read without knowing which color is which.
  plt.legend()
  plt.show()

In [None]:
# Plot the SARIMAX forecasts against the test series and print their RMSE.
plot_preds(test, predictions_sarimax)
print (rmse_sarimax)

In [None]:
# Walk-forward evaluation of ARIMA with order (7, 1, 2) over the test split.
# NOTE(review): p=7 presumably targets a weekly pattern in the daily series — confirm.
predictions_arima, rmse_arima = rolling_predictions(train, test, order=(7, 1, 2), modelType='ARIMA')

In [None]:
# Plot the ARIMA forecasts against the test series and print their RMSE.
plot_preds(test, predictions_arima)
print (rmse_arima)

Walk-forward RMSE on the held-out test split (daily cost in dollars; lower is better):

- SARIMAX(0, 1, 2): RMSE = 0.6299
- ARIMA(7, 1, 2): RMSE = 0.6413