# Steam Data Cleaning and Modelling

# <u> **Steam Data Cleaning** </u>

In [None]:
#imports 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
#load dataset 
# Read the steam CSV from the Kaggle input directory (path is relative to the
# notebook's working directory).
steam = pd.read_csv("../input/bdgp2-further-cleaned-datasets/steam_cleaned2.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
steam.head()

In [None]:
# (rows, columns) of the loaded frame.
steam.shape

In [None]:
# Column dtypes before any conversion is applied.
steam.dtypes

Decided to change timestamp into DateTime object to resample into weekly data 

In [None]:
#change to DateTime format
steam["timestamp"] = pd.to_datetime(steam["timestamp"], format = "%Y-%m-%d %H:%M:%S")

In [None]:
#check if changed successfully 
steam.dtypes

In [None]:
#set time as index 
steam = steam.set_index("timestamp")

In [None]:
steam = steam.drop("Unnamed: 0", axis=1)

In [None]:
#average by week
steam = steam.resample("W").mean()

Created new data frames for each location to help predict the steam usage per location 

In [None]:
#separate by location 
def _site_columns(frame, site_name):
    """Return the labels of every column in `frame` whose name contains `site_name`."""
    return [col for col in frame.columns if site_name in col]


# Each site frame is taken as one column selection and copied, rather than
# grown column by column from an empty DataFrame. The copy keeps later column
# additions on a site frame from touching `steam`.
P = _site_columns(steam, 'Peacock')
Peacock = steam[P].copy()

M = _site_columns(steam, 'Moose')
Moose = steam[M].copy()

B = _site_columns(steam, 'Bull')
Bull = steam[B].copy()

H = _site_columns(steam, 'Hog')
Hog = steam[H].copy()

E = _site_columns(steam, 'Eagle')
Eagle = steam[E].copy()

C = _site_columns(steam, 'Cockatoo')
Cockatoo = steam[C].copy()

In [None]:
#check if data frames were created correctly and proper resampling 

In [None]:
# First rows of the Peacock frame.
Peacock.head()

In [None]:
# First rows of the Moose frame.
Moose.head()

In [None]:
# First rows of the Bull frame.
Bull.head()

In [None]:
# First rows of the Hog frame.
Hog.head()

In [None]:
# First rows of the Eagle frame.
Eagle.head()

In [None]:
# First rows of the Cockatoo frame.
Cockatoo.head()

Summed up the usage of each location per week and created a new column 

Printed results to check that "Location Energy Sum" and locations were split correctly

In [None]:
Peacock["Location Energy Sum"] = Peacock.sum(axis=1)
Peacock.head()

In [None]:
Moose["Location Energy Sum"] = Moose.sum(axis=1)
Moose.head()

In [None]:
Bull["Location Energy Sum"] = Bull.sum(axis=1)
Bull.head()

In [None]:
Hog["Location Energy Sum"] = Hog.sum(axis=1)
Hog.head()

In [None]:
Eagle["Location Energy Sum"] = Eagle.sum(axis=1)
Eagle.head()

In [None]:
Cockatoo["Location Energy Sum"] = Cockatoo.sum(axis=1)
Cockatoo.head()

# <u> **Modelling by Location** </u>

In [None]:
#imports 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor

from sklearn import metrics

from sklearn.model_selection import cross_validate

import datetime as dt

import seaborn as sns

# Peacock

Attempted basic models first: linear regression, decision tree, elastic net, lasso and ridge 

In [None]:
Peacock = Peacock.reset_index() #reset index 

In [None]:
#convert back into numerical value to be able to model using linear regression
Peacock ['timestamp'] = Peacock["timestamp"].map(dt.datetime.toordinal) #returns the proleptic Gregorian ordinal of a date

In [None]:
X = Peacock["timestamp"]  # numpy array
y = Peacock["Location Energy Sum"] # numpy array

In [None]:
model = LinearRegression() #instantiate model

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2) #20% for testing 

#required reshaping to fit and predict for the model
X_train_reshape = X_train.values.reshape(-1,1)
X_test_reshape = X_test.values.reshape(-1,1)

In [None]:
#proper shape for modelling and predicting 
print(X_train_reshape.shape)
print(y_train.shape)

print(X_test_reshape.shape)
print(y_test.shape)

In [None]:
model.fit(X_train_reshape, y_train)

In [None]:
predictions = model.predict(X_test_reshape)

In [None]:
for y, y_pred in list(zip(y_test, predictions))[:5]:
    print("Real value: {:.3f} Estimated value: {:.5f}".format(y, y_pred))

In [None]:
rsq = metrics.r2_score(y_test, predictions)
print(f"the R-squaared score is {rsq}")

mse = metrics.mean_squared_error(y_test, predictions)
print(f"the Mean Absolute Error is {mse}")

rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
print(f"the Root Mean Squared Error is {rmse}")

Initial model done with Linear Regression
 - Metrics are not great 
 - Next: try cross validation approach with different models and compare between different models

Cross-validation Approach

In [None]:
results = {} # maps a model label to the dict of averaged cross-validation results returned by evaluate_model

In [None]:
def evaluate_model(estimator, X, y, cv=50, scoring="neg_mean_squared_error"):
    """Cross-validate `estimator` and return the averaged results as a dict.

    Parameters
    ----------
    estimator : estimator object passed straight to `cross_validate`.
    X, y : features and target passed straight to `cross_validate`.
    cv : cross-validation setting forwarded to `cross_validate`
        (default 50, the value previously hard-coded).
    scoring : scoring name forwarded to `cross_validate`
        (default "neg_mean_squared_error", as before).

    Returns
    -------
    dict
        One entry per column of the `cross_validate` output, holding the mean
        of that column's absolute values across folds. Taking the absolute
        value turns the negated error scores into positive numbers.
    """
    cv_results = cross_validate(
        estimator,
        X=X,
        y=y,
        scoring=scoring,
        n_jobs=-1,  # use all available cores
        cv=cv,
        return_train_score=True,
    )
    return pd.DataFrame(cv_results).abs().mean().to_dict()

In [None]:
linreg  = LinearRegression()
dtree   = DecisionTreeRegressor()
elastic = ElasticNet()
lasso   = Lasso()
ridge   = Ridge()

In [None]:
results["linear reg"] = evaluate_model(linreg, X_train_reshape, y_train)
results["tree"] = evaluate_model(dtree, X_train_reshape, y_train)
results["elasticnet"] = evaluate_model(elastic, X_train_reshape, y_train)
results["lasso"] = evaluate_model(lasso, X_train_reshape, y_train)
results["ridge"] = evaluate_model(ridge, X_train_reshape, y_train)

pd.DataFrame.from_dict(results).T

test_score and train_score for all models are not optimal 
 - Next, try ARIMA model

## ARIMAX Modelling

In [None]:
from scipy.stats import norm
import statsmodels.api as sm
import matplotlib.pyplot as plt

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

In [None]:
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally and raise TypeError on positional x/y.
sns.scatterplot(x=Peacock["timestamp"], y=Peacock["Location Energy Sum"])

On the scatterplot, we are plotting data from 2 years on the x axis and the steam usage on the y axis.
* We can see that there is a trend within each year and that it is related to the seasons 
    * Usage is higher in the winter 
    
We will use the first year to train the model and the second year to predict 

In [None]:
peacock_model_data = Peacock[["Location Energy Sum"]]
train = peacock_model_data.iloc[0:(len(peacock_model_data)-53)].copy()
test = peacock_model_data.iloc[len(train):(len(peacock_model_data) -1)].copy()

In [None]:
train.shape

In [None]:
test.shape

In [None]:
ax = train.plot(figsize=(25,4))
test.plot(ax=ax)

In [None]:
sm.graphics.tsa.plot_pacf(train,lags=30)
plt.show()

In [None]:
sm.graphics.tsa.plot_acf(train,lags=50)
plt.show()

In [None]:
Peacock = Peacock.set_index("timestamp")
decomp = seasonal_decompose(Peacock["Location Energy Sum"], period=12)
decomp.plot()

In [None]:
result = adfuller(Peacock["Location Energy Sum"])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

The p-value is significant when checking for stationarity 
* This indicates that the statistical properties of the data, such as the mean, variance and standard deviation, are constant over time

Trial 1 of SARIMAX Modelling 

In [None]:
endog = train

mod = sm.tsa.statespace.SARIMAX(endog=endog, order=(1,0,1) ,seasonal_order=(1,0, 0, 12))

In [None]:
mod_fit = mod.fit()
mod_fit.summary()

In [None]:
train.plot(figsize=(25,10))
mod_fit.fittedvalues.plot()
plt.show()

In [None]:
#Predict
pred = mod_fit.predict()
test["pred"] = pred.values
test.head()

After predicting with the model fitted on the training set, we can see that the predicted values (blue line) are similar to the actual values (red line)

In [None]:
test["Location Energy Sum"].plot(figsize=(25,10),color = 'red')
test["pred"].plot()
plt.show()

In [None]:
test['residual'] = abs(test['Location Energy Sum']-test['pred'])
MAE = test['residual'].sum()/len(test)
MAPE = (abs(test['residual'])/test['Location Energy Sum']).sum()*100/len(test)
print("MAE:", MAE)
print("MAPE:", MAPE)

Trial 2 of SARIMAX Modelling

In [None]:
! pip install pmdarima

In [None]:
pip install --upgrade pip

In [None]:
import pmdarima as pm

# Stepwise seasonal search with differencing fixed at d=1, D=1 and a seasonal
# period of m=12; non-seasonal p and q start at 1 and go up to 3.
smodel = pm.auto_arima(
    train,
    start_p=1,
    start_q=1,
    test="adf",
    max_p=3,
    max_q=3,
    m=12,
    d=1,
    D=1,
    start_P=0,
    seasonal=True,
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
    stepwise=True,
)

smodel.summary()

From the auto arima function, the best model is SARIMAX(0, 1, 1) x (0, 1, [1,2], 12)

In [None]:
mod_1 = sm.tsa.statespace.SARIMAX(train, order=(0,1,1), seasonal_order=(0,1,[1,2],12))
mod_1_fit = mod_1.fit()
mod_1_fit.summary()

In [None]:
train.plot(figsize=(25,10))
mod_1_fit.fittedvalues.plot()
plt.show()

In [None]:
pred_1 = mod_1_fit.predict()
test["pred_1"] = pred_1.values
test.head()

After predicting with the model fitted on the training set, we can see that the predicted values (blue line) are similar to the actual values (red line)

In [None]:
test["Location Energy Sum"].plot(figsize=(25,10),color = 'red')
test["pred_1"].plot()
plt.show()

In [None]:
test['residual_1'] = abs(test['Location Energy Sum']-test['pred_1'])
MAE = test['residual_1'].sum()/len(test)
MAPE = (abs(test['residual_1'])/test['Location Energy Sum']).sum()*100/len(test)
print("MAE:", MAE)
print("MAPE:", MAPE)

SARIMAX model produced better results than the 5 basic models done before. Now, will continue to use SARIMAX to model the other locations 

# Moose 

In [None]:
moose_model_data = Moose[["Location Energy Sum"]]
train = moose_model_data.iloc[0:(len(moose_model_data)-53)].copy()
test = moose_model_data.iloc[len(train):(len(moose_model_data) -1)].copy()

In [None]:
sm.graphics.tsa.plot_pacf(train,lags=30)
plt.show()

In [None]:
sm.graphics.tsa.plot_acf(train,lags=50)
plt.show()

In [None]:
decomp = seasonal_decompose(Moose["Location Energy Sum"], period=12)
decomp.plot()

In [None]:
result = adfuller(Moose["Location Energy Sum"])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

The p-value is significant when checking for stationarity

* This indicates that the statistical properties of the data, such as the mean, variance and standard deviation, are constant over time

In [None]:
import pmdarima as pm

# Stepwise seasonal search with differencing fixed at d=1, D=1 and a seasonal
# period of m=12; non-seasonal p and q start at 1 and go up to 3.
smodel = pm.auto_arima(
    train,
    start_p=1,
    start_q=1,
    test="adf",
    max_p=3,
    max_q=3,
    m=12,
    d=1,
    D=1,
    start_P=0,
    seasonal=True,
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
    stepwise=True,
)

smodel.summary()

From the auto arima function, the best model is SARIMAX(2, 1, 2) x (0, 1, [1], 12)

In [None]:
mod = sm.tsa.statespace.SARIMAX(train, order=(2,1,2), seasonal_order=(0,1,[1],12))
mod_fit = mod.fit()
mod_fit.summary()

In [None]:
pred = mod_fit.predict()
test["pred"] = pred.values
test.head()

In [None]:
test["Location Energy Sum"].plot(figsize=(25,10),color = 'red')
test["pred"].plot()
plt.show()

In [None]:
test['residual'] = abs(test['Location Energy Sum']-test['pred'])
MAE = test['residual'].sum()/len(test)
MAPE = (abs(test['residual'])/test['Location Energy Sum']).sum()*100/len(test)
print("MAE:", MAE)
print("MAPE:", MAPE)

The predictions and metrics above differ from those shown in my notebook at run time. 

# Bull

In [None]:
bull_model_data = Bull[["Location Energy Sum"]]
train = bull_model_data.iloc[0:(len(bull_model_data)-53)].copy()
test = bull_model_data.iloc[len(train):(len(bull_model_data) -1)].copy()

In [None]:
sm.graphics.tsa.plot_pacf(train,lags=30)
plt.show()

In [None]:
sm.graphics.tsa.plot_acf(train,lags=50)
plt.show()

In [None]:
decomp = seasonal_decompose(Bull["Location Energy Sum"], period=12)
decomp.plot()

In [None]:
result = adfuller(Bull["Location Energy Sum"])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

The p-value is not significant when checking for stationarity

* This indicates that the statistical properties of the data, such as the mean, variance and standard deviation, are not constant over time

In [None]:
# Stepwise seasonal search with differencing fixed at d=1, D=1 and a seasonal
# period of m=12; non-seasonal p and q start at 1 and go up to 3.
smodel = pm.auto_arima(
    train,
    start_p=1,
    start_q=1,
    test="adf",
    max_p=3,
    max_q=3,
    m=12,
    d=1,
    D=1,
    start_P=0,
    seasonal=True,
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
    stepwise=True,
)

smodel.summary()

From the auto arima function, the best model is SARIMAX(1, 1, 1) x (0, 1, [], 12)

In [None]:
mod = sm.tsa.statespace.SARIMAX(train, order=(1,1,1), seasonal_order=(0,1,[],12))
mod_fit = mod.fit()
mod_fit.summary()

In [None]:
pred = mod_fit.predict()
test["pred"] = pred.values
test.head()

In [None]:
test["Location Energy Sum"].plot(figsize=(25,10),color = 'red')
test["pred"].plot()
plt.show()

In [None]:
test['residual'] = abs(test['Location Energy Sum']-test['pred'])
MAE = test['residual'].sum()/len(test)
MAPE = (abs(test['residual'])/test['Location Energy Sum']).sum()*100/len(test)
print("MAE:", MAE)
print("MAPE:", MAPE)

# Hog 

In [None]:
hog_model_data = Hog[["Location Energy Sum"]]
train = hog_model_data.iloc[0:(len(hog_model_data)-53)].copy()
test = hog_model_data.iloc[len(train):(len(hog_model_data) -1)].copy()

In [None]:
sm.graphics.tsa.plot_pacf(train,lags=30)
plt.show()

In [None]:
sm.graphics.tsa.plot_acf(train,lags=50)
plt.show()

In [None]:
decomp = seasonal_decompose(Hog["Location Energy Sum"], period=12)
decomp.plot()

In [None]:
result = adfuller(Hog["Location Energy Sum"])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

The p-value is not significant when checking for stationarity

* This indicates that the statistical properties of the data, such as the mean, variance and standard deviation, are not constant over time

In [None]:
# Stepwise seasonal search with differencing fixed at d=1, D=1 and a seasonal
# period of m=12; non-seasonal p and q start at 1 and go up to 3.
smodel = pm.auto_arima(
    train,
    start_p=1,
    start_q=1,
    test="adf",
    max_p=3,
    max_q=3,
    m=12,
    d=1,
    D=1,
    start_P=0,
    seasonal=True,
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
    stepwise=True,
)

smodel.summary()

From the auto arima function, the best model is SARIMAX(1, 1, 0) x (0, 1, [1], 12)

In [None]:
mod = sm.tsa.statespace.SARIMAX(train, order=(1,1,0), seasonal_order=(0,1,[1],12))
mod_fit = mod.fit()
mod_fit.summary()

In [None]:
pred = mod_fit.predict()
test["pred"] = pred.values
test.head()

In [None]:
test["Location Energy Sum"].plot(figsize=(25,10),color = 'red')
test["pred"].plot()
plt.show()

In [None]:
test['residual'] = abs(test['Location Energy Sum']-test['pred'])
MAE = test['residual'].sum()/len(test)
MAPE = (abs(test['residual'])/test['Location Energy Sum']).sum()*100/len(test)
print("MAE:", MAE)
print("MAPE:", MAPE)

Hog Modelling and Predictions have major faults. Further correcting and modelling is needed.

# Eagle

In [None]:
eagle_model_data = Eagle[["Location Energy Sum"]]
train = eagle_model_data.iloc[0:(len(eagle_model_data)-53)].copy()
test = eagle_model_data.iloc[len(train):(len(eagle_model_data) -1)].copy()

In [None]:
sm.graphics.tsa.plot_pacf(train,lags=30)
plt.show()

In [None]:
sm.graphics.tsa.plot_acf(train,lags=50)
plt.show()

In [None]:
decomp = seasonal_decompose(Eagle["Location Energy Sum"], period=12)
decomp.plot()

In [None]:
result = adfuller(Eagle["Location Energy Sum"])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

The p-value is not significant when checking for stationarity

* This indicates that the statistical properties of the data, such as the mean, variance and standard deviation, are not constant over time

In [None]:
# Stepwise seasonal search with differencing fixed at d=1, D=1 and a seasonal
# period of m=12; non-seasonal p and q start at 1 and go up to 3.
smodel = pm.auto_arima(
    train,
    start_p=1,
    start_q=1,
    test="adf",
    max_p=3,
    max_q=3,
    m=12,
    d=1,
    D=1,
    start_P=0,
    seasonal=True,
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
    stepwise=True,
)

smodel.summary()

From the auto arima function, the best model is SARIMAX(0, 1, 0) x (0, 1, [1], 12)

In [None]:
mod = sm.tsa.statespace.SARIMAX(train, order=(0,1,0), seasonal_order=(0,1,[1],12))
mod_fit = mod.fit()
mod_fit.summary()

In [None]:
pred = mod_fit.predict()
test["pred"] = pred.values
test.head()

In [None]:
test["Location Energy Sum"].plot(figsize=(25,10),color = 'red')
test["pred"].plot()
plt.show()

In [None]:
test['residual'] = abs(test['Location Energy Sum']-test['pred'])
MAE = test['residual'].sum()/len(test)
MAPE = (abs(test['residual'])/test['Location Energy Sum']).sum()*100/len(test)
print("MAE:", MAE)
print("MAPE:", MAPE)

The metrics are not ideal.

# Cockatoo

In [None]:
cockatoo_model_data = Cockatoo[["Location Energy Sum"]]
train = cockatoo_model_data.iloc[0:(len(cockatoo_model_data)-53)].copy()
test = cockatoo_model_data.iloc[len(train):(len(cockatoo_model_data) -1)].copy()

In [None]:
sm.graphics.tsa.plot_pacf(train,lags=20)
plt.show()

In [None]:
sm.graphics.tsa.plot_acf(train,lags=50)
plt.show()

In [None]:
decomp = seasonal_decompose(Cockatoo["Location Energy Sum"], period=12)
decomp.plot()

In [None]:
result = adfuller(Eagle["Location Energy Sum"])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

The p-value is not significant when checking for stationarity

* This indicates that the statistical properties of the data, such as the mean, variance and standard deviation, are not constant over time

In [None]:
# Stepwise seasonal search with differencing fixed at d=1, D=1 and a seasonal
# period of m=12; non-seasonal p and q start at 1 and go up to 3.
smodel = pm.auto_arima(
    train,
    start_p=1,
    start_q=1,
    test="adf",
    max_p=3,
    max_q=3,
    m=12,
    d=1,
    D=1,
    start_P=0,
    seasonal=True,
    trace=True,
    error_action="ignore",
    suppress_warnings=True,
    stepwise=True,
)

smodel.summary()

From the auto arima function, the best model is SARIMAX(0, 1, 2) x (0, 1, [1], 12)

In [None]:
mod = sm.tsa.statespace.SARIMAX(train, order=(0,1,2), seasonal_order=(0,1,[1],12))
mod_fit = mod.fit()
mod_fit.summary()

In [None]:
pred = mod_fit.predict()
test["pred"] = pred.values
test.head()

In [None]:
test["Location Energy Sum"].plot(figsize=(25,10),color = 'red')
test["pred"].plot()
plt.show()

In [None]:
test['residual'] = abs(test['Location Energy Sum']-test['pred'])
MAE = test['residual'].sum()/len(test)
MAPE = (abs(test['residual'])/test['Location Energy Sum']).sum()*100/len(test)
print("MAE:", MAE)
print("MAPE:", MAPE)

Metrics are not ideal.