In [1]:
# Data manipulation
import pandas as pd
import numpy as np
import os 

# Data visualization
import seaborn as sns
from matplotlib import pyplot as plt
from plotly import graph_objects as go 
from plotly import express as px
from ipywidgets import widgets

# Stats
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


# Working directory
# Step up to the parent of the current directory so that the `src` package
# imported right below can be found.  The move is skipped when `src` is already
# visible from the current directory, which keeps this cell idempotent:
# re-running it no longer climbs one directory higher on every execution.
if not os.path.isdir('src'):
    os.chdir(os.path.realpath(os.path.join(os.getcwd(), os.pardir)))

from src.data.manage_data import DataLoader, DataSaver, _project_directory                         

# Introduction

This notebook is my introduction to the Hourly Energy Consumption dataset downloaded from Kaggle (https://www.kaggle.com/robikscube/hourly-energy-consumption). It contains a few time series visualizations and basic statistical tests for time series analysis. I also add autocorrelation and partial autocorrelation plots to help find appropriate lag values for autoregressive forecasting models.

In [2]:
# Inject a CSS rule that sets elements with the `container` class to 90% width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load dataset

I use my own class for data loading. The script that performed the data preprocessing can be found in src/data/data_preprocess.py.

In [3]:
# Load the dataset through the project's own DataLoader helper.
# NOTE(review): `processed=True` presumably selects the already preprocessed
# data and `name` the dataset to read — confirm in src/data/manage_data.py.
data_model = DataLoader(processed=True, name='ALL_hourly')
# `df` is the frame that the following cells inspect, extend and plot.
df = data_model.load_data()

# Gentle introduction to Exploratory Data Analysis

Below you can find a basic introduction to my dataset. It's nice to know more about the data we analyse, right?

In [4]:
# Structural overview of the loaded frame: shape, column names and column
# dtypes, each followed by a dashed separator line.
_overview = (
    ('Data dimension:\n', df.shape),
    ('Columns data names:\n', df.columns),
    ('Columns data types:\n', df.dtypes),
)
for _label, _value in _overview:
    print(_label, _value)
    print('-' * 20)

Data dimension:
 (1090167, 3)
--------------------
Columns data names:
 Index(['Datetime', 'Value', 'Name'], dtype='object')
--------------------
Columns data types:
 Datetime     object
Value       float64
Name         object
dtype: object
--------------------


In [5]:
df.head()

Unnamed: 0_level_0,Datetime,Value,Name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2011-12-31 01:00:00,6222.0,FE_MW
1,2011-12-31 02:00:00,5973.0,FE_MW
2,2011-12-31 03:00:00,5778.0,FE_MW
3,2011-12-31 04:00:00,5707.0,FE_MW
4,2011-12-31 05:00:00,5691.0,FE_MW


In [6]:
df.dtypes

Datetime     object
Value       float64
Name         object
dtype: object

**Quick conclusions**:
    
    1. Datetime column has object type -> we have to convert it to 'datetime' type
    2. Datetime column could be split into smaller parts -> we can split it into year, month, day, week, weekday, hours, daypart etc. 

# Preprocessing

Again, some basic manipulation. In the visualizations I would like to explore patterns at these levels of aggregation, which is why I add the following variables.

In [7]:
# Parse the timestamps and derive the calendar features that the later cells
# group by.
df['Datetime'] = pd.to_datetime(df['Datetime'])
# Midnight of the same day, kept as a datetime64 column (vectorised; avoids a
# round trip through python `date` objects).
df['Date'] = df['Datetime'].dt.normalize()
df['Year'] = df['Datetime'].dt.year
df['Quarter'] = df['Datetime'].dt.quarter
df['Month'] = df['Datetime'].dt.month
df['Day'] = df['Datetime'].dt.day
# ISO-8601 week number.
df['Week'] = df['Datetime'].dt.isocalendar().week
# Monday == 0 ... Sunday == 6.
df['Weekday'] = df['Datetime'].dt.weekday
df['Hour'] = df['Datetime'].dt.hour
# Monday of the week each observation belongs to.  Computed by stepping back
# `Weekday` days from `Date` rather than by parsing "ISO week + calendar year":
# around New Year the ISO week can belong to a different year than the
# calendar year (e.g. 2012-01-01 lies in ISO week 52 of 2011), which would
# place those rows almost a year away from their real week.
df['WeekDate'] = df['Date'] - pd.to_timedelta(df['Weekday'], unit='D')

In [8]:
df.dtypes

Datetime    datetime64[ns]
Value              float64
Name                object
Date        datetime64[ns]
Year                 int64
Quarter              int64
Month                int64
Day                  int64
Week                UInt32
Weekday              int64
Hour                 int64
WeekDate    datetime64[ns]
dtype: object

In [9]:
df.head()

Unnamed: 0_level_0,Datetime,Value,Name,Date,Year,Quarter,Month,Day,Week,Weekday,Hour,WeekDate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2011-12-31 01:00:00,6222.0,FE_MW,2011-12-31,2011,4,12,31,52,5,1,2011-12-26
1,2011-12-31 02:00:00,5973.0,FE_MW,2011-12-31,2011,4,12,31,52,5,2,2011-12-26
2,2011-12-31 03:00:00,5778.0,FE_MW,2011-12-31,2011,4,12,31,52,5,3,2011-12-26
3,2011-12-31 04:00:00,5707.0,FE_MW,2011-12-31,2011,4,12,31,52,5,4,2011-12-26
4,2011-12-31 05:00:00,5691.0,FE_MW,2011-12-31,2011,4,12,31,52,5,5,2011-12-26


# Test statistics

In this chapter you can find some basic time series statistics like: 
    
    1. Dickey-Fuller and KPSS test for stationarity 
    2. Autocorrelation and Partial Autocorrelation Plot
    3. ...

## Dickey-Fuller and KPSS test

In [33]:
def _stationarity_summary(head, labels, critical_values, is_stationary):
    """Collect the headline figures of a stationarity test in one Series.

    Parameters
    ----------
    head : sequence
        Leading scalar results of the test (statistic, p-value, ...).
    labels : list of str
        Index labels for ``head``, one per element.
    critical_values : dict
        Critical values of the same test, keyed by significance level.
    is_stationary : bool
        Verdict to report in the 'Stationarity test result' entry.

    Returns
    -------
    pandas.Series
        ``head`` followed by one 'Critical Value (<level>)' entry per level
        and the textual verdict.
    """
    summary = pd.Series(head, index=labels)
    for level, threshold in critical_values.items():
        summary[f'Critical Value ({level})'] = threshold
    if is_stationary:
        summary['Stationarity test result'] = 'Time series is STATIONARITY.'
    else:
        summary['Stationarity test result'] = 'Time series is NOT STATIONARITY.'
    return summary


@widgets.interact(
    name=widgets.Dropdown(options=np.sort(df.Name.unique())),
    time=widgets.Dropdown(options=['Date', 'WeekDate', 'Year', 'Quarter', 'Month', 'Day', 'Week', 'Weekday', 'Hour']),
)
def ts_plot(name, time):
    """Run the ADF and KPSS stationarity tests on one aggregated series.

    The rows of ``df`` whose ``Name`` equals ``name`` are averaged per value of
    the ``time`` column; both tests are run on the resulting mean values and
    their results are printed.

    Parameters
    ----------
    name : str
        Value of the ``Name`` column to analyse.
    time : str
        Column of ``df`` to aggregate by.

    Notes
    -----
    The two tests have opposite null hypotheses: ADF assumes a unit root
    (rejecting H0 supports stationarity), whereas KPSS assumes stationarity
    (rejecting H0 speaks against it).

    NOTE(review): a later cell defines another function called ``ts_plot``;
    the name is kept here so the notebook's existing names do not change.
    """
    # The string 'mean' is used instead of np.mean: recent pandas versions
    # warn when a NumPy callable is passed to groupby aggregation.
    df_ts = df[df['Name'] == name].groupby([time]).agg(dict(Value='mean')).reset_index()
    series = df_ts.Value.dropna()

    dftest = adfuller(series, regression='c', autolag='AIC')
    # KPSS reuses the lag count selected by the ADF test.
    kpsstest = kpss(series, regression='c', nlags=dftest[2])

    # ADF: a small p-value rejects the unit root, i.e. supports stationarity.
    adf_results = _stationarity_summary(
        dftest[0:4],
        ['Test Statistic', 'p-value', 'Lags Used', 'Number of observations'],
        dftest[4],
        is_stationary=dftest[1] <= 0.05,
    )

    # KPSS: critical values come from the KPSS result itself (4th element),
    # and a small p-value rejects stationarity.
    kpss_results = _stationarity_summary(
        kpsstest[0:3],
        ['Test Statistic', 'p-value', 'Lags Used'],
        kpsstest[3],
        is_stationary=kpsstest[1] > 0.05,
    )

    print('Results of Dickey-Fuller Test:')
    print('H0: The series has a unit root')
    print('HA: The series has no unit root')
    print('\n')
    print(adf_results)
    print('\n')
    print('Results of KPSS Test:')
    print('H0: The process is trend stationary')
    print('HA: The process has a unit root (series is not stationary)')
    print('\n')
    print(kpss_results)

interactive(children=(Dropdown(description='name', options=('AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM…

## Autocorrelation and Partial Autocorrelation 

In [38]:
@widgets.interact(
    name=widgets.Dropdown(options=np.sort(df.Name.unique())),
    time=widgets.Dropdown(options=['Date', 'WeekDate']),
)
def correlation_plot(name, time):
    """Plot the autocorrelation and partial autocorrelation of one series.

    The rows of ``df`` whose ``Name`` equals ``name`` are averaged per value of
    the ``time`` column, and the ACF and PACF of the mean values are drawn one
    above the other in a single figure.

    Parameters
    ----------
    name : str
        Value of the ``Name`` column to analyse.
    time : str
        Column of ``df`` to aggregate by ('Date' or 'WeekDate').
    """
    # The string 'mean' is used instead of np.mean: recent pandas versions
    # warn when a NumPy callable is passed to groupby aggregation.
    df_ts = df[df['Name'] == name].groupby([time]).agg(dict(Value='mean')).reset_index()
    series = df_ts.Value

    # Up to 50 lags, but statsmodels refuses partial autocorrelations at lags
    # of half the sample size or more, so short series get fewer lags.
    lags = min(50, len(series) // 2 - 1)

    # Draw on explicit axes: without `ax`, plot_acf/plot_pacf open figures of
    # their own and a preceding plt.figure(...) only leaves an empty canvas.
    fig, (ax_acf, ax_pacf) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
    plot_acf(series, lags=lags, alpha=0.05, ax=ax_acf)
    plot_pacf(series, lags=lags, alpha=0.05, ax=ax_pacf)
    fig.tight_layout()
    plt.show()

interactive(children=(Dropdown(description='name', options=('AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM…

# Visualization

## Time series by date

In [35]:
@widgets.interact(
    name=widgets.Dropdown(options=np.sort(df.Name.unique())),
    time=widgets.Dropdown(options=['Date', 'WeekDate', 'Year', 'Quarter', 'Month', 'Day', 'Week', 'Weekday', 'Hour']),
)
def ts_plot(name, time):
    """Draw the mean of ``Value`` per ``time`` bucket as a line chart.

    Parameters
    ----------
    name : str
        Value of the ``Name`` column to plot.
    time : str
        Column of ``df`` to aggregate by; it also becomes the x axis.

    NOTE(review): an earlier cell defines a function with the same name
    ``ts_plot``; this definition replaces it in the notebook namespace.
    """
    # The string 'mean' is used instead of np.mean: recent pandas versions
    # warn when a NumPy callable is passed to groupby aggregation.
    df_ts = df[df['Name'] == name].groupby([time]).agg(dict(Value='mean')).reset_index()

    fig = px.line(data_frame=df_ts, x=time, y='Value')
    fig.update_layout(
        template='ggplot2',
        title=dict(text=f'Energy consumption in megawatts (MW) from {name} power station'),
    )
    fig.update_traces(line=dict(color='darkblue', width=1))
    fig.show()

interactive(children=(Dropdown(description='name', options=('AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM…

## Seasonal decomposition using moving average method

In [37]:
@widgets.interact(
    name=widgets.Dropdown(options=np.sort(df.Name.unique())),
    time=widgets.Dropdown(options=['Date', 'WeekDate']),
)
def sesonal_decomposistion(name, time):
    """Decompose one aggregated series additively and plot the components.

    The rows of ``df`` whose ``Name`` equals ``name`` are averaged per value of
    the ``time`` column and passed to ``seasonal_decompose``. The first figure
    shows the observed, seasonal and trend components; the second shows the
    residuals.

    Parameters
    ----------
    name : str
        Value of the ``Name`` column to analyse.
    time : str
        Datetime column of ``df`` to aggregate by ('Date' or 'WeekDate').
    """
    # The string 'mean' is used instead of np.mean: recent pandas versions
    # warn when a NumPy callable is passed to groupby aggregation.
    df_ts = df[df['Name'] == name].groupby([time]).agg(dict(Value='mean')).reset_index()

    # Seasonal period = number of observations divided by the number of
    # distinct calendar years, truncated to an integer.  Plain integer
    # division replaces `np.int`, which no longer exists in current NumPy.
    n_years = len(df_ts[time].dt.year.unique())
    period = len(df_ts) // n_years
    data = seasonal_decompose(x=df_ts.set_index(time), model='additive', period=period)

    fig1 = go.Figure()
    components = (
        (data.observed, 'Observed'),
        (data.seasonal, 'Seasonality'),
        (data.trend, 'Trend'),
    )
    for values, label in components:
        fig1.add_trace(go.Scatter(x=df_ts[time], y=values, name=label, mode='lines'))
    fig1.update_layout(
        template='ggplot2',
        title=dict(text=f'Time series decomposition using moving averagee for {name} power station'),
    )
    fig1.show()

    # Residuals get their own figure.  The empty go.Scatter() trace that used
    # to be appended here only produced a blank legend entry and was dropped.
    fig2 = go.Figure()
    fig2.add_trace(go.Scatter(x=df_ts[time], y=data.resid, name='Residuals', mode='lines'))
    fig2.update_layout(
        template='ggplot2',
        title=dict(text=f'Energy consumption in megawatts (MW) from {name} power station'),
    )
    fig2.show()
    

interactive(children=(Dropdown(description='name', options=('AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM…