In [68]:
# Data manipulation
import pandas as pd
import numpy as np
import os 

# Data visualization
import seaborn as sns
from matplotlib import pyplot as plt
from plotly import graph_objects as go 
from plotly import express as px
from statsmodels.tsa.seasonal import seasonal_decompose
from ipywidgets import widgets



# Working directory: move one level up so that the `src` package imported
# below can be found.
# The move is guarded: an unconditional chdir climbs one directory higher on
# every re-execution of this cell, which breaks the import after the first run.
if not os.path.isdir('src'):
    os.chdir(os.path.realpath(os.path.join(os.getcwd(), os.pardir)))

from src.data.manage_data import DataLoader, DataSaver, _project_directory                         

In [70]:
# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that sets `.container` elements to 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Load dataset

In [2]:
# Build the project's loader for the processed 'ALL_hourly' dataset and read
# it into `df`, the frame used by every cell below.
data_model = DataLoader(name='ALL_hourly', processed=True)
df = data_model.load_data()

# Gentle introduction to Exploratory Data Analysis

In [4]:
# Structural overview of the loaded frame: shape, column names and dtypes,
# each followed by a 20-character dashed rule.
rule = '-' * 20
for heading, summary in (
    ('Data dimension:\n', df.shape),
    ('Columns data names:\n', df.columns),
    ('Columns data types:\n', df.dtypes),
):
    print(heading, summary)
    print(rule)

Data dimension:
 (1090167, 3)
--------------------
Columns data names:
 Index(['Datetime', 'Value', 'Name'], dtype='object')
--------------------
Columns data types:
 Datetime     object
Value       float64
Name         object
dtype: object
--------------------


In [5]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0_level_0,Datetime,Value,Name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2011-12-31 01:00:00,6222.0,FE_MW
1,2011-12-31 02:00:00,5973.0,FE_MW
2,2011-12-31 03:00:00,5778.0,FE_MW
3,2011-12-31 04:00:00,5707.0,FE_MW
4,2011-12-31 05:00:00,5691.0,FE_MW


In [8]:
# Column dtypes of the frame as loaded (rich display of the dtype Series).
df.dtypes

Datetime     object
Value       float64
Name         object
dtype: object

**Quick conclusions**:
    
    1. The Datetime column has object type -> we have to convert it to the 'datetime' type
    2. The Datetime column can be split into smaller parts -> year, month, day, week, weekday, hour, part of day, etc.

# Preprocessing

In [22]:
# Parse the timestamp strings once, then derive the calendar columns from the
# parsed values.
df['Datetime'] = pd.to_datetime(df['Datetime'])
timestamps = df['Datetime'].dt

# `normalize()` truncates each timestamp to midnight and stays vectorised.
# Going through `.dt.date` + `pd.to_datetime` builds a Python `date` object
# per row and parses it back, which is needlessly slow on ~1M rows.
df['Date'] = timestamps.normalize()
df['Year'] = timestamps.year
df['Quarter'] = timestamps.quarter
df['Month'] = timestamps.month
df['Day'] = timestamps.day
# ISO week number; `isocalendar()` returns nullable UInt32 columns.
df['Week'] = timestamps.isocalendar().week
# Monday == 0 ... Sunday == 6.
df['Weekday'] = timestamps.weekday
df['Hour'] = timestamps.hour

In [10]:
# Column dtypes after the derived calendar columns have been added.
df.dtypes

Datetime    datetime64[ns]
Value              float64
Name                object
Date        datetime64[ns]
Year                 int64
Quarter              int64
Month                int64
Day                  int64
Week                UInt32
Weekday              int64
Hour                 int64
dtype: object

In [11]:
# Preview the first five rows, now including the derived calendar columns.
df.head(n=5)

Unnamed: 0_level_0,Datetime,Value,Name,Date,Year,Quarter,Month,Day,Week,Weekday,Hour
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2011-12-31 01:00:00,6222.0,FE_MW,2011-12-31,2011,4,12,31,52,5,1
1,2011-12-31 02:00:00,5973.0,FE_MW,2011-12-31,2011,4,12,31,52,5,2
2,2011-12-31 03:00:00,5778.0,FE_MW,2011-12-31,2011,4,12,31,52,5,3
3,2011-12-31 04:00:00,5707.0,FE_MW,2011-12-31,2011,4,12,31,52,5,4
4,2011-12-31 05:00:00,5691.0,FE_MW,2011-12-31,2011,4,12,31,52,5,5


# Visualization

#### Time series by date

In [104]:
@widgets.interact(
    name=widgets.Dropdown(options=np.sort(df.Name.unique())),
    time=widgets.Dropdown(options=['Date', 'Year', 'Quarter', 'Month', 'Day', 'Week', 'Weekday', 'Hour']),
)
def ts_plot(name, time):
    """Plot the mean of ``Value`` for one series, grouped by a calendar column.

    Parameters
    ----------
    name : str
        Entry of the ``Name`` column selecting the series to plot.
    time : str
        Column of ``df`` to group by (one of the dropdown options).

    The figure is shown as a side effect; nothing is returned.
    """
    # The built-in 'mean' reduction yields the same numbers as np.mean but
    # avoids the pandas FutureWarning raised when a NumPy callable is passed
    # to a groupby aggregation.
    df_ts = (
        df.loc[df['Name'] == name]
        .groupby(time)['Value']
        .mean()
        .reset_index()
    )

    fig = px.line(data_frame=df_ts, x=time, y='Value')
    fig.update_layout(
        template='ggplot2',
        title=dict(text=f'Energy consumption in megawatts (MW) from {name} power station'),
    )
    fig.update_traces(line=dict(color='darkblue', width=1))
    fig.show()

interactive(children=(Dropdown(description='name', options=('AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM…

In [105]:
@widgets.interact(name=widgets.Dropdown(options=np.sort(df.Name.unique())))
def sesonal_decomposistion(name):
    """Decompose the daily mean of one series and plot its components.

    The hourly values of the selected series are averaged per day and passed
    to ``seasonal_decompose`` (additive model). Two figures are shown:
    observed/seasonal/trend together, and the residuals on their own.

    Parameters
    ----------
    name : str
        Entry of the ``Name`` column selecting the series to decompose.

    The figures are shown as a side effect; nothing is returned.
    """
    # Daily mean as a Series indexed by date, so the dates travel with the data.
    daily_mean = df.loc[df['Name'] == name].groupby('Date')['Value'].mean()

    # Seasonal period = average number of daily observations per calendar
    # year present in the series. The built-in int() replaces np.int, which
    # was removed from NumPy.
    n_years = daily_mean.index.year.nunique()
    period = int(len(daily_mean) / n_years)

    decomposition = seasonal_decompose(x=daily_mean, model='additive', period=period)

    # Plot against the dates; the positional index would label the x axis
    # with row numbers instead.
    dates = daily_mean.index

    fig1 = go.Figure()
    for component, label in (
        (decomposition.observed, 'Observed'),
        (decomposition.seasonal, 'Seasonality'),
        (decomposition.trend, 'Trend'),
    ):
        # go.Scatter in line mode replaces the deprecated go.Line.
        fig1.add_trace(go.Scatter(x=dates, y=component, mode='lines', name=label))
    fig1.update_layout(
        template='ggplot2',
        title=dict(text=f'Time series decomposition using moving averagee for {name} power station'),
    )
    fig1.show()

    fig2 = go.Figure()
    fig2.add_trace(go.Scatter(x=dates, y=decomposition.resid, mode='lines', name='Residuals'))
    fig2.update_layout(
        template='ggplot2',
        # This figure shows the residual component, not the raw consumption.
        title=dict(text=f'Residuals of the time series decomposition for {name} power station'),
    )
    fig2.show()

interactive(children=(Dropdown(description='name', options=('AEP_MW', 'COMED_MW', 'DAYTON_MW', 'DEOK_MW', 'DOM…

#### Time series by year

In [None]:
# Mean consumption per year for the PJME_MW series.
# `df` is in long format (a single `Value` column, series identified by
# `Name`), so the series is selected by name before averaging.
# NOTE(review): assumes 'PJME_MW' occurs in df['Name'] - confirm.
plt.figure(figsize=(12, 8))
sns.lineplot(data=df[df['Name'] == 'PJME_MW'].groupby('Year').agg(PJME_MW=('Value', 'mean')))

#### Time series by quarter

In [None]:
# Mean consumption per quarter for the PJME_MW series.
# `df` is in long format (a single `Value` column, series identified by
# `Name`), so the series is selected by name before averaging.
# NOTE(review): assumes 'PJME_MW' occurs in df['Name'] - confirm.
plt.figure(figsize=(12, 8))
sns.lineplot(data=df[df['Name'] == 'PJME_MW'].groupby('Quarter').agg(PJME_MW=('Value', 'mean')))

#### Time series by month

In [None]:
# Mean consumption per month for the PJME_MW series.
# `df` is in long format (a single `Value` column, series identified by
# `Name`), so the series is selected by name before averaging.
# NOTE(review): assumes 'PJME_MW' occurs in df['Name'] - confirm.
plt.figure(figsize=(12, 8))
sns.lineplot(data=df[df['Name'] == 'PJME_MW'].groupby('Month').agg(PJME_MW=('Value', 'mean')))

#### Time series by week

In [None]:
# Mean consumption per ISO week for the PJME_MW series.
# `df` is in long format (a single `Value` column, series identified by
# `Name`), so the series is selected by name before averaging.
# NOTE(review): assumes 'PJME_MW' occurs in df['Name'] - confirm.
pjme_hourly = df[df['Name'] == 'PJME_MW']
# `Week` carries the nullable UInt32 dtype; grouping on a plain int64 copy
# keeps the resulting x axis an ordinary numeric index (precautionary).
week_number = pjme_hourly['Week'].astype('int64')
plt.figure(figsize=(12, 8))
sns.lineplot(data=pjme_hourly.groupby(week_number).agg(PJME_MW=('Value', 'mean')))

#### Time series by day

In [None]:
# Mean consumption per day of month for the PJME_MW series.
# `df` is in long format (a single `Value` column, series identified by
# `Name`), so the series is selected by name before averaging.
# NOTE(review): assumes 'PJME_MW' occurs in df['Name'] - confirm.
plt.figure(figsize=(12, 8))
sns.lineplot(data=df[df['Name'] == 'PJME_MW'].groupby('Day').agg(PJME_MW=('Value', 'mean')))

#### Time series by weekday

In [None]:
# Mean consumption per weekday (0 = Monday) for the PJME_MW series.
# `df` is in long format (a single `Value` column, series identified by
# `Name`), so the series is selected by name before averaging.
# NOTE(review): assumes 'PJME_MW' occurs in df['Name'] - confirm.
plt.figure(figsize=(12, 8))
sns.lineplot(data=df[df['Name'] == 'PJME_MW'].groupby('Weekday').agg(PJME_MW=('Value', 'mean')))

#### Time series by hour

In [None]:
# Mean consumption per hour of day for the PJME_MW series.
# `df` is in long format (a single `Value` column, series identified by
# `Name`), so the series is selected by name before averaging.
# NOTE(review): assumes 'PJME_MW' occurs in df['Name'] - confirm.
plt.figure(figsize=(12, 8))
sns.lineplot(data=df[df['Name'] == 'PJME_MW'].groupby('Hour').agg(PJME_MW=('Value', 'mean')))