# Dataset: [Electric Production](https://www.kaggle.com/datasets/kandij/electric-production)

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../../datasets/2/Electric_Production.csv')

In [3]:
df

Unnamed: 0,DATE,IPG2211A2N
0,1/1/1985,72.5052
1,2/1/1985,70.6720
2,3/1/1985,62.4502
3,4/1/1985,57.4714
4,5/1/1985,55.3151
...,...,...
392,9/1/2017,98.6154
393,10/1/2017,93.6137
394,11/1/2017,97.3359
395,12/1/2017,114.7212


In [4]:
df.isna().sum()

DATE          0
IPG2211A2N    0
dtype: int64

In [5]:
import os
import sys

# Directory that contains the local `tsa` package. Set the TSA_PATH
# environment variable to point at your own checkout; the fallback is the
# path this notebook was originally run with.
TSA_PATH = os.environ.get('TSA_PATH', '/home/shail/bits/PS1/time_series_analysis')

# Guard the insert so that re-running this cell does not stack duplicate
# entries at the front of sys.path.
if TSA_PATH not in sys.path:
    sys.path.insert(0, TSA_PATH)

# Keep a handle on the stdout stream that is active when this cell runs.
original_stdout = sys.stdout
	
def outfile(x, path='output.txt'):
    """Write the printed form of ``x`` to a text file, replacing its contents.

    Parameters
    ----------
    x : object
        Value to dump; it is rendered exactly as ``print(x)`` would render it.
    path : str, optional
        Destination file. Defaults to ``'output.txt'`` in the current working
        directory. The file is opened in ``'w'`` mode, so every call
        overwrites whatever the previous call wrote.

    Notes
    -----
    The text is sent straight to the file with ``print(..., file=f)`` instead
    of temporarily reassigning ``sys.stdout``. An exception raised while
    rendering ``x`` therefore cannot leave ``sys.stdout`` pointing at a closed
    file, and whichever stream is currently installed as stdout is untouched.
    """
    with open(path, 'w') as f:
        print(x, file=f)

In [6]:
from tsa.dateconverter import DateConverter

# Date Conversion

In [7]:
# Run every value of the DATE column through the project's DateConverter,
# with infer_format=True, and store the result back in the same column.
converter = DateConverter()


def _parse_date(raw_date):
    """Convert one DATE cell via ``converter.convert_date`` with ``infer_format=True``."""
    return converter.convert_date(raw_date, infer_format=True)


df['DATE'] = df['DATE'].apply(_parse_date)
df

Unnamed: 0,DATE,IPG2211A2N
0,1985-01-01,72.5052
1,1985-02-01,70.6720
2,1985-03-01,62.4502
3,1985-04-01,57.4714
4,1985-05-01,55.3151
...,...,...
392,2017-09-01,98.6154
393,2017-10-01,93.6137
394,2017-11-01,97.3359
395,2017-12-01,114.7212


# Visualizations

In [8]:
# Import only the plotting helpers this notebook actually calls, so it is
# obvious where each name comes from and nothing else leaks into the namespace.
from tsa.visualization import lag_plot, waterfall_plot

In [9]:
small_df = pd.DataFrame(df[:15])
small_df

Unnamed: 0,DATE,IPG2211A2N
0,1985-01-01,72.5052
1,1985-02-01,70.672
2,1985-03-01,62.4502
3,1985-04-01,57.4714
4,1985-05-01,55.3151
5,1985-06-01,58.0904
6,1985-07-01,62.6202
7,1985-08-01,63.2485
8,1985-09-01,60.5846
9,1985-10-01,56.3154


## Waterfall Plot

In [10]:
plot_params = waterfall_plot(small_df, 'DATE', 'IPG2211A2N', 'Waterfall plot of Electric Production', remove_time=True)

In [11]:
outfile(plot_params['y'])

## Lag Plot

In [12]:
data = lag_plot(df['IPG2211A2N'], 30, 'Lag plot of electric production', 2)



In [13]:
outfile(data['data'])

Stacked area chart and stream graph: not applicable, because this dataset has only one value column (`IPG2211A2N`).

# Properties

In [14]:
# Explicit imports instead of a wildcard: these are the classes used in the
# sections that follow.
# NOTE(review): assumes all four are exported by tsa.properties (the wildcard
# import made their origin implicit) — confirm against the package.
from tsa.properties import Autocorrelation, PartialAutocorrelation, Seasonality, Trend

In [15]:
trend = Trend(df, 'IPG2211A2N', 'DATE')

In [16]:
params = trend.plot(remove_time = True)

In [17]:
params.keys()

dict_keys(['title', 'data', 'x'])

In [18]:
outfile(params['data'])

In [19]:
trend.detrend()

Unnamed: 0_level_0,Original,Detrend
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
1985-01-01,72.5052,72.505200
1985-02-01,70.6720,70.307836
1985-03-01,62.4502,61.721873
1985-04-01,57.4714,56.378909
1985-05-01,55.3151,53.858445
...,...,...
2017-09-01,98.6154,-44.136782
2017-10-01,93.6137,-49.502646
2017-11-01,97.3359,-46.144610
2017-12-01,114.7212,-29.123474


In [20]:
p = trend.plot_detrend(remove_time = True)

In [21]:
p.keys()

dict_keys(['title', 'data', 'data_detrend', 'x'])

In [22]:
outfile(p['x'])

## Seasonality

In [23]:
seasonality = Seasonality(df, 'IPG2211A2N', 'DATE')

In [24]:
seasonality.seasonal()

DATE
1985-01-01    1.149172
1985-02-01    1.065875
1985-03-01    0.995985
1985-04-01    0.896027
1985-05-01    0.896724
                ...   
2017-09-01    0.971682
2017-10-01    0.900691
2017-11-01    0.933050
2017-12-01    1.071668
2018-01-01    1.149172
Name: seasonal, Length: 397, dtype: float64

In [25]:
seasonality.deseasonalize()

Unnamed: 0_level_0,Original,Deseasonalized
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
1985-01-01,72.5052,63.093433
1985-02-01,70.6720,66.304199
1985-03-01,62.4502,62.701923
1985-04-01,57.4714,64.140251
1985-05-01,55.3151,61.685735
...,...,...
2017-09-01,98.6154,101.489402
2017-10-01,93.6137,103.935449
2017-11-01,97.3359,104.320168
2017-12-01,114.7212,107.049235


## Stationarity: now integrated with ARIMA

## Autocorrelation

In [26]:
a = Autocorrelation()

In [27]:
a.autocorrelation(df['IPG2211A2N'])

{'title': 'Autocorrelation plot of IPG2211A2N, order = 0',
 'y': [1.0,
  0.8627790617866238,
  0.6364037674539034,
  0.5248745918904854,
  0.6139895258542081,
  0.8028675017499224,
  0.8898654508672444,
  0.7841452348548611,
  0.583544406585739,
  0.48843200162838746,
  0.5944013034553447,
  0.7975360953675413,
  0.9042710203562839,
  0.7819278030400045,
  0.5665174745489479,
  0.46030658636375266,
  0.5535487234071177,
  0.7391845253248007,
  0.8205566527537088,
  0.7169618023155605,
  0.5206332931988101,
  0.42631337825242765,
  0.5275682880891199,
  0.7243959046852616,
  0.8212500345080637,
  0.7031834207592711],
 'x': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25],
 'upper': [0.0,
  0.09836777262866503,
  0.15518355334411682,
  0.17866130290701987,
  0.1930060328448331,
  0.2110612974580558,
  0.23879149770027885,
  0.26897176527752686,
  0.29025042596707085,
  0.30138891917702976,
  0

In [28]:
a.durbin_watson_test(df['IPG2211A2N'], summary = True)

As value of statistic is close to 0, series is positively autocorrelated.


0.831475657889986

## Partial Autocorrelation

In [29]:
p = PartialAutocorrelation()
p.partial_autocorrelation(df['IPG2211A2N'])

{'title': 'Partial Autocorrelation plot of IPG2211A2N, order = 0',
 'y': [1.0,
  0.8649577967911356,
  -0.4309183356984148,
  0.5360891669714629,
  0.5461651026320729,
  0.43123287446348485,
  0.06272201366825482,
  -0.1198272355479154,
  -0.16537794748480292,
  0.10873881447667245,
  0.29664451593461993,
  0.3379097005958561,
  0.3070681461257954,
  -0.4329259856503079,
  -0.17959112178034936,
  -0.16888008340150673,
  0.07237955521802193,
  0.0643766362600018,
  0.0595927259984881,
  0.08761444209250385,
  -0.0930204881459329,
  -0.0977528506132622,
  -0.06428325628203546,
  0.17157157466749864,
  0.10680386368803114,
  -0.09072918147299959],
 'x': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25],
 'upper': [0.0,
  0.09836777262866503,
  0.09836777262866508,
  0.09836777262866503,
  0.09836777262866503,
  0.09836777262866508,
  0.09836777262866507,
  0.09836777262866507,
  0.09836777262866

## Differencing: now integrated with ARIMA

# Models

## ARIMA

In [30]:
from tsa.models import Arima

In [31]:
# Positional 80/20 split with no shuffling: the leading 80% of rows go to
# train_df and the remaining rows to test_df.
split_at = int(0.8 * len(df))
train_df, test_df = df.iloc[:split_at], df.iloc[split_at:]

# NOTE: `df` is rebound to the training rows from here on, so re-running this
# cell would split the already-shortened frame again.
df = train_df

In [32]:
model = Arima()
model(df, 'DATE', 'IPG2211A2N')

In [33]:
model.test_stationarity()

1. ADF :  -1.7386950117875237
2. P-Value :  0.4112939344557697
3. Num Of Lags :  15
4. Num Of Observations Used For ADF Regression and Critical Values Calculation : 301
5. Critical Values :
	 1% :  -3.452263435801039
	 5% :  -2.871190526189069
	 10% :  -2.571911967527952


As p-value is outside the confidence interval of 95%, series is non-stationary.


In [34]:
model.test_stationarity(1)

1. ADF :  -6.352997676450292
2. P-Value :  2.5832755038271256e-08
3. Num Of Lags :  14
4. Num Of Observations Used For ADF Regression and Critical Values Calculation : 301
5. Critical Values :
	 1% :  -3.452263435801039
	 5% :  -2.871190526189069
	 10% :  -2.571911967527952


As p-value is inside the confidence interval of 95%, series is stationary.


In [35]:
params = model.acf_plot(1)

In [36]:
params.keys()

dict_keys(['title', 'y', 'x', 'upper', 'lower'])

In [37]:
outfile(params['lower'])

In [38]:
params = model.pacf_plot(1)

In [39]:
outfile(params['title'])


In [40]:
model.fit((4,1,2))

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                               SARIMAX Results                                
Dep. Variable:             IPG2211A2N   No. Observations:                  317
Model:                 ARIMA(4, 1, 2)   Log Likelihood                -814.548
Date:                Wed, 05 Jul 2023   AIC                           1643.096
Time:                        14:52:13   BIC                           1669.386
Sample:                    01-01-1985   HQIC                          1653.599
                         - 05-01-2011                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.7587      0.058     13.000      0.000       0.644       0.873
ar.L2         -1.0415      0.063    -16.424      0.000      -1.166      -0.917
ar.L3          0.0416      0.063      0.655      0.5



In [41]:
model.error_metrics(test_df, 'IPG2211A2N')

Unnamed: 0,Metric,Value
0,MAPE,0.032615
1,MAE,3.416214
2,MSE,20.550778


In [42]:
params = model.plot_forecast(test_df, 'IPG2211A2N', exclude_time=True)

In [43]:
outfile(params['y_forecast'])