#ARIMA, short for 'AutoRegressive Integrated Moving Average', is a forecasting algorithm based on the idea that the information in the past values of the time series can alone be used to predict the future values.

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Colab only: opens a file picker in the browser. The returned mapping is keyed by file name;
# each entry is read further down through io.BytesIO, i.e. it is treated as the file's raw bytes.
from google.colab import files
uploaded = files.upload()

Saving daily-total-female-births-CA.csv to daily-total-female-births-CA.csv


In [None]:
import io
# Wrap the uploaded bytes in a file-like object so pandas can read them as a CSV.
# index_col=[0]   -> use the first column (the date) as the index
# parse_dates=[0] -> parse that first column as dates instead of plain strings
birth = pd.read_csv(io.BytesIO(uploaded['daily-total-female-births-CA.csv']), index_col=[0], parse_dates=[0])

In [None]:
# Check the end of the series: it finishes on a valid observation (1959-12-31),
# so no trailing row has to be removed for this data set.
birth.tail()

Unnamed: 0_level_0,births
date,Unnamed: 1_level_1
1959-12-27,37
1959-12-28,52
1959-12-29,48
1959-12-30,55
1959-12-31,50


In [None]:
# DataFrame.rename returns a new frame instead of modifying `birth` in place,
# so assign the result back; otherwise `birth` silently keeps the old 'births' column name.
birth = birth.rename(columns = {'births': 'Daily total female births in California, 1959'})
birth  # display the renamed frame

Unnamed: 0_level_0,"Daily total female births in California, 1959"
date,Unnamed: 1_level_1
1959-01-01,35
1959-01-02,32
1959-01-03,30
1959-01-04,31
1959-01-05,44
...,...
1959-12-27,37
1959-12-28,52
1959-12-29,48
1959-12-30,55


Data Transformation

#Normalisation:

Normalisation (min-max scaling) puts the data on a consistent scale: every value is mapped into the range 0 to 1. It is useful when:

1. Data is on different scales (height captured in cm or metres etc. causing different values)
2. Because some algorithms work better when data is normalised

In [None]:
# Quick look at the first rows before any scaling is applied.
birth.head()

Unnamed: 0_level_0,"Daily total female births in California, 1959"
date,Unnamed: 1_level_1
1959-01-01,35
1959-01-02,32
1959-01-03,30
1959-01-04,31
1959-01-05,44


In [None]:
# min-max normalisation: x_scaled = (x - min) / (max - min)

In [None]:
# Worked example: apply the min-max formula by hand to the first row of the frame
column_min = birth.min()
column_span = birth.max() - column_min
value = (birth.iloc[0] - column_min) / column_span
value

Daily total female births in California, 1959    0.24
dtype: float64

# Apply algorithm

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# .values returns the DataFrame's contents as a NumPy array. The frame has a single column,
# so this is a 2-D array with one row per day and one column, which is what the scalers below are fitted on.
birth_arr = birth.values
birth_arr

In [None]:
# feature_range=(0, 1) is the target interval each feature is scaled into.
scaler = MinMaxScaler(feature_range=(0,1))
# fit() only learns the minimum and maximum of the data; nothing is transformed yet.
# NOTE(review): the scaler is fitted on the whole series, including the rows that are later
# held out as the test set ([330:365]), so test-set min/max leak into the scaling.
# Consider fitting on the training rows only.
scaler.fit(birth_arr)

In [None]:
# data_range_ is (max - min) of the fitted data, i.e. the denominator of the min-max formula;
# it is not the minimum itself (the minimum is stored in scaler.data_min_).
scaler.data_range_

array([50.])

In [None]:
# Apply the fitted scaling, (x - min) / (max - min), to every observation.
birth_normalise = scaler.transform(birth_arr)

In [None]:
# Display the scaled series (one row per day).
birth_normalise

array([[0.24],
       [0.18],
       [0.14],
       [0.16],
       [0.42],
       [0.12],
       [0.44],
       [0.4 ],
       [0.3 ],
       [0.08]])

# Now that the normalisation is done, it is ready to be put into the algorithm.

In [None]:
# Chronological hold-out: the first 330 days are used for fitting, the following 35 days for evaluation
birth_train, birth_test = birth_normalise[:330], birth_normalise[330:365]

In [None]:
# NOTE(review): statsmodels.tsa.arima_model is the legacy ARIMA implementation. It was deprecated in
# statsmodels 0.12 and is not usable in later releases; the replacement is statsmodels.tsa.arima.model.ARIMA,
# whose forecast() returns the predictions directly instead of a tuple. Confirm the installed version
# before re-running, because the cells below rely on the legacy behaviour.
from statsmodels.tsa.arima_model import ARIMA

In [None]:
#create the model
# order = (p, d, q): p = 2 autoregressive lags, d = 1 round of differencing, q = 2 moving-average lags
birth_model = ARIMA(birth_train, order = (2,1,2))

In [None]:
# Estimate the model parameters from the training data; the result object is used for AIC and forecasting below.
birth_model_fit = birth_model.fit()

  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()


In [None]:
# MUCH lower than previously... but AIC is built from the likelihood, which changes when the
# data are rescaled, so this value cannot be compared with the AIC of a model fitted to the
# unscaled series. Only compare AIC between models fitted to exactly the same data.
birth_model_fit.aic

-344.25985425789884

In [None]:
# Number of held-out observations, i.e. how many steps ahead have to be forecast.
birth_test.size

35

In [None]:
#FORECAST
# Forecast exactly as many steps as there are test observations, so forecast and test set
# always line up one-to-one even if the split point changes.
# NOTE(review): with the legacy ARIMA results object used here, forecast() returns a tuple and
# element [0] holds the point forecasts — confirm if the statsmodels import is ever changed.
birth_forecast = birth_model_fit.forecast(steps = len(birth_test))[0]

In [None]:
# Display the forecast (still on the normalised scale).
birth_forecast

array([0.46611016, 0.4622752 , 0.46167904, 0.46140412, 0.46125185,
       0.46119457, 0.46121748, 0.46130854, 0.46145756, 0.46165585,
       0.46189604, 0.46217186, 0.46247797, 0.46280985, 0.46316363,
       0.46353604, 0.46392429, 0.46432601, 0.46473918, 0.46516208,
       0.46559327, 0.4660315 , 0.46647571, 0.46692502, 0.46737866,
       0.46783597, 0.46829642, 0.46875953, 0.4692249 , 0.4696922 ,
       0.47016113, 0.47063145, 0.47110296, 0.47157548, 0.47204885])

In [None]:
# Display the actual held-out values (normalised scale) for comparison with the forecast.
birth_test

array([[0.56],
       [0.46],
       [0.42],
       [0.58],
       [0.44],
       [0.18],
       [0.46],
       [0.36],
       [0.22],
       [0.2 ],
       [0.26],
       [0.52],
       [0.4 ],
       [0.4 ],
       [0.22],
       [0.32],
       [0.24],
       [0.58],
       [0.48],
       [0.58],
       [0.32],
       [0.34],
       [0.38],
       [0.38],
       [0.6 ],
       [0.32],
       [0.34],
       [0.3 ],
       [0.42],
       [0.22],
       [0.28],
       [0.58],
       [0.5 ],
       [0.64],
       [0.54]])

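Normalisation is not giving us good results: the forecast is almost flat at about 0.46–0.47, while the actual test values vary between 0.18 and 0.64.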

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Root mean squared error between actual and forecast values.
# Both inputs are on the normalised scale, so this error is in normalised units, not in births per day.
np.sqrt(mean_squared_error(birth_test, birth_forecast))

0.14471788568023372

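Much smaller error than the error obtained in the 1st part of Time Series Analysis (6.85 vs 0.14). However, the two numbers are not directly comparable: 0.14 is measured on the normalised 0–1 scale, while 6.85 is in births per day. The forecast has to be transformed back to the original scale before the errors can be compared.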

In [None]:
# The scaler works on 2-D column data, so turn the 1-D forecast into a single column first,
# then undo the min-max scaling to get the forecast back in the original units
birth_forecast_reshape = birth_forecast.reshape(-1, 1)
birth_forecast_reverse = scaler.inverse_transform(birth_forecast_reshape)
birth_forecast_reverse

array([[46.30550801],
       [46.11375997],
       [46.08395188],
       [46.07020585],
       [46.06259254],
       [46.05972848],
       [46.060874  ],
       [46.06542718],
       [46.07287786],
       [46.08279235],
       [46.09480187],
       [46.10859284],
       [46.12389862],
       [46.14049247],
       [46.15818161],
       [46.17680209],
       [46.19621451],
       [46.21630033],
       [46.23695877],
       [46.25810411],
       [46.27966348],
       [46.30157491],
       [46.32378569],
       [46.34625104],
       [46.36893283],
       [46.39179868],
       [46.41482103],
       [46.43797647],
       [46.46124507],
       [46.48460989],
       [46.50805653],
       [46.53157275],
       [46.55514813],
       [46.57877381],
       [46.60244227]])

In [None]:
# Same treatment for the held-out values: keep them as a single column and undo the scaling
birth_test_reshape = birth_test.reshape(-1, 1)
birth_test_reverse = scaler.inverse_transform(birth_test_reshape)

# Take the root mean squared error (RMSE), now on the original scale (births per day)

In [None]:
# RMSE after reversing the scaling on both arrays, so the error is expressed in the original units
# and can be compared with errors computed on the unscaled series.
np.sqrt(mean_squared_error(birth_test_reverse, birth_forecast_reverse))

7.235894284011686

# Another method besides normalisation we can use is **Standardization**

In [None]:
#Standardisation method

# z = (x - mean) / standard deviation
# mean:               np.mean()
# standard deviation: np.std()


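#Standardisation is more robust than normalisation: min-max scaling depends only on the observed minimum and maximum, so a single outlier squeezes the rest of the data, and new values outside the fitted range end up outside 0-1.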

In [None]:
from sklearn.preprocessing import StandardScaler

Standardisation works well for data that roughly follows a Gaussian (normal) distribution, as many naturally occurring measurements do.

In [None]:
# Create the standardiser and learn the mean and variance of the series in one step
# (fit() hands back the fitted scaler itself)
std_scaler = StandardScaler().fit(birth_arr)

In [None]:
# Mean learned by fit(); transform() subtracts it from every value.
std_scaler.mean_

array([41.98082192])

In [None]:
# Variance learned by fit(); transform() divides by its square root (the standard deviation).
std_scaler.var_

array([53.84894727])

In [None]:
# How the three objects relate:
#   birth_arr  - the raw data (births per day)
#   std_scaler - the fitted scaler; it stores the mean and variance it learned from birth_arr
#   std_birth  - the result of applying (x - mean) / standard deviation to birth_arr
std_birth = std_scaler.transform(birth_arr)

In [None]:
# First five standardised values; negative means below the series mean, positive above it.
std_birth[:5]

array([[-0.95130099],
       [-1.36012148],
       [-1.63266846],
       [-1.49639497],
       [ 0.27516045]])

In [None]:
# Same chronological split as for the normalised data: 330 days to fit, the next 35 days to evaluate
std_birth_train, std_birth_test = std_birth[0:330], std_birth[330:365]

In [None]:
# Same model order as for the normalised series, (p, d, q) = (2, 1, 2), so only the scaling differs.
std_birth_arima = ARIMA(std_birth_train, order = (2,1,2))

#Make sure you know how to identify the values of the order inside the ARIMA() function

You need to fit the data

In [None]:
# Estimate the model parameters on the standardised training data.
std_birth_arima_fit = std_birth_arima.fit()

See the AIC value

In [None]:
# AIC of the model fitted to the standardised series. AIC depends on the scale of the data,
# so only compare it with other models fitted to this same standardised series.
std_birth_arima_fit.aic

918.391559581354

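Very high AIC value compared with the normalised model (918.39 vs -344.26). This does not mean the model is worse: the model order is the same and only the scale of the data changed. AIC is computed from the likelihood, which shifts when the data are rescaled, so AIC values are only comparable between models fitted to exactly the same data.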

In [None]:
# Forecast as many steps as there are test observations, so forecast and test set always line up
# one-to-one even if the split point changes.
# NOTE(review): with the legacy ARIMA results object used here, forecast() returns a tuple and
# element [0] holds the point forecasts — confirm if the statsmodels import is ever changed.
std_birth_forecast = std_birth_arima_fit.forecast(steps = len(std_birth_test))[0]

In [None]:
# Display the forecast (still in standardised units).
std_birth_forecast

array([0.58971707, 0.56360195, 0.55933944, 0.55728093, 0.55609353,
       0.55558414, 0.55564623, 0.55619295, 0.55715079, 0.55845745,
       0.56006003, 0.56191368, 0.56398034, 0.56622771, 0.56862839,
       0.57115915, 0.57380027, 0.57653501, 0.57934918, 0.58223074,
       0.58516948, 0.58815672, 0.59118511, 0.59424842, 0.59734135,
       0.60045941, 0.60359879, 0.60675626, 0.60992908, 0.61311491,
       0.61631179, 0.61951805, 0.62273225, 0.6259532 , 0.62917987])

In [None]:
# Display the actual held-out values (standardised units) for comparison with the forecast.
std_birth_test

array([[ 1.22907491],
       [ 0.54770744],
       [ 0.27516045],
       [ 1.3653484 ],
       [ 0.41143395],
       [-1.36012148],
       [ 0.54770744],
       [-0.13366003],
       [-1.08757449],
       [-1.22384798],
       [-0.8150275 ],
       [ 0.95652792],
       [ 0.13888696],
       [ 0.13888696],
       [-1.08757449],
       [-0.40620702],
       [-0.95130099],
       [ 1.3653484 ],
       [ 0.68398093],
       [ 1.3653484 ],
       [-0.40620702],
       [-0.26993352],
       [ 0.00261346],
       [ 0.00261346],
       [ 1.5016219 ],
       [-0.40620702],
       [-0.26993352],
       [-0.54248051],
       [ 0.27516045],
       [-1.08757449],
       [-0.67875401],
       [ 1.3653484 ],
       [ 0.82025443],
       [ 1.77416889],
       [ 1.09280142]])

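#Forecast: reshape the forecast into a single column and reverse the scaling. Why? The model was trained on standardised values, so its forecasts are in standardised units; reversing the transformation puts forecast and test values back into births per day, where the error is interpretable and comparable with the other models.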

In [None]:
# Turn the 1-D forecast into a single-column 2-D array, the layout the scaler operates on
std_birth_forecast_reshape = std_birth_forecast.reshape(-1, 1)

In [None]:
# std_scaler is used here because it is the object that stored the mean and variance when it was
# fitted on birth_arr; only that same fitted scaler can undo the exact transformation
# (x = z * standard deviation + mean) and bring the forecast back to births per day.
std_birth_forecast_rev = std_scaler.inverse_transform(std_birth_forecast_reshape)

In [None]:
# inverse_transform needs a 2-D (n_samples, n_features) array. Reshaping to a flat 1-D array with
# reshape(len(std_birth_test)) is what raised the ValueError; keep a single-column 2-D shape instead,
# exactly as is done for the forecast.
std_birth_test_reshape = std_birth_test.reshape(len(std_birth_test), 1)
std_birth_test_rev = std_scaler.inverse_transform(std_birth_test_reshape)

ValueError: ignored

#You can write a function to do the main operations and give you the direct output (log standardisation, standardisation, normalisation)