In [None]:
"""Importing required libraries"""
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Notebook-wide plotting defaults: 10x10 inch figures drawn with the ggplot theme.
plt.rcParams.update({'figure.figsize': (10, 10)})
plt.style.use('ggplot')

# Data

In [None]:
# Read the raw population CSV from the Kaggle input directory and display it.
csv_path = '/kaggle/input/population-time-series-data/POP.csv'
data = pd.read_csv(csv_path)
data

Let's drop the real-time columns (`realtime_start`, `realtime_end`), as we are concentrating only on how the population varies over the date range.

In [None]:
# The real-time bookkeeping columns are not used anywhere in the analysis.
data = data.drop(columns=['realtime_start', 'realtime_end'])

In [None]:
"""Converting the date column into datetime object and setting it as index"""
# Parse the date strings into datetimes, then promote that column to the index.
data = data.assign(date=pd.to_datetime(data['date'])).set_index('date')
data.head()

In [None]:
# Summary statistics of the frame, displayed as the cell output.
data.describe()

# Data Visualisation

In [None]:
# Quick visual check of the series plotted against the date index.
data.plot()

In [None]:
# Autocorrelation of the population values across all lags.
pd.plotting.autocorrelation_plot(data['value'])

The autocorrelation reaches the 99% confidence level at a lag of almost 280 points.

In [None]:
# Correlation between the series and a copy of itself shifted by 300 observations.
data['value'].autocorr(lag=300)

# Seasonality?????

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Decompose the raw series and draw the resulting components.
decomposed = seasonal_decompose(data['value'])
# Assigning the returned figure keeps it from being echoed as the cell output.
x = decomposed.plot()

There is a positive trend from 1952 to 2019.

## ADF Test

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Augmented Dickey-Fuller test on the raw series.
print('Significance level : 0.05')
adf = adfuller(data['value'])
# adfuller returns (test statistic, p-value, ...).  The value to compare with
# the 0.05 significance level is the p-value at index 1; the statistic itself
# lives at index 0, so report each under its proper name.
print(f'ADF test statistic is {adf[0]}')
print(f'ADF p-value is {adf[1]}')

The value printed above (the ADF p-value) is far above the 0.05 significance level, which implies the data is non-stationary.

Making the data stationary

## 1st Difference

In [None]:
# First difference of the series; the first row becomes NaN.
data['stationary'] = data['value'].diff()

In [None]:
# Visual check of the first-differenced series.
data['stationary'].plot()

In [None]:
# Augmented Dickey-Fuller test on the first-differenced series (NaN from diff removed).
print('Significance level : 0.05')
adf = adfuller(data['stationary'].dropna())
# adfuller returns (test statistic, p-value, ...).  The value to compare with
# the 0.05 significance level is the p-value at index 1; the statistic itself
# lives at index 0, so report each under its proper name.
print(f'ADF test statistic is {adf[0]}')
print(f'ADF p-value is {adf[1]}')

## 2nd Difference

In [None]:
# Difference the already-differenced column once more (second difference).
data['stationary2'] = data['stationary'].diff()

In [None]:
# Visual check of the second-differenced series.
data['stationary2'].plot()

In [None]:
# Augmented Dickey-Fuller test on the second-differenced series (NaNs from diff removed).
print('Significance level : 0.05')
adf = adfuller(data['stationary2'].dropna())
# adfuller returns (test statistic, p-value, ...).  The value to compare with
# the 0.05 significance level is the p-value at index 1; the statistic itself
# lives at index 0, so report each under its proper name.
print(f'ADF test statistic is {adf[0]}')
print(f'ADF p-value is {adf[1]}')

The ADF result implies that the data has become stationary. We can now proceed to modelling the forecast.

In [None]:
# Decompose the second-differenced series (NaNs dropped) and draw its components.
t = seasonal_decompose(data['stationary2'].dropna())
# Assigning the returned figure keeps it from being echoed as the cell output.
x = t.plot()

# Forecasting

## Auto-Regression

In [None]:
from statsmodels.tsa.ar_model import AR

In [None]:
"""Creating train & Test dataset"""

# Second-differenced series with the leading NaNs introduced by diff() removed.
X = data['stationary2'].dropna()

# Hold out the last `n_test` observations for evaluation.  Because dropna()
# has already removed the NaN rows, the training slice starts at position 0;
# starting at 1 would silently discard a valid observation.
n_test = 280
split_at = len(X) - n_test
train_df, test_df = X.iloc[:split_at], X.iloc[split_at:]

In [None]:
"""Training the model"""

# Build the autoregressive model on the training slice and fit it with default settings.
# NOTE(review): `AR` appears to have been dropped from recent statsmodels releases
# in favour of `AutoReg` -- confirm against the installed version.
model = AR(train_df)
model_fitted = model.fit()

In [None]:
# Report the lag order held by the fitted model, followed by a blank separator line.
print(f'The chosen lag value is {model_fitted.k_ar}',end='\n \n')

# Report the fitted parameters.
print(f'The coefficents are {model_fitted.params}')

In [None]:
"""Make predictions"""

# Forecast one value per held-out observation, starting right after the
# final training position.
first_step = len(train_df)
last_step = first_step + len(test_df) - 1
predictions = model_fitted.predict(start=first_step, end=last_step, dynamic=False)

In [None]:
"""Comparing with actual data"""

# Put the held-out values and the forecasts side by side, then give the
# two columns readable names.
compare_df = pd.concat([test_df, predictions], axis=1)
compare_df = compare_df.rename(columns={'stationary2': 'actual', 0: 'predicted'})

In [None]:
# Overlay the actual and predicted columns on one chart.
compare_df.plot()

## ARIMA

In [None]:
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

In [None]:
# Start the ARIMA section from a fresh copy of the raw file, again without
# the real-time columns (this rebinds `data`, discarding the earlier columns).
data = (
    pd.read_csv('/kaggle/input/population-time-series-data/POP.csv')
    .drop(columns=['realtime_start', 'realtime_end'])
)
data

## Identifying the differencing term (d)

In [None]:
# Each row shows a series on the left and its autocorrelation on the right:
# the original values, then the first and second differences.
value = data['value']
first_diff = value.diff()
second_diff = first_diff.diff()

fig, axes = plt.subplots(3, 2)

x = axes[0, 0].plot(value)
axes[0, 0].set_title('Original Series')
a = plot_acf(value.values, ax=axes[0, 1])

y = axes[1, 0].plot(first_diff)
axes[1, 0].set_title('1st Difference')
b = plot_acf(first_diff.dropna(), ax=axes[1, 1])

z = axes[2, 0].plot(second_diff)
axes[2, 0].set_title('2nd Difference')
c = plot_acf(second_diff.dropna(), ax=axes[2, 1])

From the figure we can conclude that the 2nd difference makes the data stationary. However, the ACF tends towards negative values, which implies the series is over-differenced. Hence, we will go with the 1st difference even though it is still slightly non-stationary.

## Identifying AR term (p)

In [None]:
# Wide, short figures suit the side-by-side panels used from here on.
plt.rcParams.update({'figure.figsize': (9, 3), 'figure.dpi': 120})

first_diff = data['value'].diff()

fig, axes = plt.subplots(1, 2)

# Left: the first-differenced series.  Right: its partial autocorrelation.
a = axes[0].plot(first_diff)
axes[0].set_title('1st Difference')
b = plot_pacf(first_diff.dropna(), ax=axes[1])

plt.show()

Lag 1 has a fairly significant value compared to the other lags. Hence, I am taking the p term as 1.

## Identifying the MA term (q)

In [None]:
first_diff = data['value'].diff()

fig, axes = plt.subplots(1, 2)

# Left: the first-differenced series.  Right: its autocorrelation.
a = axes[0].plot(first_diff)
axes[0].set_title('1st Difference')
b = plot_acf(first_diff.dropna(), ax=axes[1])

All the values are above the significance level, so I am tentatively going with a value of 2 for q.

> "If your series is slightly under-differenced, adding one or more additional AR terms usually makes up for it. Likewise, if it is slightly over-differenced, try adding an additional MA term."

## Building ARIMA model

In [None]:
from statsmodels.tsa.arima_model import ARIMA

In [None]:
# Fit ARIMA(p=1, d=1, q=2) on the raw population values.
# The d=1 term makes the model apply the single round of differencing itself,
# so the input must not be differenced beforehand: feeding it
# data['value'].diff() would difference the series twice, contradicting the
# choice of first differencing and the way the model is fitted on the raw
# training data in the cross-validation section.
model = ARIMA(data['value'], order=(1, 1, 2))
model_fit = model.fit(disp=0)
print(model_fit.summary())

Here the MA2 term has a p-value well below the significance level, and the AR term also looks good. Hence we can move forward with this model.

In [None]:
# Wrap the model residuals in a frame so they can be plotted directly.
residuals = pd.DataFrame(model_fit.resid)

fig, axes = plt.subplots(1, 2)

# Left: residuals in observation order.  Right: their kernel density estimate.
residuals.plot.line(title='Residuals', ax=axes[0])
residuals.plot.kde(title='Density', ax=axes[1])

There are residual values which require our consideration.

Let's plot the predicted values.

In [None]:
# Same wide layout as before, at a slightly lower resolution.
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':100})

# Draw the fitted model's predictions using the results object's own plotting helper.
# NOTE(review): `plot_predict` as a results method belongs to the legacy ARIMA
# API imported above -- confirm it exists in the installed statsmodels version.
x = model_fit.plot_predict(dynamic=False)
plt.show()

## Out of Time Cross-Validation

In [None]:
#Train & Test Data
# Positional split: the first 500 observations train the model and the
# remainder is held out for validation.
split_point = 500
train_data = data['value'].iloc[:split_point]
test_data = data['value'].iloc[split_point:]

In [None]:
# Refit the same (1, 1, 2) order, this time on the training slice only.
model1 = ARIMA(train_data,order=(1,1,2))
model_fitted1 = model1.fit(disp= -1)

In [None]:
# Forecast exactly one step per held-out observation.  Deriving the horizon
# from test_data (rather than hard-coding 316) keeps the forecast aligned with
# test_data.index when the dataset length or the split point changes.
# The call returns the point forecasts, their standard errors and the
# confidence-interval bounds.
fc, se, conf = model_fitted1.forecast(len(test_data))

In [None]:
# Re-index the forecast and both confidence bounds onto the held-out
# observations so they line up with the actual values when plotted.
forecast_index = test_data.index
fc_series = pd.Series(fc, index=forecast_index)
lower_series = pd.Series(conf[:, 0], index=forecast_index)
upper_series = pd.Series(conf[:, 1], index=forecast_index)

In [None]:
plt.figure(figsize=(12, 5), dpi=100)

# Training data, held-out data and the forecast, drawn in that order.
curves = [
    (train_data, {'label': 'Training'}),
    (test_data, {'label': 'Actual'}),
    (fc_series, {'label': 'Forcast', 'color': 'green'}),
]
for series, line_kwargs in curves:
    plt.plot(series, **line_kwargs)

# Shade the band between the lower and upper confidence bounds.
plt.fill_between(lower_series.index, lower_series, upper_series, color='k', alpha=.15)

plt.title('Actual Vs Forcast')
plt.legend(loc='upper left')

The graph is self-explanatory: the model we implemented has predicted the held-out data fairly accurately.

Conclusion: Implemented time-series forecasting models with AutoRegression and ARIMA, with good accuracy.