Importing the necessary libraries and models.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf 
from statsmodels.tsa.arima_model import ARIMA

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

Importing the datasets.

In [None]:
# Read the population CSV from the Kaggle input mount and preview the first rows.
POP_CSV_PATH = '/kaggle/input/population-time-series-data/POP.csv'
df = pd.read_csv(POP_CSV_PATH)
df.head()

Removing unnecessary columns.

In [None]:
# Drop the two columns that are not used anywhere in the analysis below.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone.
df = df.drop(columns=['realtime_start', 'realtime_end'], errors='ignore')
df.head()

Check for the datatypes of the dataframe.

In [None]:
# Show the dtype pandas assigned to each column of the frame.
df.dtypes

Convert the date column to datetime and set it as the index.

In [None]:
# Parse the 'date' column (YYYY-MM-DD strings) into datetimes and use it as the index.
# The guard makes the cell idempotent: after the first run 'date' is the index,
# not a column, so re-running the original in-place version would fail.
if 'date' in df.columns:
    df = df.assign(date=pd.to_datetime(df['date'], format='%Y-%m-%d')).set_index('date')
df.head()

Plot the data to view it

In [None]:
# Default figure size for every plot in the notebook.
plt.rcParams['figure.figsize'] = (12,6)

# Plot the raw series with a title and axis labels so the figure stands on its own;
# plt.show() also suppresses the Line2D repr that a bare plt.plot() leaves as cell output.
plt.plot(df['value'])
plt.title('Population over time')
plt.xlabel('date')
plt.ylabel('value')
plt.show()

Plotting rolling mean and rolling standard deviation.

In [None]:
# 12-observation rolling mean and standard deviation of the whole frame.
rolling_12 = df.rolling(window=12)
rolmean = rolling_12.mean()
rolstd = rolling_12.std()

# Draw both statistics on one figure.
for curve, colour, caption in (
    (rolmean, 'red', 'Rolling avg'),
    (rolstd, 'black', 'Rolling std'),
):
    plt.plot(curve, color=colour, label=caption)
plt.legend(loc='best')
plt.show()

Defining a function to test the stationarity of the series via the Dickey Fuller test.

In [None]:
def test_stationarity(data, window=12):
    """Plot rolling statistics of ``data`` and print an ADF (Dickey-Fuller) test report.

    Parameters
    ----------
    data : pandas.Series
        Series to examine. It is passed straight to ``adfuller``, so it should
        not contain NaNs (callers in this notebook drop them first).
    window : int, default 12
        Number of observations in the rolling mean / rolling std window.
        The default reproduces the previously hard-coded value.

    Returns
    -------
    None
        The function only draws a figure and prints the test results.
    """
    rolmean = data.rolling(window=window).mean()
    rolstd = data.rolling(window=window).std()

    # Original series together with its rolling mean and rolling std.
    plt.plot(data, label='Original data')
    plt.plot(rolmean, color='red', label='Rolling avg')
    plt.plot(rolstd, color='black', label='Rolling std')
    plt.legend(loc='best')
    plt.show()

    print('Results of the Dickey Fuller test:')
    dftest = adfuller(data, autolag='AIC')
    # The first four entries of the result are labelled below; the fifth is a
    # mapping of critical values that is appended entry by entry.
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

Calculate the moving average of the dataset and check the stationarity after subtracting it from the original dataset. This subtraction is a step in removing the trend of the time series.

In [None]:
# 12-observation moving average of the data, and the data with that average removed.
dfMA = df.rolling(window=12).mean()
dfMAdiff = df.sub(dfMA)

In [None]:
# Stationarity check on the moving-average-detrended series; NaNs are dropped
# first because the rolling mean is undefined until a full window is available.
test_stationarity(dfMAdiff['value'].dropna())

The p-value from the above result shows that the series is still not stationary, so we difference it (subtract from each value the value shifted by one step) to make the series stationary.

In [None]:
# First difference of the detrended series (each value minus the previous one),
# followed by the same stationarity check on the non-NaN part.
dfMAdiffshift = dfMAdiff.diff()
differenced_values = dfMAdiffshift['value'].dropna()
test_stationarity(differenced_values)

The p-value is much below the significance level of 0.05 so the series has become stationary.

Decomposing the time series into its various components.

In [None]:
# Decompose the differenced series and keep each component under its own name.
decomposition = seasonal_decompose(dfMAdiffshift['value'].dropna())

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per series: the input first, then trend, seasonality, residuals.
panels = (
    (dfMAdiffshift, 'Original'),
    (trend, 'Trend'),
    (seasonal, 'Seasonality'),
    (residual, 'Residuals'),
)
for position, (series, caption) in enumerate(panels, start=1):
    plt.subplot(4, 1, position)
    plt.plot(series, label=caption)
    plt.legend(loc='best')
plt.tight_layout()
plt.show()

Plot ACF and PACF plots to get the value of p and q for the ARIMA model.

In [None]:
# ACF and PACF of the differenced series for lags 0..12.
stationary = dfMAdiffshift.dropna()
lag_acf = acf(stationary, nlags=12)
lag_pacf = pacf(stationary, nlags=12, method='ols')

# Reference lines at 0 and at +/-1.96/sqrt(N), the usual approximate 95% band.
conf_bound = 1.96 / np.sqrt(len(stationary))

for values, heading in (
    (lag_acf, 'Autocorrelation Function'),
    (lag_pacf, 'Partial Autocorrelation Function'),
):
    plt.plot(values)
    for level in (0, -conf_bound, conf_bound):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(heading)
    plt.show()

From the above graphs, the values of p and q are found to be 2 and 3 respectively.

Using the ARIMA model for the timeseries prediction.

In [None]:
# Discard the NaN rows of the differenced series once, up front.
dfMAdiffshift = dfMAdiffshift.dropna()

# The model is fitted on the detrended (not yet differenced) series with d=1,
# i.e. the one-step differencing is left to the model itself.
# NOTE(review): ARIMA here comes from the legacy statsmodels.tsa.arima_model
# module, which newer statsmodels releases no longer support - confirm the
# pinned statsmodels version before re-running.
model = ARIMA(dfMAdiff.dropna(), order=(2, 1, 3))
result = model.fit()

# Overlay the fitted values on the differenced series.
for series, caption in ((dfMAdiffshift, 'Original'), (result.fittedvalues, 'fit')):
    plt.plot(series, label=caption)
plt.legend(loc='best')

View the summary of the above test performed.

In [None]:
# Display the summary table of the fitted ARIMA result.
result.summary()

To check how our data looks with respect to the original dataset, we need to perform some elementary operations to convert it back to the original form.

Check the fitted values or the outcome of the ARIMA model on the given dataset.

In [None]:
# Copy the fitted values of the ARIMA result into a standalone Series
# and print its first rows.
predictions_ARIMA_diff = pd.Series(result.fittedvalues,copy = True)
print(predictions_ARIMA_diff.head())

Using the cumsum() function to get the cumulative sum of the fitted values obtained.

In [None]:
# Running total (cumulative sum) of the fitted values, then a preview of the first rows.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_cumsum.head()

In [None]:
# Rebuild the fit on the detrended scale: start from the first observed
# dfMAdiff value and add the accumulated fitted differences. Only the dates
# where dfMAdiff is defined are used, so the leading NaN window is not seeded.
madiff_observed = dfMAdiff['value'].dropna()
predictions_ARIMA = pd.Series(madiff_observed.iloc[0], index=madiff_observed.index)
predictions_ARIMA = predictions_ARIMA.add(predictions_ARIMA_diff_cumsum, fill_value=0)

# Undo the detrending step (dfMAdiff = df - dfMA) by adding the moving average
# back; without this the series stays on the detrended scale and cannot be
# compared with the original data.
predictions_ARIMA = predictions_ARIMA + dfMA['value'].reindex(predictions_ARIMA.index)
print(predictions_ARIMA.head())

In [None]:
# Compare the original data with the back-transformed ARIMA fit.
plt.plot(df,color='blue',label='Original data')
plt.plot(predictions_ARIMA,color='red',label='Fit')
# The labels above are only rendered when a legend is actually drawn.
plt.legend(loc='best')
plt.show()

The plot shows that this model has failed to capture the seasonality of the data.

Hence, we will use the SARIMAX model for the population prediction. This model is fit for timeseries with seasonality in the data.

In [None]:
# Seasonal model on the raw series: (2,1,3) non-seasonal order and
# (2,1,3) seasonal order with a period of 12.
sarimax_spec = sm.tsa.statespace.SARIMAX(
    df['value'],
    order=(2, 1, 3),
    seasonal_order=(2, 1, 3, 12),
)
modelsarimax = sarimax_spec.fit()  # note: this name holds the fitted results

# Predictions from 2020-01-01 through 2031-01-01.
resultsarimax = modelsarimax.predict(
    start='2020-01-01',
    end='2031-01-01',
    freq='MS',
    dynamic=True,
)

# Original series and predictions on the same axes.
for series, colour, caption in (
    (df['value'], 'blue', 'Original data'),
    (resultsarimax, 'red', 'Predicted data'),
):
    plt.plot(series, color=colour, label=caption)
plt.legend(loc='best')

This model follows the shape of the time series well and produces predictions from 2020-01-01 through 2031-01-01 (about 11 years).

In [None]:
# Wrap the SARIMAX predictions in a Series and display it as the cell output.
forecast = pd.Series(resultsarimax)
forecast

These are the predicted population values for 2020-01-01 through 2031-01-01.