In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Getting and Knowing Data

In [None]:
# Location of the airport-traffic CSV inside the Kaggle input directory.
csv_path = r'/kaggle/input/covid19s-impact-on-airport-traffic/covid_impact_on_airport_traffic.csv'
data = pd.read_csv(csv_path)

In [None]:
data.head(2)

In [None]:
data.tail(2)

In [None]:
data.shape

In [None]:
data.info()

In [None]:
# Store Date as datetime64 so that it can be sorted and used as a time index later on.
data = data.astype({'Date': 'datetime64[ns]'})

In [None]:
data.describe(include='all')

There are 4 countries, 23 states,27 cities and 28 airports in the dataset. Data has been collected for 262 distinct days starting from 16/03/20 to 02/12/20.

Centroid and Geography are POINT and POLYGON structures respectively, which tells us that they describe geographical locations.

Also, the count row shows that none of the features has missing values.

In [None]:
data.duplicated().sum() #No duplicates are present.

## Removing features that aren't important

Geography is a polygon feature, meaning it describes a shape, so we can conclude that it gives the outline of an airport. Since we don't require it for our analysis, we'll drop it.

The same reasoning applies to Centroid, which probably gives the latitude and longitude of the centre of the airport. Since we don't have any use for that, we'll drop it too.

ISO_3166_2 is some unique value for every state. We won't be requiring it for time series analysis.

AggregationMethod is always 'Daily', so, it doesn't provide any information. We can remove it.

No information on Version is provided, so we'll leave it out of our analysis.

In [None]:
# Discard the columns that the notes above identified as unnecessary for the analysis.
unused_columns = ['AggregationMethod', 'Version', 'Centroid', 'ISO_3166_2', 'Geography']
data = data.drop(unused_columns, axis='columns')

In [None]:
data.head()

# Univariate Analysis

In [None]:
# Number of records per airport.
plt.figure(figsize=(20, 6))
fig1 = sns.countplot(data=data, x='AirportName', palette='rainbow_r')
plt.xticks(rotation=90)  # draw the airport names vertically
fig1.set_title("Count for various Airports")
plt.show();
# Observation: each airport has around 250 data points, except Santiago International Airport
# and Edmonton International.

In [None]:
# Number of records per city.
plt.figure(figsize=(20, 6))
fig2 = sns.countplot(data=data, x='City', palette='viridis')
plt.xticks(rotation=90)  # draw the city names vertically
fig2.set_title("Count for various City")
plt.show();
# Observation: all cities have more or less equal counts except New York, most likely because
# it has more airports.

In [None]:
# Number of records per state.
plt.figure(figsize=(20, 6))
fig3 = sns.countplot(data=data, x='State', palette='cividis')
plt.xticks(rotation=90)  # draw the state names vertically
fig3.set_title("Count for various State")
plt.show();
# Observation: all states have equal counts except Alberta, Quebec, California and New York.
# Again, the most likely reason is the number of airports; this is revisited in the in-depth
# analysis of the countries.

In [None]:
# Number of records per country.
plt.figure(figsize=(8, 4))
fig4 = sns.countplot(data=data, x='Country', palette='summer')
country_labels = fig4.get_xticklabels()
fig4.set_xticklabels(country_labels)  # re-applies the existing labels; no rotation is requested
fig4.set_title("Count for various Country")
plt.show();
# Observation: the US has the most data points, followed by Canada, probably because they have
# more airports in the data than Australia and Chile.

In [None]:
data.groupby("Country")[['State','City','AirportName']].nunique()

## Distribution of PercentOfBaseline

In [None]:
# Kernel density estimate of PercentOfBaseline over the full dataset.
# `fill=True` is the supported spelling of the old `shade=True` keyword, which seaborn
# deprecated in v0.11 and has announced will become an error.
sns.kdeplot(data['PercentOfBaseline'], fill=True)
plt.title("Distribution of Percent of Baseline for full data")
plt.show();

# Bivariate Analysis

In [None]:
# seaborn pair plot of `data`, with observations coloured by Country.
fig5 = sns.pairplot(data,hue='Country',height=5,palette='husl',aspect=1)
# Remove the legend seaborn attached to the grid; a repositioned legend is added below.
# NOTE(review): `_legend` is a private seaborn attribute -- confirm it exists in the installed version.
fig5._legend.remove()
# NOTE(review): plt.title / plt.legend operate on pyplot's *current* axes only, which is
# presumably the last panel drawn by pairplot -- confirm this is the intended placement.
plt.title("Distribution of Percent of Baseline for different countries")
plt.legend(loc = 'upper right',bbox_to_anchor=(1.2, 0., 0.5, 0.5))
plt.show();

# Analysis For CHILE

In [None]:
# Keep only the rows recorded for Chile.
is_chile = data['Country'].eq('Chile')
data_chile = data.loc[is_chile]

In [None]:
data_chile.head()

In [None]:
data_chile.info()

In [None]:
data_chile.nunique()

Data on Chile is based on only one state which has one city with one airport named Santiago International airport

In [None]:
# Order the Chile rows from the earliest to the latest Date.
data_chile = data_chile.sort_values('Date', ascending=True)

In [None]:
# Use Date as the index so the frame behaves as a time series.
data_chile = data_chile.set_index('Date')

In [None]:
data_chile.head()

In [None]:
# Chile has a single airport/city/state, so these identifier columns are dropped.
location_columns = ['AirportName', 'City', 'State', 'Country']
data_chile = data_chile.drop(location_columns, axis='columns')

In [None]:
data_chile.head()

In [None]:
# Line chart of Chile's PercentOfBaseline against the Date index.
chile_baseline = data_chile['PercentOfBaseline']
plt.figure(figsize=(20, 10))
plt.plot(chile_baseline.index, chile_baseline.values)
plt.title("Plot for PercentOfBaseline Vs Time for Chile")
plt.show();

In [None]:
# Augmented Dickey-Fuller test on the raw Chile series (null hypothesis: non-stationary).
from statsmodels.tsa.stattools import adfuller

print('Results of Dickey-Fuller Test:')
dftest = adfuller(data_chile['PercentOfBaseline'], autolag='AIC')
# The first four entries are scalar results; the fifth is a dict of critical values.
adf_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
adf_summary = dict(zip(adf_labels, dftest[:4]))
adf_summary.update(
    ('Critical Value (%s)' % level, threshold) for level, threshold in dftest[4].items()
)
dfoutput = pd.Series(adf_summary)
print(dfoutput)

Dickey-Fuller Test: This is one of the statistical tests for checking stationarity. Here the null hypothesis is that the time series is non-stationary. The test results comprise a Test Statistic and some Critical Values for different confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary.

We can conclude that our data is not stationary. We therefore need to make it stationary, because classical time-series models such as ARMA assume a stationary series.

In [None]:
# KPSS test on the raw Chile series (regression='c').
from statsmodels.tsa.stattools import kpss

print ('Results of KPSS Test:')
kpsstest = kpss(data_chile['PercentOfBaseline'], regression='c')
# The first three entries are scalar results; the fourth is a dict of critical values.
kpss_summary = dict(zip(['Test Statistic','p-value','Lags Used'], kpsstest[:3]))
for level, threshold in kpsstest[3].items():
    kpss_summary['Critical Value (%s)' % level] = threshold
kpss_output = pd.Series(kpss_summary)
print (kpss_output)

The authors of the KPSS test define the null hypothesis as the process being trend stationary, against the alternative hypothesis of a unit-root series.

KPSS test also suggests that our series is NOT stationary.
https://www.analyticsvidhya.com/blog/2018/09/non-stationary-time-series-python/

Please see this to understand the warning https://stats.stackexchange.com/questions/286124/kpss-test-in-python-with-statsmodels/286167

In [None]:
# First-order difference: each value minus the value in the previous row (first row becomes NaN).
data_chile['diff'] = data_chile['PercentOfBaseline'].diff(periods=1)

In [None]:
# Line chart of the first-differenced Chile series against the Date index.
# NOTE(review): the title says "lagged", but the plotted column is the first difference.
chile_diff = data_chile['diff']
plt.figure(figsize=(20, 10))
plt.plot(chile_diff.index, chile_diff.values)
plt.title("Plot for lagged PercentOfBaseline Vs Time for Chile")
plt.show();


In [None]:
# Augmented Dickey-Fuller test on the differenced Chile series (leading NaN removed).
from statsmodels.tsa.stattools import adfuller

print('Results of Dickey-Fuller Test:')
dftest = adfuller(data_chile['diff'].dropna(), autolag='AIC')
# The first four entries are scalar results; the fifth is a dict of critical values.
adf_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
adf_summary = dict(zip(adf_labels, dftest[:4]))
adf_summary.update(
    ('Critical Value (%s)' % level, threshold) for level, threshold in dftest[4].items()
)
dfoutput = pd.Series(adf_summary)
print(dfoutput)

We reject null hypothesis and say series has become stationary now.

In [None]:
# KPSS test on the differenced Chile series (leading NaN removed, regression='c').
from statsmodels.tsa.stattools import kpss

print ('Results of KPSS Test:')
kpsstest = kpss(data_chile['diff'].dropna(), regression='c')
# The first three entries are scalar results; the fourth is a dict of critical values.
kpss_summary = dict(zip(['Test Statistic','p-value','Lags Used'], kpsstest[:3]))
for level, threshold in kpsstest[3].items():
    kpss_summary['Critical Value (%s)' % level] = threshold
kpss_output = pd.Series(kpss_summary)
print (kpss_output)

## Now, we have made the series to be strictly stationary, so we move onto modelling. Since, there is no seasonal component, we can use ARIMA Model. Let's first find out value of p, d and q

In [None]:
# Split the Chile series into trend, seasonal and residual components and plot each one.
from statsmodels.tsa.seasonal import seasonal_decompose

# NOTE(review): period=9 is taken as given -- confirm it matches the cycle expected in the data.
# NOTE(review): a multiplicative model presumably requires strictly positive values -- confirm.
decomposition = seasonal_decompose(
    x=data_chile['PercentOfBaseline'].dropna(),
    model='multiplicative',
    period=9,
)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per component, top to bottom.
panels = [
    (411, data_chile['PercentOfBaseline'], 'Original'),
    (412, trend, 'Trend'),
    (413, seasonal, 'Seasonality'),
    (414, residual, 'Residuals'),
]
plt.figure(figsize=(10, 10))
for position, component, caption in panels:
    plt.subplot(position)
    plt.plot(component, label=caption)
    plt.legend(loc='best')
plt.tight_layout();

In [None]:
# Autocorrelation of the differenced Chile series (lag 0 hidden), used to choose q.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

chile_diff_clean = data_chile['diff'].dropna()
plot_acf(chile_diff_clean, zero=False)
plt.xlim(0, 20)
plt.xticks(np.arange(20))
plt.show();  # q=1 or q=0

In [None]:
# Partial autocorrelation of the differenced Chile series (lag 0 hidden), used to choose p.
pacf_options = dict(zero=False, lags=40, method='ols', alpha=0.05)
plot_pacf(data_chile['diff'].dropna(), **pacf_options)
plt.xticks(np.arange(0, 40, 2))
plt.show();  # p=3,5

In [None]:
# One-column frame holding the differenced series that the model is fitted on.
df = data_chile['diff'].to_frame()

In [None]:
# Remove the NaN rows (differencing leaves the first row empty).
df = df.dropna()

In [None]:
from statsmodels.tsa.arima_model import ARMA

In [None]:
# Create Training and Test
# The first 212 differenced observations are used for fitting; the remainder is held out.
train = df.iloc[:212]
test = df.iloc[212:]

# Build Model
# `statsmodels.tsa.arima_model.ARMA` was removed in statsmodels 0.13 (it now raises
# NotImplementedError). An ARMA(p, q) model is an ARIMA(p, 0, q), so the maintained
# `statsmodels.tsa.arima.model.ARIMA` class is used instead.
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(train, order=(6, 0, 0))
fitted = model.fit()
print(fitted.summary())

# Forecast
# The horizon is derived from the hold-out size so that the forecast always lines up with
# `test.index` when it is wrapped in a Series in the plotting cell.
forecast = fitted.get_forecast(steps=len(test))
# Exposed as plain NumPy arrays: point forecast, its standard error, and the
# (lower, upper) interval bounds as the two columns of `conf`.
fc = np.asarray(forecast.predicted_mean)
se = np.asarray(forecast.se_mean)
conf = np.asarray(forecast.conf_int(alpha=0.05))  # 95% conf

The warning is given letting the user know that the index is not a date/time index. It won't have any effect on predictions.

In [None]:
# Make as pandas series, aligned with the dates of the hold-out set
lower_bound, upper_bound = conf[:, 0], conf[:, 1]
holdout_index = test.index
fc_series = pd.Series(fc, index=holdout_index)
lower_series = pd.Series(lower_bound, index=holdout_index)
upper_series = pd.Series(upper_bound, index=holdout_index)

# Plot training data, actual hold-out values, forecast and its confidence band
plt.figure(figsize=(12, 5), dpi=100)
for series, caption, colour in (
    (train, 'training', None),
    (test, 'actual', 'r'),
    (fc_series, 'forecast', 'g'),
):
    plt.plot(series, label=caption, color=colour)
plt.fill_between(lower_series.index, lower_series, upper_series, color='g', alpha=0.05)
plt.title('Forecast vs Actuals')
plt.legend(loc='best', fontsize=8)
plt.show()


# Analysis for USA

In [None]:
# Keep only the rows recorded for the United States.
is_us = data['Country'].eq('United States of America (the)')
data_US = data.loc[is_us]

In [None]:
data_US.shape

In [None]:
# Average PercentOfBaseline over all US rows for each Date, kept as a one-column frame.
us_daily_mean = data_US.groupby('Date', as_index=True)['PercentOfBaseline'].mean()
df1 = us_daily_mean.to_frame()

In [None]:
df1.head()

In [None]:
# Line chart of the US daily average PercentOfBaseline against Date.
plt.figure(figsize=(20, 10))
plt.plot(df1.index, df1['PercentOfBaseline'])
plt.title("Plot of USA's average PercentOfBaseline Vs Time")
plt.show()

In [None]:
# Augmented Dickey-Fuller test on the US daily average series.
from statsmodels.tsa.stattools import adfuller

print('Results of Dickey-Fuller Test:')
dftest = adfuller(df1['PercentOfBaseline'], autolag='AIC')
# The first four entries are scalar results; the fifth is a dict of critical values.
adf_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
adf_summary = dict(zip(adf_labels, dftest[:4]))
adf_summary.update(
    ('Critical Value (%s)' % level, threshold) for level, threshold in dftest[4].items()
)
dfoutput = pd.Series(adf_summary)
print(dfoutput)

We reject the null hypothesis and conclude that the series is stationary.

In [None]:
# KPSS test on the US daily average series (regression='c').
from statsmodels.tsa.stattools import kpss

print ('Results of KPSS Test:')
kpsstest = kpss(df1['PercentOfBaseline'].dropna(), regression='c')
# The first three entries are scalar results; the fourth is a dict of critical values.
kpss_summary = dict(zip(['Test Statistic','p-value','Lags Used'], kpsstest[:3]))
for level, threshold in kpsstest[3].items():
    kpss_summary['Critical Value (%s)' % level] = threshold
kpss_output = pd.Series(kpss_summary)
print (kpss_output)

In [None]:
# Split the US series into trend, seasonal and residual components and plot each one.
from statsmodels.tsa.seasonal import seasonal_decompose

# NOTE(review): period=9 is taken as given -- confirm it matches the cycle expected in the data.
decomposition = seasonal_decompose(
    x=df1['PercentOfBaseline'],
    model='multiplicative',
    period=9,
)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# One stacked panel per component, top to bottom.
panels = [
    (411, df1['PercentOfBaseline'], 'Original'),
    (412, trend, 'Trend'),
    (413, seasonal, 'Seasonality'),
    (414, residual, 'Residuals'),
]
plt.figure(figsize=(10, 10))
for position, component, caption in panels:
    plt.subplot(position)
    plt.plot(component, label=caption)
    plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Autocorrelation of the US series (lag 0 hidden), used to choose q.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

us_baseline = df1['PercentOfBaseline']
plot_acf(us_baseline, zero=False)
plt.xlim(0, 10)
plt.xticks(np.arange(10))
plt.show()  # q=2

In [None]:
# Partial autocorrelation of the US series (lag 0 hidden), used to choose p.
pacf_options = dict(lags=20, zero=False)
plot_pacf(df1['PercentOfBaseline'], **pacf_options)
plt.xticks(np.arange(20))
plt.show()  # p=1

In [None]:
from statsmodels.tsa.arima_model import ARMA

In [None]:
# Create Training and Test
# The first 80% of the rows are used for fitting; the remaining rows are held out.
size = int(len(df1['PercentOfBaseline'])*0.8)
train = df1['PercentOfBaseline'].iloc[:size]
test = df1['PercentOfBaseline'].iloc[size:]

# Build Model
# `statsmodels.tsa.arima_model.ARMA` was removed in statsmodels 0.13 (it now raises
# NotImplementedError). An ARMA(p, q) model is an ARIMA(p, 0, q), so the maintained
# `statsmodels.tsa.arima.model.ARIMA` class is used instead.
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(train, order=(1, 0, 2))
fitted = model.fit()
print(fitted.summary())

# Forecast
forecast = fitted.get_forecast(steps=len(test))
# Exposed as plain NumPy arrays: point forecast, its standard error, and the
# (lower, upper) interval bounds as the two columns of `conf`.
fc = np.asarray(forecast.predicted_mean)
se = np.asarray(forecast.se_mean)
conf = np.asarray(forecast.conf_int(alpha=0.05))  # 95% conf

# Make as pandas series, aligned with the dates of the hold-out set
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)

In [None]:
# Plot training data, actual hold-out values, forecast and its confidence band
plt.figure(figsize=(12, 5), dpi=100)
for series, caption, colour in (
    (train, 'training', None),
    (test, 'actual', 'r'),
    (fc_series, 'forecast', 'g'),
):
    plt.plot(series, label=caption, color=colour)
plt.fill_between(lower_series.index, lower_series, upper_series, color='g', alpha=0.05)
plt.title('Forecast vs Actuals')
plt.legend(loc='best', fontsize=8)
plt.show()

# Hope you all liked it!
# I'd appreciate if you'll leave comments below for any suggestions.