#  Time Series Forecasting 


# Import libraries and data files

In [None]:
import numpy as np 
import pandas as pd 
from fbprophet import Prophet
import matplotlib.pyplot as plt
import math as math
import seaborn as sns

from datetime import datetime

from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Read the first training file of the web-traffic dataset into a DataFrame.
TRAIN_CSV_PATH = "/kaggle/input/web-traffic-time-series-forecasting/train_1.csv.zip"
train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows of the loaded frame.
train.head()

##  Missing values

In [None]:
# Report the total number of rows, then how many of them have at least one gap.
print("Number of data: ", len(train.index), "\n")

# Boolean mask over rows: True where any column in that row is null.
row_has_gap = train.isna().any(axis="columns")
Missing = train.loc[row_has_gap]
print("Number of records contain 1+ null: ", len(Missing.index), "\n")

In [None]:
# Count the null entries in every column.
missing_values_count = train.isna().sum()

# Display the counts for the first ten columns only.
missing_values_count.iloc[:10]

In [None]:
# Show the ten first and the ten last rows that contain missing values.
n_missing = len(Missing)
edge_positions = np.concatenate(
    (np.arange(0, 10), np.arange(n_missing - 10, n_missing))
)
Missing.iloc[edge_positions]

In [None]:
# Fill gaps along each page's own timeline (axis=1). The default axis=0
# interpolates down the rows, i.e. between whichever pages happen to sit next
# to each other in the file, which mixes unrelated series. The text column
# 'Page' is excluded because it cannot be interpolated.
# With the default fill direction, values before a page's first observation
# stay NaN; later steps (mean/median/resample/dropna) skip them.
date_columns = train.columns.drop('Page')
train[date_columns] = train[date_columns].interpolate(axis=1)
train.isnull().sum()

##  Data visualization

In [None]:
def plot_time_series(df, row_num, start_col =1, ax=None):
    """Draw one row of *df* as a dotted time-series plot.

    Args:
        df: Frame whose first column supplies the plot title and whose
            columns from ``start_col`` onward supply the plotted values.
        row_num: Positional index of the row to draw.
        start_col: Positional index of the first plotted column.
        ax: Axes to draw on. When omitted, a new white 10x6 figure with a
            single subplot is created.
    """
    if ax is None:
        ax = plt.figure(facecolor='w', figsize=(10, 6)).add_subplot(111)

    observations = df.iloc[row_num, start_col:]
    observations.plot(style=".", ax=ax)

    page_label = df.iloc[row_num, 0]
    ax.set_title("Series: %s" % page_label)

# Plot four sample rows, one per stacked subplot.
sample_rows = (1, 10, 100, 1005)
fig, axs = plt.subplots(len(sample_rows), 1, figsize=(12, 12))
for sample_row, sample_ax in zip(sample_rows, axs):
    plot_time_series(train, sample_row, ax=sample_ax)

plt.tight_layout()

 
## Article names 

In [None]:
# Reshape the last 50 columns into long format: one row per (Page, date).
recent_columns = list(train.columns[-50:]) + ['Page']
train_flattened = pd.melt(train[recent_columns], id_vars='Page', var_name='date', value_name='Visits')
train_flattened['date'] = train_flattened['date'].astype('datetime64[ns]')
# dayofweek runs 0-6 starting on Monday, so `// 5 == 1` is true for Sat/Sun.
train_flattened['weekend'] = ((train_flattened.date.dt.dayofweek) // 5 == 1).astype(float)

# Per-page max, median and mean in a single pass. Previously the mean frame
# was overwritten by the max frame before the join, so 'mean' never reached
# train_flattened. Column order keeps 'max' and 'median' first, as before.
page_stats = train_flattened.groupby('Page')['Visits'].agg(['max', 'median', 'mean'])

# Merging data
train_flattened = train_flattened.set_index('Page').join(page_stats)

In [None]:
# Turn the 'Page' index back into a regular column.
train_flattened.reset_index(drop=False, inplace=True)
# Vectorised weekday lookup (Monday=0). This replaces a Python lambda applied
# row by row and matches the .dt accessor used for the other date features.
train_flattened['weekday'] = train_flattened['date'].dt.weekday

In [None]:
# Derive calendar features from the 'date' column.
date_parts = train_flattened['date'].dt
train_flattened['year'] = date_parts.year
train_flattened['month'] = date_parts.month
train_flattened['day'] = date_parts.day

train_flattened.head()

In [None]:
# Inspect the current row index of the long-format frame.
train_flattened.index

In [None]:
# Unique page identifiers. The language lookup that follows searches each
# entry for '.wikipedia', which only appears in the 'Page' values; the frame's
# column labels ('date', 'Visits', ...) never contain it.
page_name = train_flattened['Page'].unique()

In [None]:
# Extract the language of each page from its name. Collecting the codes in a
# set keeps only unique entries, so the number of languages is len(lang).
lang = set()
for k in page_name:
    index = k.find('.wikipedia')
    # find() returns -1 when the marker is absent; slicing relative to -1 would
    # pick two unrelated characters near the end of the name. Names with fewer
    # than two characters before the marker have no code to extract either.
    if index < 2:
        continue
    # The two characters immediately before '.wikipedia' are the language code.
    lang.add(k[index - 2:index])
print(lang)

In [None]:
# Inspect the dtype of every column in the long-format frame.
train_flattened.dtypes

In [None]:
# Make sure 'date' holds datetimes, then use it as the row index.
parsed_dates = pd.to_datetime(train_flattened['date'])
train_flattened = train_flattened.assign(date=parsed_dates).set_index('date')

In [None]:
# Scatter the visit counts against each calendar feature; rows containing any
# NaN are dropped before plotting.
calendar_features = ['weekday', 'year', 'month', 'day']
point_style = dict(alpha=0.15, linewidth=0)
sns.pairplot(
    train_flattened.dropna(),
    x_vars=calendar_features,
    y_vars='Visits',
    height=5,
    plot_kws=point_style,
)
plt.suptitle('Visit by weekday, year of Month , day')
plt.show()

In [None]:
# Average the visits of all rows that fall into each weekly ('W') bin.
weekly_bins = train_flattened['Visits'].resample('W')
y = weekly_bins.mean()

In [None]:
# Order the weekly series by its index and display it.
y = y.sort_index()
y

## Stationarity For Sample

In [None]:
from statsmodels.tsa.stattools import adfuller


def test_stationarity(timeseries, window=12):
    """Plot rolling statistics and print an augmented Dickey-Fuller test.

    Args:
        timeseries: One-dimensional sequence of observations; it is plotted
            as-is and passed unchanged to ``adfuller``.
        window: Number of consecutive observations in each rolling window.
            Defaults to 12, the value that used to be hard-coded.
    """
    # Determine rolling statistics (build the Series once and reuse it).
    series = pd.Series(timeseries)
    rolmean = series.rolling(window=window).mean()
    rolstd = series.rolling(window=window).std()

    # Plot rolling statistics. The line handles returned by plt.plot are not
    # needed, so they are no longer bound to unused locals.
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Perform Dickey-Fuller test. The first four entries of the result are
    # labelled below; the fifth is a mapping of critical values.
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [None]:
# Run the stationarity check on the weekly series.
test_stationarity(y)

## ARIMA Model

In [None]:
import itertools

# Each of p, d and q is tried at the values 0 and 1.
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
# The same grid for the seasonal part, with a seasonal period of 12 appended.
seasonal_pdq = [combo + (12,) for combo in pdq]

print('Examples of parameter combinations for Seasonal ARIMA...')
example_orders = (pdq[1], pdq[1], pdq[2], pdq[2])
for example_order, example_seasonal in zip(example_orders, seasonal_pdq[1:5]):
    print('SARIMAX: {} x {}'.format(example_order, example_seasonal))

In [None]:
# `sm` was previously bound only by the later model-fitting cell, so every
# iteration here raised NameError, which the bare `except:` then swallowed and
# the loop printed nothing. Import it where it is first used.
import statsmodels.api as sm

# Fit every (order, seasonal_order) pair and print its AIC so the candidates
# can be compared.
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(y,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
        except Exception as err:
            # Still best-effort: a combination that cannot be fitted is
            # skipped, but the reason is reported instead of hidden.
            print('ARIMA{}x{}12 - skipped: {}'.format(param, param_seasonal, err))
            continue
        print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))

### Fitting the ARIMA model

In [None]:
import statsmodels.api as sm

# Fit a seasonal ARIMA with order (1, 1, 1) and seasonal order (1, 1, 0, 12)
# on the weekly series, without enforcing invertibility.
model_options = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 0, 12),
    enforce_invertibility=False,
)
mod = sm.tsa.statespace.SARIMAX(y, **model_options)
results = mod.fit()

# Print only the second table of the fit summary.
summary_tables = results.summary().tables
print(summary_tables[1])

In [None]:
# Score the non-dynamic in-sample predictions from one start date onward.
forecast_start = pd.to_datetime('2017-01-01')
pred = results.get_prediction(start=forecast_start, dynamic=False)

y_forecasted = pred.predicted_mean
# Slice the observations from the same date as the predictions. The slice used
# to start at '2015-01-01'; index alignment hid the mismatch (non-overlapping
# dates became NaN and were skipped by mean()), but the two ranges are meant
# to be identical.
y_truth = y[forecast_start:]
mse = ((y_forecasted - y_truth) ** 2).mean()
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))

In [None]:
# The root of the mean squared error, rounded to two decimals for display.
rmse = np.sqrt(mse)
print('The Root Mean Squared Error of our forecasts is {}'.format(round(rmse, 2)))

In [None]:
# Forecast 100 steps past the end of the sample and draw them, together with
# their confidence band, on top of the observed series.
pred_uc = results.get_forecast(steps=100)
pred_ci = pred_uc.conf_int()

ax = y.plot(label='observed', figsize=(14, 7))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')

# The first two columns of the interval frame bound the shaded region.
band_first, band_second = pred_ci.iloc[:, 0], pred_ci.iloc[:, 1]
ax.fill_between(pred_ci.index, band_first, band_second, color='k', alpha=.25)

ax.set(xlabel='Date', ylabel='Visit')
plt.legend()
plt.show()