**Imports**

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

from statsmodels.tsa.holtwinters import Holt
from statsmodels.tsa.holtwinters import ExponentialSmoothing

from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


**Loading data**

In [None]:
# Read the zipped training CSV into a single DataFrame.
# The file is large, so this cell takes a while to run.
df = pd.read_csv(
    '../input/web-traffic-time-series-forecasting/train_1.csv.zip'
)

The dataset consists of approximately 145k time series. Each time series represents the number of daily views of a different Wikipedia article, from July 1st, 2015 through December 31st, 2016.

Checking the Dataframe

In [None]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(5)

From the ~145k page time series, select the ones whose page name starts with "300", which include the movie **"300 *Rise of an Empire*"**

In [None]:
# Keep only the rows whose page name starts with "300".
# `na=False` makes missing page names count as "no match", which is what the
# previous `== True` comparison achieved implicitly.
starts_with_300 = df['Page'].str.match('^300', na=False)
ts300 = df[starts_with_300]
ts300

Exporting working timeseries.

In [None]:
# Persist the selected rows (row labels included) for later inspection.
ts300.to_csv(path_or_buf='op.csv', index=True)

Checking for the presence of null values by counting the time series that contain at least one null value.

In [None]:
# Each row of `ts300` is one time series, so test for nulls along the row
# (axis=1) and count the rows that contain at least one null value.
# (Summing per column, as before, counted dates with a null instead.)
ts300.isna().any(axis=1).sum()

Dataset contains **No Null values**

Checking the source of all selected timeseries.

In [None]:
# Show the text after the last '.' in each page name.
ts300['Page'].map(lambda page: page.rsplit('.', 1)[-1])

Filtering and selecting one copy of all agents.

In [None]:
# Row labels of the series to keep (picked by hand from the output above;
# presumably one row per access/agent combination — verify against that output).
selected_rows = [48635, 69487, 139186, 116257]
ts300 = ts300.loc[selected_rows]
ts300

# Lineplot of all the agents 

In [None]:
# Transpose so that dates run down the rows and each page becomes one line.
views_by_page = ts300.set_index('Page').T
views_by_page.plot(figsize=(20, 10))

All agents follow nearly the same trend.<br> Selecting the "all_access_all_agents" time series, as it represents traffic from all agents.

In [None]:
# Turn the single row 69487 into a one-column frame indexed by date:
# transpose, name the column, name the index, and drop the 'Page' label row.
df_300 = (
    ts300.loc[[69487]]
    .T
    .rename(columns={69487: '300_movie'})
    .rename_axis("date")
    .drop('Page')
)
# The remaining index labels are date strings; parse them into timestamps.
df_300.index = pd.to_datetime(df_300.index)
# After the transpose the values are not numeric yet; cast them to float.
df_300['300_movie'] = df_300['300_movie'].astype('float')
df_300

# Plotting Timeseries of 300 Movie (all_access_all_agents)

In [None]:
# Line chart of the whole series.
df_300.plot.line(figsize=(10, 5))

# Exploring parts of timeseries

In [None]:
# Plot several date windows of the series.
# Rows are selected with `.loc`: selecting rows by a date string through plain
# `[]` (e.g. df_300['2015']) is treated as a column lookup by recent pandas
# versions and fails there, whereas `.loc` works across versions.
df_300.loc['2015'].plot(figsize=(10,5),title="2015-Jul to 2015-Dec")
df_300.loc['2016-07':].plot(figsize=(10,5),title="2016-Jul to 2016-Dec")
df_300.loc['2016':].plot(figsize=(10,5),title="2016")
df_300.loc['2015':'2016-07'].plot(figsize=(10,5),title="2015-07 to 2016-07")

The 2015 series and the 2015-07 to 2016-07 series both show a lot of variation.<br>
Selecting the 2015-07 to 2016-07 series, as it also covers the 2015 data.

In [None]:
# Working window: start of the data through the end of July 2016.
# `.copy()` detaches the window from `df_300`, so columns added to `df_cut`
# later neither raise SettingWithCopyWarning nor touch the full series.
df_cut = df_300.loc['2015':'2016-07'].copy()

# Analysing Quarterly variation in timeseries

In [None]:
# Average daily views per calendar quarter.
quarterly_mean = df_cut['300_movie'].resample('Q').mean()
quarterly_mean.plot(figsize=(10, 6), title="Quarterly Plot (using mean)")

Average traffic was continuously decreasing in the last two quarters of 2015.<br>
But in the first two quarters of 2016 a continuous increase in traffic was observed.

In [None]:
# Median daily views per calendar quarter (less sensitive to spikes than the mean).
quarterly_median = df_cut['300_movie'].resample('Q').median()
quarterly_median.plot(figsize=(10, 6), title="Quarterly Plot (using median)")

The median tells a different story than the mean: traffic was decreasing in the last two quarters of 2015, but the boost in traffic in the first two quarters of 2016 was not very large; in fact, a decrease can be observed in the traffic of the second quarter of 2016.

In [None]:
# Plot the raw daily values, grouped by quarter, without aggregating them.
# NOTE(review): `.plot()` is called directly on the Resampler object; presumably
# each quarterly group is drawn as its own line — confirm this is supported by
# the installed pandas version.
df_cut['300_movie'].resample('Q').plot(figsize=(20,10), title="Quarterly Plot")

Now, in the bigger picture, we notice two spikes in Q2 and Q3, which might have influenced the mean to show a continuous increase in the first two quarters of 2016.

# Distribution of traffic in the timeseries

In [None]:
# Histogram + density estimate of the daily view counts.
daily_views = df_cut['300_movie']
plt.figure(figsize=(20, 10))
sns.distplot(daily_views)

Most of the daily traffic lies between 0 and 500 views.

# Distribution of monthly traffic

In [None]:
# Add a month-name column used as the grouping key for the monthly plots.
# `assign` builds a new frame and rebinds `df_cut`, so no column is written
# into a slice of `df_300` (avoids SettingWithCopyWarning) and the cell can be
# re-run safely.
df_cut = df_cut.assign(month=df_cut.index.month_name())

# One box per month showing the spread of daily views.
plt.figure(figsize=(10,6))
sns.boxplot(x='month', y='300_movie', data=df_cut)
plt.tight_layout()

Huge outliers can be observed in June and July.

In [None]:
# Bar chart of daily views per month (uses the 'month' column of df_cut).
plt.figure(figsize=(10, 6))
sns.barplot(data=df_cut, x='month', y='300_movie')
plt.tight_layout()

This plot shows a better picture of how the traffic is distributed.<br>
Large variation can be seen in June.

# Checking seasonality by decomposing the time series

In [None]:
# Decompose the same series under both model assumptions so they can be compared.
views = df_cut['300_movie']

# Multiplicative model
result_mul = seasonal_decompose(views, model='multiplicative', extrapolate_trend='freq')

# Additive model
result_add = seasonal_decompose(views, model='additive', extrapolate_trend='freq')

In [None]:
# Draw one figure per decomposition, each with a large title.
plt.rcParams.update({'figure.figsize': (15,10)})
for decomposition, heading in (
    (result_mul, 'Multiplicative Decompose'),
    (result_add, 'Additive Decompose'),
):
    figure = decomposition.plot()
    figure.suptitle(heading, fontsize=22)
    # Leave head-room so the title does not overlap the top panel.
    plt.subplots_adjust(top=0.90)

The additive model performed better than the multiplicative one, as smaller residuals ('Resid') can be seen in the additive model.

In [None]:
# Extract the Components ----
# The additive decomposition is used here, so:
# Actual Values = Sum of (Seasonal + Trend + Resid)
df_reconstructed = pd.concat([result_add.seasonal, result_add.trend, result_add.resid, result_add.observed], axis=1)
df_reconstructed.columns = ['seas', 'trend', 'resid', 'actual_values']
df_reconstructed.head()

# Testing timeseries Stationarity 

In [None]:
# ADF Test (null hypothesis: the series has a unit root, i.e. is non-stationary)
result = adfuller(df_cut['300_movie'].values, autolag='AIC')

print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
# Print the header once, then one line per significance level
# (it used to be repeated inside the loop for every level).
print('Critical Values:')
for key, value in result[4].items():
    print(f'   {key}, {value}')

# KPSS Test (null hypothesis: the series is stationary around a constant)
result = kpss(df_cut['300_movie'].values, regression='c')
print('\nKPSS Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[3].items():
    print(f'   {key}, {value}')

The p-value of the ADF statistic is greater than 0.05, so we fail to reject the null hypothesis that there is a unit root (i.e. the series is non-stationary).

# Detrending the Time Series

In [None]:
from scipy import signal

# Remove the least-squares linear fit from the raw values and plot the remainder.
views_array = df_cut['300_movie'].values
detrended = signal.detrend(views_array)
plt.plot(detrended)
plt.title('300 Movie Traffic detrended by subtracting the least squares fit', fontsize=12)

# Deseasonalizing the Time Series

In [None]:
# Remove the seasonal component estimated by each decomposition.

# Time Series Decomposition
result_mul = seasonal_decompose(df_cut['300_movie'], model='multiplicative', extrapolate_trend='freq')
result_add = seasonal_decompose(df_cut['300_movie'], model='additive', extrapolate_trend='freq')

# Deseasonalize
# Additive model: observed = trend + seasonal + resid, so the seasonal part is
# SUBTRACTED. (Dividing by it, as before, is wrong: the additive seasonal term
# fluctuates around zero, which blows the ratio up.)
deseasonalized_add = df_cut['300_movie'] - result_add.seasonal
# Multiplicative model: observed = trend * seasonal * resid, so divide.
deseasonalized_mul = df_cut['300_movie'] / result_mul.seasonal

# Plot
fig, axs = plt.subplots(2)
axs[0].plot(deseasonalized_mul)
axs[0].set_title('300_movie traffic Deseasonalized (multiplicative model)', fontsize=16)
axs[1].plot(deseasonalized_add)
axs[1].set_title('300_movie traffic Deseasonalized (additive model) ', fontsize=16)

# Testing for Seasonality

In [None]:
from pandas.plotting import autocorrelation_plot

# Autocorrelation of the daily views at every lag.
plt.rcParams.update({'figure.figsize':(9,5), 'figure.dpi':120})
view_values = df_cut['300_movie'].tolist()
autocorrelation_plot(view_values)

In [None]:
# ACF (left) and PACF (right) of the daily views for the first 50 lags.
view_values = df_cut['300_movie'].tolist()
fig, axes = plt.subplots(1,2,figsize=(16,3), dpi= 100)
acf_axis, pacf_axis = axes
plot_acf(view_values, lags=50, ax=acf_axis)
plot_pacf(view_values, lags=50, ax=pacf_axis)

# Forecasting with Holt Winters

In [None]:
# Preview the 90% split position of the frame that is actually split in the
# next cell (`df_cut`, not the full `df_300`).
int(df_cut.shape[0]*0.90)

In [None]:
split = int(df_cut.shape[0]*0.90) #gives 495
Train = df_cut[:split]
Test = df_cut[split-1:]

In [None]:
y_hat_avg = Test.copy()
fit1 = Holt(np.asarray(Train['300_movie'])).fit()
y_hat_avg['Holt_Winter'] = fit1.predict(start=split, end=df_cut.shape[0])

In [None]:
# Overlay the training data, the held-out data and the Holt forecast.
plt.figure(figsize=(16,8))
for frame, column, label in (
    (Train, '300_movie', 'Train'),
    (Test, '300_movie', 'Test'),
    (y_hat_avg, 'Holt_Winter', 'Holt_Winter'),
):
    plt.plot(frame.index, frame[column], label=label)
plt.legend(loc='best')

# Holt Winter with Exponential Smoothing

In [None]:
# Declare the index frequency and split the first column positionally at `split`.
df_cut.index.freq = 'D' # 'D' is the daily frequency alias (one observation per day)
train, test = df_cut.iloc[:split, 0], df_cut.iloc[split:, 0]

In [None]:
# Holt-Winters with additive trend (damped) and additive seasonality.
# The series is daily, so the seasonal cycle is set to 7 observations (one
# week); the previous value of 12 is the convention for monthly data.
# NOTE(review): a weekly cycle is assumed for daily page views — confirm
# against the ACF plot above.
model = ExponentialSmoothing(train, trend='add', seasonal='add', seasonal_periods=7, damped=True)
hw_model = model.fit(optimized=True, use_boxcox=False, remove_bias=False)
# Predict over exactly the date range covered by the test series.
pred = hw_model.predict(start=test.index[0], end=test.index[-1])


In [None]:
# Training data, held-out data and the Holt-Winters prediction on one chart.
for series, label in ((train, 'Train'), (test, 'Test'), (pred, 'Holt-Winters')):
    plt.plot(series.index, series, label=label)
plt.legend(loc='best');

# Forecasting with ARIMA

In [None]:
# Chronological 90/10 split of the view counts.
split = int(0.90 * df_cut.shape[0])
train = df_cut['300_movie'].iloc[:split]
test = df_cut['300_movie'].iloc[split:]
# Plain list of the training values; it is extended as new observations arrive.
movie_300 = list(train)

In [None]:
# Walk-forward validation: refit ARIMA(5,1,4) on all values seen so far,
# forecast one step ahead, then append the true observation to the history.
# NOTE(review): `fit(disp=0)` and indexing the `forecast()` output with [0]
# follow the legacy `statsmodels.tsa.arima_model` API imported at the top —
# confirm the installed statsmodels version still provides it.
predictions = list()
# Iterate over the test values directly: `test[t]` did an integer-position
# lookup on a date-indexed Series, which recent pandas versions deprecate.
for obs in test:
    model = ARIMA(movie_300, order=(5,1,4))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    movie_300.append(obs)


In [None]:
# Mean squared error between the held-out values and the one-step forecasts.
error = mean_squared_error(test, predictions)
print(f'Test MSE: {error:.3f}')

In [None]:
# Actual test values (default colour) against the ARIMA forecasts (red).
actual_values = test.values
plt.plot(actual_values)
plt.plot(predictions, color='red')

# Holt winter on full data set (300 Movie)

In [None]:
split = int(df_300.shape[0]*0.90) #gives 495
Train = df_300[:split]
Test = df_300[split-1:]

In [None]:
y_hat_avg = Test.copy()
fit1 = Holt(np.asarray(Train['300_movie'])).fit()
y_hat_avg['Holt_Winter'] = fit1.predict(start=split, end=df_300.shape[0])

In [None]:
# Overlay the training data, the held-out data and the Holt forecast.
plt.figure(figsize=(16,8))
for frame, column, label in (
    (Train, '300_movie', 'Train'),
    (Test, '300_movie', 'Test'),
    (y_hat_avg, 'Holt_Winter', 'Holt_Winter'),
):
    plt.plot(frame.index, frame[column], label=label)
plt.legend(loc='best')

## Holt Winters with Exponential Smoothing

In [None]:
# Mark the index as daily, then split the first column 90/10 in time order.
df_300.index.freq = 'D'
split = int(0.90 * df_300.shape[0])
train = df_300.iloc[:split, 0]
test = df_300.iloc[split:, 0]

In [None]:
# Holt-Winters with additive trend (damped) and additive seasonality.
# The series is daily, so the seasonal cycle is set to 7 observations (one
# week); the previous value of 12 is the convention for monthly data.
# NOTE(review): a weekly cycle is assumed for daily page views — confirm
# against the ACF plot of the series.
model = ExponentialSmoothing(train, trend='add', seasonal='add', seasonal_periods=7, damped=True)
hw_model = model.fit(optimized=True, use_boxcox=False, remove_bias=False)
# Predict over exactly the date range covered by the test series.
pred = hw_model.predict(start=test.index[0], end=test.index[-1])

In [None]:
# Training data, held-out data and the Holt-Winters prediction on one chart.
for series, label in ((train, 'Train'), (test, 'Test'), (pred, 'Holt-Winters')):
    plt.plot(series.index, series, label=label)
plt.legend(loc='best');