In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found there, one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

**Handling missing values:**
We noticed that there were a lot of missing values, but those missing (NA) values were located at the beginning of the individual time series. This meant that the web page was added to the domain after the given date, so it was best to replace them with 0.


In [None]:
import pandas as pd
import numpy as np
# Load the training frame and replace every missing value with 0.
# fillna(0) already removes all NaNs, so a second replace(np.nan, 0) pass
# over the whole frame would be a no-op and is not needed.
train = pd.read_csv("/kaggle/input/web-traffic-time-series-forecasting/train_1.csv").fillna(0)

In [None]:
from matplotlib import rcParams
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
# Default size for every figure created from here on.
rcParams['figure.figsize'] = (18, 8)

# Series used as the running example: the row labelled 1, without its first
# column (that column is not part of the numeric series).
y = train.loc[1].iloc[1:]

# Plot the example series to get an idea of what the data looks like.
fig, ax = plt.subplots()
ax.plot(y)
ax.set_xlabel('Date-Time', fontsize=10)
ax.set_ylabel('Traffic', fontsize=10)
ax.set_title('Web Traffic- Original data')
plt.show()

In [None]:
# First- and second-order differencing of the example series (used to push
# the series towards stationarity); each version is drawn in its own figure.
first_order = y.diff()
second_order = first_order.diff()

panels = (
    (y, 'Web Traffic- Original data'),
    (first_order, 'Web Traffic- First Order difference'),
    (second_order, 'Web Traffic- Second Order difference'),
)
for values, title in panels:
    plt.plot(values)
    plt.xlabel('Date-Time', fontsize=10)
    plt.ylabel('Traffic', fontsize=10)
    plt.title(title)
    plt.show()

Time series data can exhibit a variety of patterns, and it is often helpful to split a time series into several components, each representing an underlying pattern category.

There are three types of time series patterns: *trend, seasonality and cycles*. When we decompose a time series into components, we usually combine the trend and cycle into a single trend-cycle component (sometimes called the trend for simplicity). Thus we think of a time series as comprising three components: a trend-cycle component, a seasonal component, and a remainder component (containing anything else in the time series).

In [None]:
import statsmodels.api as sm
# Decompose the example series with the Hodrick-Prescott filter, which
# returns the pair (cycle, trend); 50 is passed as its second argument.
series = y
cycle, trend = sm.tsa.filters.hpfilter(series, 50)

# Three stacked panels: the raw series, its trend and its cycle component.
fig, ax = plt.subplots(3,1)
panel_contents = [(series, 'Actual'), (trend, 'Trend'), (cycle, 'Cycle')]
for axis, (values, title) in zip(ax, panel_contents):
    axis.plot(values)
    axis.set_title(title)
plt.show()

In [None]:
# Rebuild the example series as a one-column float frame (column 0) indexed
# by real timestamps, which is the form the statsmodels calls below receive.
# The values are taken in one vectorised step: looking elements up with
# y[i] relies on integer keys being treated as positions on a
# string-labelled Series, a fallback that recent pandas versions deprecate.
ind = pd.to_datetime(y.index)
arr = pd.DataFrame(y.to_numpy(dtype=float), index=ind)

# Classical seasonal decomposition of the series, plotted as returned by
# statsmodels (the trailing ';' hides the figure repr in the notebook).
decomposition = sm.tsa.seasonal_decompose(arr)
decomposition.plot();

**ACF** is the (complete) auto-correlation function, which gives us the auto-correlation of a series with its lagged values. We plot these values along with the confidence band and tada! We have an ACF plot. In simple terms, it describes how strongly the present value of the series is related to its past values. A time series can have components like trend, seasonality, cyclic and residual. ACF considers all these components while finding correlations, hence it’s a ‘complete auto-correlation plot’.

**PACF** is the partial auto-correlation function. Instead of finding correlations of the present value with its lags like ACF, it finds the correlation of the residuals (which remain after removing the effects already explained by the earlier lag(s)) with the next lag value, hence ‘partial’ and not ‘complete’: we remove the variation that has already been explained before we compute the next correlation. So if there is any hidden information in the residual which can be modeled by the next lag, we might get a good correlation, and we will keep that next lag as a feature while modeling. Remember that while modeling we don’t want to keep too many features which are correlated, as that can create multicollinearity issues. Hence we need to retain only the relevant features.

In [None]:
#acf and pacf
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
# Draw the ACF and the PACF of the date-indexed example series.
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(arr)

# Raw auto-correlation and partial auto-correlation values of the series.
stattools = sm.tsa.stattools
acf_values = stattools.acf(y)
pacf_values = stattools.pacf(y)

What are the types of the website?
1. Based On LANGUAGE
2. Based On ACCESS 
3. Based On TYPE

In [None]:
# Distribute the rows by language.
# lang_dict maps a two-letter language code to the list of rows (one pandas
# Series per page) whose page name contains "<code>.wikipedia".
lang_dict = dict()
for _, row in train.iterrows():
    page = row['Page']
    marker = page.find(".wikipedia")
    if marker >= 2:
        # The language code is the two characters right before ".wikipedia".
        lang = page[marker - 2:marker]
    else:
        # str.find returns -1 when the marker is absent; slicing with that
        # offset would pick two arbitrary characters from the END of the page
        # name (e.g. "de" out of a trailing "spider") and file the page under
        # a wrong language. Such pages (presumably non-Wikipedia projects --
        # TODO confirm against the data) are grouped under "na" instead.
        lang = "na"
    lang_dict.setdefault(lang, []).append(row)

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

# One entry per language: [language, total hits of page 1, total hits of page 2, ...].
# The first element of every row is skipped because it is not a hit count.
data_lang = [
    [lang] + [sum(page_row[1:]) for page_row in page_rows]
    for lang, page_rows in lang_dict.items()
]

# Summary per language: [language, total hits, number of pages, mean, std].
stats_lang = []
for lang, *page_totals in data_lang:
    stats_lang.append([
        lang,
        sum(page_totals),
        len(page_totals),
        np.mean(page_totals),
        np.std(page_totals),
    ])
stats_lang = pd.DataFrame(stats_lang[:8])  # keep at most the first 8 languages

# Bar chart of the mean hits per page (column 3), labelled by language (column 0).
index = np.arange(len(stats_lang))
plt.bar(index, stats_lang[3])
plt.xlabel('Language', fontsize=10)
plt.ylabel('Mean of web hits', fontsize=10)
plt.xticks(index, stats_lang[0], fontsize=10, rotation=30)
plt.title('Web Traffic mean based on language')
plt.show()

In [None]:
# Distribute the rows by agent type and by access type.
# Each dictionary maps the marker searched for in the page name to the list
# of rows (one pandas Series per page) whose name contains that marker.
# The "all-access" bucket is keyed by its own marker, like every other
# bucket, so the per-access chart built from these keys is labelled correctly.
type_dict = {"all-agent":list(),"spider":list()}
access_dict = {"all-access":list(),"desktop":list(),"mobile-web":list()}
for _, row in train.iterrows():
    page = row["Page"]
    # Only the text from ".wikipedia" onwards is searched, so a marker that
    # happens to appear in the article title is ignored. When ".wikipedia"
    # is absent, find() returns -1 and the suffix is just the last character,
    # so such pages end up in no bucket.
    suffix = page[page.find(".wikipedia"):]
    for access_x in access_dict:
        if access_x in suffix:
            access_dict[access_x].append(row)
    for type_x in type_dict:
        if type_x in suffix:
            type_dict[type_x].append(row)

In [None]:
import pandas as pd

# One entry per agent type: [type, total hits of page 1, total hits of page 2, ...].
# The first element of every row is skipped because it is not a hit count.
data_type = [
    [type_x] + [sum(page_row[1:]) for page_row in page_rows]
    for type_x, page_rows in type_dict.items()
]

# Summary per agent type: [type, total hits, number of pages, mean, std].
stats_type = []
for type_x, *page_totals in data_type:
    stats_type.append([
        type_x,
        sum(page_totals),
        len(page_totals),
        np.mean(page_totals),
        np.std(page_totals),
    ])
stats_type = pd.DataFrame(stats_type[:8])  # keep at most the first 8 entries

# Bar chart of the mean hits per page (column 3), labelled by type (column 0).
index = np.arange(len(stats_type))
plt.bar(index, stats_type[3])
plt.xlabel('Type', fontsize=10)
plt.ylabel('Mean of web hits', fontsize=10)
plt.xticks(index, stats_type[0], fontsize=10, rotation=30)
plt.title('Web Traffic mean based on Type')
plt.show()

In [None]:
import pandas as pd

# One entry per access type: [access, total hits of page 1, total hits of page 2, ...].
# The first element of every row is skipped because it is not a hit count.
data_access = [
    [access_x] + [sum(page_row[1:]) for page_row in page_rows]
    for access_x, page_rows in access_dict.items()
]

# Summary per access type: [access, total hits, number of pages, mean, std].
stats_access = []
for access_x, *page_totals in data_access:
    stats_access.append([
        access_x,
        sum(page_totals),
        len(page_totals),
        np.mean(page_totals),
        np.std(page_totals),
    ])
stats_access = pd.DataFrame(stats_access[:8])  # keep at most the first 8 entries

# Bar chart of the mean hits per page (column 3), labelled by access (column 0).
index = np.arange(len(stats_access))
plt.bar(index, stats_access[3])
plt.xlabel('access', fontsize=10)
plt.ylabel('Mean of web hits', fontsize=10)
plt.xticks(index, stats_access[0], fontsize=10, rotation=30)
plt.title('Web Traffic mean based on access')
plt.show()

**Loading the required modules**

In [None]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from math import sqrt
import itertools
import statsmodels.api as sm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor , RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from matplotlib import rcParams
# Keep 10 randomly chosen series for the model comparison below.
# The draw is seeded so that the same pages -- and therefore the same plots
# and RMSE values -- come out of every "Restart & Run All".
train = train.sample(10, random_state=42)

**Split the Data**
Here the data is split based on Look Back n steps for Prediction of next value for regression based models.

In [None]:
def split_sequence(sequence, n_steps):
    """Turn a 1-D sequence into supervised (look-back window, next value) pairs.

    Parameters
    ----------
    sequence : list, numpy array or pandas Series
        Observations in time order.
    n_steps : int
        Number of consecutive observations in each input window.

    Returns
    -------
    (X, Y) : tuple of numpy arrays
        X[k] holds sequence[k : k + n_steps] and Y[k] is the observation that
        immediately follows that window. Both arrays are empty when the
        sequence has no more than n_steps observations.
    """
    # Work on a plain array so that every access below is positional. On a
    # pandas Series, sequence[end_ix] is a LABEL lookup, which raises KeyError
    # for integer-labelled series and relies on a deprecated positional
    # fallback for date- or string-labelled ones.
    values = np.asarray(sequence)
    X, Y = list(), list()
    for start in range(len(values) - n_steps):
        end_ix = start + n_steps
        X.append(values[start:end_ix])
        Y.append(values[end_ix])
    return np.array(X),np.array(Y)

**LSTM Model Template**

In [None]:
def LSTM_MODEL(n,n_steps):
    """Build and compile a stacked LSTM regressor for a univariate series.

    The network consists of an input LSTM layer, `n` further
    sequence-returning LSTM layers, a final LSTM layer and a single-unit
    Dense output -- i.e. n + 2 LSTM layers in total, each with 50 units and
    ReLU activation.

    Parameters
    ----------
    n : int
        Number of intermediate LSTM layers placed between the input LSTM
        layer and the final LSTM layer.
    n_steps : int
        Length of the look-back window; inputs have shape (n_steps, 1).

    Returns
    -------
    The Sequential model, compiled with the Adam optimizer and MSE loss.
    """
    units = 50
    n_features = 1

    # Layers are created in the exact order in which they are stacked.
    layers = [LSTM(units, activation='relu', input_shape=(n_steps, n_features), return_sequences=True)]
    layers += [LSTM(units, activation='relu', return_sequences=True) for _ in range(n)]
    layers += [LSTM(units, activation='relu'), Dense(1)]

    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(optimizer='adam', loss='mse')
    return model

**Proposed Idea:**
Multiple learners can be used to predict time-series data. This is called Ensemble Learning. Ensemble learning is the process by which multiple models, such as classifiers or experts, are strategically generated and combined to solve a particular computational intelligence problem. Ensemble learning is primarily used to improve the performance of a model, or reduce the likelihood of an unfortunate selection of a poor one. This can be used to provide better prediction for time-series data. 
**Models Implemented:**
1. SARIMAX: We made the code for the SARIMAX model and fine-tuned these parameters and found that p=1, d=0, q=2, P=1, D=1, Q=2 and m=12 gave the least value of RMSE for a general model for our entire training set.
2. RNN and LSTM based Models: Here in our case, we made RNN and LSTM models to predict Time Series and as a first step we made a single layered(vanilla RNN) with 7 day look back which gave good results but did not work well as general model. Then we fine-tuned the parameters, i.e. number of layers and look back time and found that 4 layered with a 30 day look back time gave the best results as a general model.
3. Adaboost Regressor: For our training data, we got best results with Decision Tree Regressor with max depth 4 as base estimator, learning rate as 0.01, n estimators as 5000 and random state as 42.
4. Gradient Boost Regressor: For our training data, we got best results with n estimators as 500, learning rate as 0.1, max depth=4 and loss='ls'.
5. Random Forest Regressor: The tunable parameters which gave the best results for our data were max depth as 4 and n estimators as 500.

All these models are compared based on the RMSE value for the test data that is predicting data for the next whole year.

In [None]:
# Compare five forecasting models on every sampled series.
# For each series the last TEST_DAYS observations are held out; every model
# is fitted on the earlier part and scored by RMSE on the same target days.
# The per-series RMSE rows are collected in `list_models`; `label_list`
# names the panels of each figure (its tail names the RMSE columns).

TEST_DAYS = 365   # trailing observations held out for testing
LOOK_BACK = 30    # number of past observations fed to the regression models

# The style sheet is applied once, BEFORE the figure size is set, so that the
# requested size is not overridden by any size the style sheet defines.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names, hence the lookup.
darkgrid_style = 'seaborn-darkgrid' if 'seaborn-darkgrid' in plt.style.available else 'seaborn-v0_8-darkgrid'
plt.style.use(darkgrid_style)
palette = plt.get_cmap('Dark2')
rcParams['figure.figsize'] = 18,8

list_models = []
label_list = ["Original Time Series","Adaboost","Gradient Boost","Random Forest","RNN and LSTM","SARIMAX"]

regr_1 = DecisionTreeRegressor(max_depth=4)
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional parameter is the base learner in both.
adaboostSVC = AdaBoostRegressor(regr_1, n_estimators = 500, random_state = 42, learning_rate=0.01)
# `loss` is left at its default, which is the squared-error loss; the literal
# 'ls' that used to select it is rejected by newer scikit-learn releases.
est = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=4, random_state=0)
regressor = RandomForestRegressor(max_depth=4 , random_state=0, n_estimators=500)
p,d,q = 1,0,2


def _rmse(actual, predicted):
    """Root mean squared error between two equally long sequences."""
    return sqrt(mean_squared_error(actual, predicted))


for time_series in train.index:

    # Extract the time series: the first column is the page name, the
    # remaining columns are the observations labelled by date.
    error_list = []
    list_of_model_pred = []
    row = train.loc[time_series]
    page_name = row.iloc[0]
    print("For Time series:", time_series)
    print(page_name)
    y = row.iloc[1:]
    ind = pd.to_datetime(y.index)
    arr = pd.DataFrame(y.to_numpy(dtype=float), index=ind)

    # Split the dataset into training and testing data. Plain arrays are
    # passed so that the windows are cut purely by position.
    values = arr[0].to_numpy()
    test_X,test_Y = split_sequence(values[-TEST_DAYS:],LOOK_BACK)
    train_X,train_Y = split_sequence(values[0:-TEST_DAYS],LOOK_BACK)
    # Days that the targets in test_Y belong to (the held-out period minus
    # the first LOOK_BACK days, which only serve as model input).
    target_dates = ind[-len(test_Y):]
    list_of_model_pred.append(test_Y)

    # Tree-based regressors (Adaboost, Gradient Boosting, Random Forest)
    tree_models = (
        ("Adaboost", adaboostSVC),
        ("Gradient Boost", est),
        ("Random Forest Regressor", regressor),
    )
    for model_label, model in tree_models:
        model.fit(train_X, train_Y)
        pred_Y = model.predict(test_X)
        rmse = _rmse(test_Y, pred_Y)
        error_list.append(rmse)
        print(model_label + " Done with error: ",rmse)
        list_of_model_pred.append(pred_Y)

    # RNN and LSTM Model: a fresh network is trained for every series.
    n_features = 1
    model = LSTM_MODEL(4,LOOK_BACK)
    train_X1 = train_X.reshape((train_X.shape[0], train_X.shape[1], n_features))
    model.fit(train_X1, train_Y, epochs=200, verbose=0)
    test_X1 = test_X.reshape((test_X.shape[0], test_X.shape[1], n_features))
    # The network outputs one column; flatten it to match the other models.
    pred_Y = model.predict(test_X1, verbose=0).ravel()
    rmse = _rmse(test_Y, pred_Y)
    error_list.append(rmse)
    print("RNN and LSTM Done with error: ",rmse)
    list_of_model_pred.append(pred_Y)

    # Sarimax Model: fitted on everything before the held-out period and
    # asked to forecast the whole held-out period in one go.
    mod = sm.tsa.statespace.SARIMAX(arr[:-TEST_DAYS],
                                order=(p, d, q),
                                seasonal_order=(1,1,2, 12),
                                enforce_stationarity=True,
                                enforce_invertibility=False)
    results = mod.fit()
    pred = results.get_forecast(steps=TEST_DAYS)
    # Keep the forecasts for exactly the days the other models are scored on,
    # so that the RMSE values are comparable and no in-sample day is included.
    sarimax_pred = np.asarray(pred.predicted_mean)[-len(test_Y):]
    rmse = _rmse(test_Y, sarimax_pred)
    print("SARIMAX Done with error: ",rmse)
    # Store the SARIMAX forecast itself (not the previous model's prediction)
    # so that the SARIMAX panel shows the right curve.
    list_of_model_pred.append(sarimax_pred)
    error_list.append(rmse)

    # Plot predicted vs Original for all Models: one panel per entry of
    # label_list, with the original series drawn in grey behind each model.
    fig, axes = plt.subplots(3, 2)
    for i, (ax, series) in enumerate(zip(axes.ravel(), list_of_model_pred)):
        if(i!=0):
            ax.plot(target_dates, list_of_model_pred[0], marker='', color='grey', linewidth=0.6, alpha=0.3)
        ax.set_title(label_list[i], loc='left', fontsize=12, fontweight=0, color=palette(i))
        ax.plot(target_dates, series, marker='', color=palette(i), linewidth=2.4, alpha=0.9, label=label_list[i])
        ax.set_xlabel('Date', fontsize=10)
        ax.set_ylabel('Number of web hits', fontsize=10)
    fig.suptitle(page_name, fontsize=13, fontweight=0, color='black', style='italic', y=1.02)
    fig.tight_layout()
    list_models.append(error_list)
    plt.show()
    plt.close(fig)  # release the figure before the next series is processed

**Evaluation**

Error Comparison of different Models Based on RMSE value over the next year data.

In [None]:
rcParams['figure.figsize'] = 18,8

# Turn the collected per-series RMSE rows into a table: one row per sampled
# time series, one column per model (label_list without its first entry,
# which names the original series rather than a model).
list_models = pd.DataFrame(list_models)
list_models.index = ["time series: "+str(i) for i in train.index]
list_models.columns = list(label_list[1:])

# A bare `list_models.head(10)` in the middle of a cell is evaluated and
# thrown away; display() is what actually renders the table.
display(list_models.head(10))

# RMSE of every model across the series, as lines ...
list_models.plot.line()
plt.title("ERROR")
plt.show()

# ... and as one box per model.
list_models.boxplot()
plt.title("ERROR")
plt.show()