# Web traffic time series forecast

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.style as style
style.use('fivethirtyeight')
sns.set_style('whitegrid')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training table (a 'Page' column plus one column per date label).
# `parse_dates=True` was removed: as a bare boolean it only asks pandas to try
# parsing the index, and no index column is set here, so it did nothing.
# The date labels are converted explicitly with pd.to_datetime after melting.
train1 = pd.read_csv("/kaggle/input/web-traffic-time-series-forecasting/train_1.csv.zip")

In [None]:
# Preview the first rows of the raw table.
train1.head()

In [None]:
# Count the missing values in each column.
train1.isna().sum()

## Data Visualization

In [None]:
# Reshape from wide to long: one row per (Page, Date label) with its visit count.
train_pivot = pd.melt(
    train1,
    id_vars='Page',
    var_name='Date',
    value_name='Visits',
)

In [None]:
# After melting, 'Date' holds the former column labels; parse them into
# datetimes so the `.dt` accessors and date-ordered plots work.
train_pivot['Date'] = pd.to_datetime(train_pivot['Date'])

In [None]:

# Add calendar components of 'Date' as their own columns for grouped plots.
for component in ('Year', 'Month', 'Day'):
    train_pivot[component] = getattr(train_pivot['Date'].dt, component.lower())

In [None]:
def visualize_visits(df, groupby, figsize, title, **kwargs):
    """Plot the mean of ``Visits`` for each distinct value of ``groupby``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Visits'`` column and the column named by
        ``groupby``. No other columns are required.
    groupby : str
        Name of the column to group on; its values form the x-axis.
    figsize : tuple
        Figure size forwarded to ``plt.figure``.
    title : str
        Title of the plot.
    **kwargs
        Extra keyword arguments forwarded to ``plt.plot`` (for example ``color``).

    Returns
    -------
    None
        A new figure is created and shown.
    """
    # Group directly on the frame: selecting extra columns first (previously
    # including 'Page') only copied data that the aggregation never used.
    mean_visits = df.groupby(groupby)['Visits'].mean()

    plt.figure(figsize=figsize)
    # Plotting a Series uses its index (the group values) as x.
    plt.plot(mean_visits, **kwargs)
    plt.title(title)
    plt.xlabel(groupby)
    plt.ylabel('Mean visits')
    plt.show()

In [None]:
# Mean visits per date across all pages.
visualize_visits(train_pivot, groupby='Date', figsize=(50, 10), title='Visits by Date')

In [None]:
# Mean visits per calendar month (1-12), pooled over all years in the data.
visualize_visits(train_pivot, groupby='Month', figsize=(20, 6), title='Visits by Month')

In [None]:
# Mean visits per day of the month, pooled over all months and years.
visualize_visits(train_pivot, groupby='Day', figsize=(20, 6), title='Visits by Day')

## Pages with the most visits

In [None]:
# Total visits per page over the whole period, largest first.
page_visits = (
    train_pivot
    .groupby('Page')['Visits']
    .sum()
    .sort_values(ascending=False)
)

# Same data as a two-column frame ('Page', 'Visits') with a fresh integer index.
page_visit = page_visits.reset_index()

In [None]:
# Take the leading part of each page identifier, i.e. everything before its
# last two '.'-separated fields. Splitting from the right with maxsplit=2 keeps
# a title that itself contains '.' intact; a plain split('.') followed by [-3]
# kept only the text after the title's last dot.
# Identifiers with fewer than two dots still raise IndexError, as before.
name = [page.rsplit('.', 2)[-3] for page in page_visit['Page']]

In [None]:
# Attach the extracted name, then take its last two characters as the language code.
page_visit = page_visit.assign(
    Name=name,
    Language=lambda frame: frame['Name'].str[-2:],
)

In [None]:
# Horizontal bar chart of the `top` most-visited pages
# (page_visit is already sorted by visits, descending).
top = 20
top_visit = page_visit.head(top)

fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=top_visit, x='Visits', y='Page', ax=ax);

## Visualize Access agent, Language and project

In [None]:
# Split each page identifier into its last three '.'-separated fields.
# rsplit with maxsplit=2 splits from the right, so dots inside the leading
# part (the title) no longer shift or truncate the extracted name.
page_parts = [page.rsplit('.', 2) for page in train1['Page']]

# Index from the end, as before, so an identifier with fewer than two dots
# still raises IndexError instead of being silently misaligned.
name = [parts[-3] for parts in page_parts]
project = [parts[-2] for parts in page_parts]
access_agent = [parts[-1] for parts in page_parts]

In [None]:
# Attach the parsed fields to train1.
# NOTE: train1 is modified in place. The earlier melt cell uses only 'Page' as
# id_vars, so re-running it after this cell would melt these helper columns
# as if they were date columns.
parsed_fields = {
    'Name': name,
    'Project': project,
    'access_agent': access_agent,
}
for column, values in parsed_fields.items():
    train1[column] = values

# Language code = last two characters of the name field.
train1['Language'] = train1['Name'].str[-2:]

In [None]:
# Number of pages per project. Pass the Series as `x=`: positional use is
# deprecated, and newer seaborn treats the first positional argument as `data`.
sns.countplot(x=train1['Project']);

In [None]:
# Number of pages per access/agent field. Pass the Series as `x=`: positional
# use is deprecated, and newer seaborn treats the first positional argument as `data`.
plt.figure(figsize=(10, 5))
sns.countplot(x=train1['access_agent']);

In [None]:
# Number of pages per language code. Pass the Series as `x=`: positional use
# is deprecated, and newer seaborn treats the first positional argument as `data`.
sns.countplot(x=train1['Language']);

## Visualize the visits by language

In [None]:
# Helper columns added to train1 that must be dropped before melting, so that
# only 'Page' and the date columns remain.
remove_col = ['Language', 'Name', 'Project', 'access_agent']

# Language codes to plot (the two-character suffix stored in train1['Language']).
languages = ['es', 'zh', 'fr', 'en', 'ns', 'ru', 'ww', 'de', 'ja']

# One line colour per language, matched by position.
color = ['b', 'r', 'c', 'g', 'm', 'y', 'k', 'teal', 'lime']
assert len(color) >= len(languages), "need one colour per language"

# Pair each language with its colour directly instead of a hand-maintained index.
for lang, line_color in zip(languages, color):

    # Rows for this language, without the helper columns
    lang_wide = train1[train1['Language'] == lang].drop(columns=remove_col)

    # Reshape to long format and parse the date labels
    pivot = lang_wide.melt(id_vars='Page', var_name='Date', value_name='Visits')
    pivot['Date'] = pd.to_datetime(pivot['Date'])

    # Mean visits per date for this language
    visualize_visits(
        pivot,
        'Date',
        (50, 8),
        title=f'Visits by Date, Language: {lang}',
        color=line_color,
    )

In [None]:
# Plot the 10 most visited pages according to languages
plt.figure(figsize=(10, 45))

# Top pages to show
top = 10
c = 1

for lang in languages:
    
    # create df
    df_lang = train1[train1['Language'] == lang]
    
    # Drop the columns which are not required
    pv = df_lang.drop(remove_col, axis=1)
    
    # Pivot the data for visualization
    pivot = pv.melt(id_vars='Page', var_name='Date', value_name='Visits')
    pivot['Date'] = pd.to_datetime(pivot['Date'])
    
    # Group the page and sum their visits
    visit_lang = pivot[['Page', 'Visits']].groupby('Page')['Visits'].sum().sort_values(ascending=False)
    visit_lang_df = pd.DataFrame({'Page':visit_lang.index, 'Visits':list(visit_lang)})
    
    # Plot the top visits
    top_visit = visit_lang_df.iloc[:top]
    plt.subplot(9, 1, c)
    title = f'Top {top} visits, Language: {lang}'
    sns.barplot(data=top_visit, y='Page', x='Visits').set_title(title);
    c += 1

## Visualize the sudden increase in visits

* The first figure shows a sudden increase in the number of visits between 2016-07 and 2016-09; a similar increase also appears in the per-language figures for *en* and *ru*.


In [None]:
# Zoom in on the spike for the two languages that show it.
# visualize_visits opens its own figure, so no figure is created here; the
# earlier plt.figure(...) call only produced an empty, unused canvas.
window_start = pd.Timestamp('2016-07-01')
window_end = pd.Timestamp('2016-09-01')

for lang in ['en', 'ru']:

    # Rows for this language, without the helper columns
    lang_wide = train1[train1['Language'] == lang].drop(columns=remove_col)

    # Long format; parse the dates BEFORE filtering so the comparison is made
    # on real datetimes and the column is not assigned on a filtered slice.
    pivot = lang_wide.melt(id_vars='Page', var_name='Date', value_name='Visits')
    pivot['Date'] = pd.to_datetime(pivot['Date'])

    # Both bounds are exclusive, as in the original filter.
    in_window = (pivot['Date'] > window_start) & (pivot['Date'] < window_end)

    visualize_visits(pivot[in_window], 'Date', (50, 8), title=f'Visits by Date, Language: {lang}')

* Visits were high for one month, from 2016-07-15 to 2016-08-15, so the next cell visualizes the most-visited pages during this interval.

In [None]:
# Plot the 10 most visited pages according to languages
plt.figure(figsize=(10, 15))

# Top pages to show
top = 20
c = 1

for lang in ['en', 'ru']:
    
    # create df
    df_lang = train1[train1['Language'] == lang]
    
    # Drop the columns which are not required
    pv = df_lang.drop(remove_col, axis=1)
    
    # Pivot the data for visualization
    pivot = pv.melt(id_vars='Page', var_name='Date', value_name='Visits')
    pivot = pivot[(pivot['Date'] > '2016-07-15') & (pivot['Date'] < '2016-08-15')]
    pivot['Date'] = pd.to_datetime(pivot['Date'])
    
    # Group the page and sum their visits
    visit_lang = pivot[['Page', 'Visits']].groupby('Page')['Visits'].sum().sort_values(ascending=False)
    visit_lang_df = pd.DataFrame({'Page':visit_lang.index, 'Visits':list(visit_lang)})
    
    # Plot the top visits
    top_visit = visit_lang_df.iloc[:top]
    plt.subplot(2, 1, c)
    title = f'Top {top} visits, Language: {lang}'
    sns.barplot(data=top_visit, y='Page', x='Visits').set_title(title);
    c += 1

* After translating the Russian page titles into English, we find that visits were high because of the Summer Olympics and the movie *Suicide Squad*. Apart from the home page, these two pages were common to both languages, which explains the sudden increase in visits.

## Autocorrelation

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

In [None]:
# Mean visits per date across all pages. Despite its name, `acf` is the input
# series for the correlation plots, not the autocorrelation values themselves.
acf = train_pivot.groupby('Date')['Visits'].mean()

In [None]:
# plot_acf returns the Figure it draws on. Without the trailing ';' that return
# value is also rendered as the cell output, so the plot appears twice.
plot_acf(acf, lags=20);

## Partial autocorrelation

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf

In [None]:
# plot_pacf returns the Figure it draws on. Without the trailing ';' that return
# value is also rendered as the cell output, so the plot appears twice.
plot_pacf(acf, lags=20);

Observations:
* ACF plot - The autocorrelation decays geometrically as the lag increases.
* PACF plot - The partial autocorrelation drops off after lag 2.