In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the zipped training CSV directly from the competition input folder.
df = pd.read_csv(
    "/kaggle/input/web-traffic-time-series-forecasting/train_1.csv.zip",
    compression="zip",
)
df.head()

## Transposing rows and columns to get the data into a proper time-series format

In [None]:
# Swap the axes: rows become columns and columns become rows.
# NOTE: this cell is not idempotent - running it a second time swaps the axes back.
df = df.transpose()
df.head()

In [None]:
# Move the current index into an ordinary column and renumber the rows from 0.
df = df.reset_index(drop=False)
df.head()

In [None]:
# Use the values of the first row as the column labels.
column_header = df.iloc[0].to_numpy()
df.columns = column_header

In [None]:
df.head()

In [None]:
# Remove the row labelled 0; its values have already been copied into the column labels.
df = df.drop(index=0)

In [None]:
df.head()

In [None]:
# Give the "Page" column the name "Date"; every other column label is left untouched.
df = df.rename({"Page": "Date"}, axis="columns")

df.head()

In [None]:
# Check data type of date column
print(df["Date"].dtype)

In [None]:
# Parse the Date column into pandas datetime values.
df["Date"] = df["Date"].pipe(pd.to_datetime)

In [None]:
# Check Date column datatype again
print(df["Date"].dtype)

In [None]:
# Set Date column as index and store the view counts as real numbers.
# The frame was transposed while it still held the text "Page" column next to the
# numeric counts, and pandas gives a transposed mixed-dtype frame the generic
# object dtype. Casting to float64 (NaN keeps marking missing values) lets the
# later sum / mean / plot calls work on numeric columns instead of Python objects.
df = df.set_index("Date").astype("float64")

In [None]:
df.head()

Now the data is in proper time series format.

## Splitting data based on different access types and different agents.

In [None]:
# Every column name is "_"-separated; the second-to-last field is taken as the
# access type and the last field as the agent.
access_types = [name.split("_")[-2] for name in df.columns]
agents = [name.split("_")[-1] for name in df.columns]

In [None]:
# Counting access types
from collections import Counter
access_dict = Counter(access_types)
access_dict

In [None]:
# Report how many columns were counted for each of the three access types.
for label, access_key in (
    ("Number of topics with all-access type:", "all-access"),
    ("Number of topics with desktop access:", "desktop"),
    ("Number of topics with mobile-web access:", "mobile-web"),
):
    print(label, access_dict[access_key])

In [None]:
# One row per access type together with the number of columns that carry it.
access_df = pd.DataFrame(
    list(access_dict.items()),
    columns=["Access type", "Number of columns"],
)
access_df

In [None]:
# Counting agents
agents_dict = Counter(agents)
agents_dict

In [None]:
# Report how many columns were counted for each of the two agents.
for label, agent_key in (
    ("Number of topics with spider as agent:", "spider"),
    ("Number of topics with all-agents as agent:", "all-agents"),
):
    print(label, agents_dict[agent_key])

In [None]:
# One row per agent together with the number of columns that carry it.
agents_df = pd.DataFrame(
    list(agents_dict.items()),
    columns=["Agent", "Number of columns"],
)
agents_df

In [None]:
# Identifying number of columns with null values with respect to access type
def count_null_columns(pattern):
    """Count the columns of the global ``df`` tagged with ``pattern`` that contain a null.

    A column is selected when ``pattern`` is exactly equal to one of the last three
    "_"-separated fields of its name (project, access type, agent). A plain
    substring test would also select columns whose page title merely contains
    the text (for example a title with the word "spider" in it), so the result
    would not line up with counts derived from the trailing fields.

    Parameters
    ----------
    pattern : str
        Project, access type or agent value to look for.

    Returns
    -------
    int
        Number of selected columns holding at least one null value
        (0 when no column is selected).
    """
    pattern_columns = [
        column for column in df.columns
        if pattern in column.rsplit("_", 3)[-3:]
    ]
    # any() flags each column that has at least one null; summing the flags counts them.
    return int(df[pattern_columns].isnull().any().sum())

# For every access type, count the columns that contain at least one null value.
no_of_cols_with_nulls = list(map(count_null_columns, access_df["Access type"]))
access_df["No of columns with nulls"] = no_of_cols_with_nulls

access_df

In [None]:
# Identifying number of columns with null values with respect to agents
def count_null_columns(pattern):
    """Count the columns of the global ``df`` tagged with ``pattern`` that contain a null.

    A column is selected when ``pattern`` is exactly equal to one of the last three
    "_"-separated fields of its name (project, access type, agent). A plain
    substring test would also select columns whose page title merely contains
    the text (for example a title with the word "spider" in it), so the result
    would not line up with counts derived from the trailing fields.

    Parameters
    ----------
    pattern : str
        Project, access type or agent value to look for.

    Returns
    -------
    int
        Number of selected columns holding at least one null value
        (0 when no column is selected).
    """
    pattern_columns = [
        column for column in df.columns
        if pattern in column.rsplit("_", 3)[-3:]
    ]
    # any() flags each column that has at least one null; summing the flags counts them.
    return int(df[pattern_columns].isnull().any().sum())

# For every agent, count the columns that contain at least one null value.
no_of_cols_with_nulls = list(map(count_null_columns, agents_df["Agent"]))
agents_df["No of columns with nulls"] = no_of_cols_with_nulls

agents_df

In [None]:
# Share of columns with at least one null, per access type, expressed in percent.
access_df["% of nulls"] = (
    access_df["No of columns with nulls"]
    .div(access_df["Number of columns"])
    .mul(100)
)

access_df

The percentage of columns containing missing values is almost the same for every access type, so the missing values show no pattern with respect to access type.

In [None]:
# Share of columns with at least one null, per agent, expressed in percent.
agents_df["% of nulls"] = (
    agents_df["No of columns with nulls"]
    .div(agents_df["Number of columns"])
    .mul(100)
)

agents_df

The percentage of columns containing missing values is almost the same for every agent, so the missing values show no pattern with respect to agent.

## Splitting data based on different projects (like 'en.wikipedia.org')

In [None]:
df.columns[86543].split("_")[-3:]

In [None]:
"_".join(df.columns[86543].split("_")[-3:])

In [None]:
df.columns[86543]

In [None]:
# The third field from the end of each "_"-separated column name is the project
# (for example 'en.wikipedia.org').
projects = [name.split("_")[-3] for name in df.columns]

In [None]:
project_dict = Counter(projects)
project_dict

In [None]:
# One row per project together with the number of columns that belong to it.
project_df = pd.DataFrame(
    list(project_dict.items()),
    columns=["Project", "Number of columns"],
)

project_df

In [None]:
# Identifying number of columns with null values with respect to projects
def count_null_columns(pattern):
    """Count the columns of the global ``df`` tagged with ``pattern`` that contain a null.

    A column is selected when ``pattern`` is exactly equal to one of the last three
    "_"-separated fields of its name (project, access type, agent). A plain
    substring test would also select columns whose page title merely contains
    the text, so the result would not line up with counts derived from the
    trailing fields.

    Parameters
    ----------
    pattern : str
        Project, access type or agent value to look for.

    Returns
    -------
    int
        Number of selected columns holding at least one null value
        (0 when no column is selected).
    """
    pattern_columns = [
        column for column in df.columns
        if pattern in column.rsplit("_", 3)[-3:]
    ]
    # any() flags each column that has at least one null; summing the flags counts them.
    return int(df[pattern_columns].isnull().any().sum())

# For every project, count the columns that contain at least one null value.
no_of_cols_with_nulls = list(map(count_null_columns, project_df["Project"]))
project_df["No of columns with nulls"] = no_of_cols_with_nulls

project_df

In [None]:
# Share of columns with at least one null, per project, expressed in percent.
project_df["% of nulls"] = (
    project_df["No of columns with nulls"]
    .div(project_df["Number of columns"])
    .mul(100)
)

project_df

In [None]:
project_df.sort_values(by = "% of nulls", ascending = False)

For the projects commons.wikimedia.org and www.mediawiki.org, about 48% of the columns contain null values.

In [None]:
df.columns

In [None]:
required_column_names = [column for column in df.columns if "commons.wikimedia.org" in column]

In [None]:
df[required_column_names].sum().mean()

In [None]:
df[required_column_names]

In [None]:
project_df["Project"]

In [None]:
def extract_total_views(project):
    """Return the total number of views over all columns of the global ``df`` for ``project``.

    A column belongs to ``project`` when the third field from the end of its
    "_"-separated name equals ``project`` exactly. A substring test would also
    pick up columns whose page title merely contains the project text.

    Parameters
    ----------
    project : str
        Project value, e.g. 'en.wikipedia.org'.

    Returns
    -------
    Sum of every value in the selected columns (nulls are skipped by ``sum``).
    """
    required_column_names = [
        column for column in df.columns
        # The [-3:-2] slice holds just the project field and is empty for names too short to have one.
        if project in column.rsplit("_", 3)[-3:-2]
    ]
    # First sum down each column, then add the per-column totals together.
    total_views = df[required_column_names].sum().sum()
    return total_views

In [None]:
# Total views for each project, in the same order as the rows of project_df.
total_views = [extract_total_views(name) for name in project_df["Project"]]

total_views

In [None]:
project_df["Total views"] = total_views
project_df

In [None]:
def extract_average_views(project):
    """Return the mean of the per-column view totals of the global ``df`` for ``project``.

    A column belongs to ``project`` when the third field from the end of its
    "_"-separated name equals ``project`` exactly. A substring test would also
    pick up columns whose page title merely contains the project text.

    Parameters
    ----------
    project : str
        Project value, e.g. 'en.wikipedia.org'.

    Returns
    -------
    Mean over the selected columns of each column's summed views.
    """
    required_column_names = [
        column for column in df.columns
        # The [-3:-2] slice holds just the project field and is empty for names too short to have one.
        if project in column.rsplit("_", 3)[-3:-2]
    ]
    # Sum down each column first, then average those per-column totals.
    average_views = df[required_column_names].sum().mean()
    return average_views

In [None]:
# Average views for each project, in the same order as the rows of project_df.
average_views = [extract_average_views(name) for name in project_df["Project"]]

average_views

In [None]:
project_df["Average views"] = average_views
project_df

In [None]:
# Store both view statistics as 64-bit integers.
for views_column in ('Total views', 'Average views'):
    project_df[views_column] = project_df[views_column].astype('int64')
project_df

In [None]:
project_df_sorted = project_df.sort_values(by = "Average views", ascending = False)
project_df_sorted

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Bar chart of average views per project; bars follow the row order of project_df_sorted.
plt.figure(figsize=(10, 6))
sns.barplot(data=project_df_sorted, x="Project", y="Average views")
plt.xticks(rotation="vertical")
plt.title("Average views per each project")
plt.show()

## Popular pages in "en.wikipedia.org"

In [None]:
# Columns whose name contains "en.wikipedia.org"
en_wikipedia_org_columns = [name for name in df.columns if "en.wikipedia.org" in name]

# The five columns with the highest mean views, largest first
top_pages_en = (
    df[en_wikipedia_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_en

In [None]:
# Line plot of the five top en.wikipedia.org pages; the title and y-label let the
# figure stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_en.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: en.wikipedia.org",
)
ax.set_ylabel("Views");

## Popular pages in "es.wikipedia.org"

In [None]:
# Columns whose name contains "es.wikipedia.org"
es_wikipedia_org_columns = [name for name in df.columns if "es.wikipedia.org" in name]

# The five columns with the highest mean views, largest first
top_pages_es = (
    df[es_wikipedia_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_es

In [None]:
# Line plot of the five top es.wikipedia.org pages; the title and y-label let the
# figure stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_es.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: es.wikipedia.org",
)
ax.set_ylabel("Views");

## Popular pages in "ru.wikipedia.org"

In [None]:
# Columns whose name contains "ru.wikipedia.org"
ru_wikipedia_org_columns = [name for name in df.columns if "ru.wikipedia.org" in name]

# The five columns with the highest mean views, largest first
top_pages_ru = (
    df[ru_wikipedia_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_ru

In [None]:
# Line plot of the five top ru.wikipedia.org pages; the title and y-label let the
# figure stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_ru.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: ru.wikipedia.org",
)
ax.set_ylabel("Views");

## Popular pages in "de.wikipedia.org"

In [None]:
# Columns whose name contains "de.wikipedia.org"
de_wikipedia_org_columns = [name for name in df.columns if "de.wikipedia.org" in name]

# The five columns with the highest mean views, largest first
top_pages_de = (
    df[de_wikipedia_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_de

In [None]:
# Line plot of the five top de.wikipedia.org pages; the title and y-label let the
# figure stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_de.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: de.wikipedia.org",
)
ax.set_ylabel("Views");

## Popular pages in "ja.wikipedia.org"

In [None]:
# Columns whose name contains "ja.wikipedia.org"
ja_wikipedia_org_columns = [name for name in df.columns if "ja.wikipedia.org" in name]

# The five columns with the highest mean views, largest first
top_pages_ja = (
    df[ja_wikipedia_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_ja

In [None]:
# Line plot of the five top ja.wikipedia.org pages; the title and y-label let the
# figure stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_ja.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: ja.wikipedia.org",
)
ax.set_ylabel("Views");

## Popular pages in "fr.wikipedia.org"

In [None]:
# Columns whose name contains "fr.wikipedia.org"
fr_wikipedia_org_columns = [name for name in df.columns if "fr.wikipedia.org" in name]

# The five columns with the highest mean views, largest first
top_pages_fr = (
    df[fr_wikipedia_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_fr

In [None]:
# Line plot of the five top fr.wikipedia.org pages; the title and y-label let the
# figure stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_fr.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: fr.wikipedia.org",
)
ax.set_ylabel("Views");

## Popular pages in "zh.wikipedia.org"

In [None]:
# Columns whose name contains "zh.wikipedia.org"
zh_wikipedia_org_columns = [name for name in df.columns if "zh.wikipedia.org" in name]

# The five columns with the highest mean views, largest first
top_pages_zh = (
    df[zh_wikipedia_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_zh

In [None]:
# Line plot of the five top zh.wikipedia.org pages; the title and y-label let the
# figure stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_zh.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: zh.wikipedia.org",
)
ax.set_ylabel("Views");

## Popular pages in "commons.wikimedia.org"

In [None]:
# Columns whose name contains "commons.wikimedia.org"
commons_wikipedia_org_columns = [name for name in df.columns if "commons.wikimedia.org" in name]

# The five columns with the highest mean views, largest first
top_pages_commons = (
    df[commons_wikipedia_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_commons

In [None]:
# Line plot of the five top commons.wikimedia.org pages; the title and y-label let the
# figure stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_commons.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: commons.wikimedia.org",
)
ax.set_ylabel("Views");

## Popular pages in "www.mediawiki.org"

In [None]:
# Columns whose name contains "www.mediawiki.org"
mediawiki_org_columns = [name for name in df.columns if "www.mediawiki.org" in name]

# The five columns with the highest mean views, largest first
top_pages_mediawiki = (
    df[mediawiki_org_columns]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
top_pages_mediawiki

In [None]:
# Line plot of the five top www.mediawiki.org pages. The figure now uses the same
# 16x9 size as the other per-project plots (it was 16x8); the title and y-label let
# it stand on its own, and the trailing ";" keeps the Text repr out of the output.
ax = df[top_pages_mediawiki.index].plot(
    figsize=(16, 9),
    title="Top 5 pages by average views: www.mediawiki.org",
)
ax.set_ylabel("Views");