In [None]:
%load_ext autoreload
%autoreload 2
import lsde2021.download as dl
import lsde2021.aggregate as agg
from lsde2021.types import PathLike
from dateutil.relativedelta import relativedelta
from functools import partial
from pprint import pprint
from pathlib import Path
from typing import Tuple
import itertools
import pandas as pd
from dask import dataframe as dd
import matplotlib.pyplot as plt
import datetime
import gc
import numpy as np
from tqdm import tqdm

In [None]:
# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams["figure.figsize"] = (8, 4)

In [None]:
# Data exploration statistics: pick the days whose dumps are analysed.
years = [2018]  # 2019, 2020, 2021 are currently disabled
months = range(1, 13)
days = [1]  # 15 is currently disabled
end_date = datetime.date(2021, 10, 1)

# Every (year, month, day) combination that lies strictly before end_date.
pageview_samples = [
    sample_day
    for sample_day in (
        datetime.date(*ymd) for ymd in itertools.product(years, months, days)
    )
    if sample_day < end_date
]

# pageview_src_complete_dir is used by later cells for bookkeeping and file
# statistics; the sampled dumps themselves are read from pageview_complete_dir.
pageview_src_complete_dir = Path("../hdd/pageview_complete")
pageview_complete_dir = Path("../nvme/pageview_complete")

# The download helper returns the path components of a daily dump; they are
# joined with "/" and resolved below pageview_complete_dir.
pageview_sample_files = [
    pageview_complete_dir
    / "/".join(dl.wikimedia_pageview_complete_local_file(sample_day, monthly=False))
    for sample_day in pageview_samples
]

print(f"analyzing {len(pageview_sample_files)} files")
pprint(pageview_sample_files)

In [None]:
# Per year: count the "*.ok" marker files against the "*.bz2" archives and list
# the markers that are missing.
# NOTE(review): presumably an ".ok" file is written once the matching archive
# has been verified -- confirm against the download code.
for year in [2018, 2019, 2020, 2021]:
    year_dir = pageview_src_complete_dir / str(year)

    # Walk the directory tree once per file type and reuse the result instead
    # of re-scanning it for every count / set below. rglob() is already
    # recursive, so no "**/" prefix is needed in the pattern.
    archives = list(year_dir.rglob("*.bz2"))
    ok_markers = {str(p) for p in year_dir.rglob("*.ok")}

    print("%d: checked %d of %d" % (year, len(ok_markers), len(archives)))

    # Marker paths that should exist (archive path with its suffix swapped for
    # ".ok") but were not found; sorted so the listing is stable between runs.
    expected_markers = {str(p.with_suffix(".ok")) for p in archives}
    pprint(sorted(expected_markers - ok_markers))

In [None]:
# Column names assigned to the raw pageview dump, in file order.
pageview_columns = [
    "wiki_code",
    "page_title",
    "page_id",
    "user_client",
    "daily_total",
    "hourly_count",
]

# A page is identified by the wiki it belongs to plus its page id.
index_cols = ["wiki_code", "page_id"]

# Read-time dtypes. "page_title", "user_client" and "hourly_count" get no entry
# here ("hourly_count" encodes the hours 0..23 as letters: 0 = A, 1 = B, ...,
# 22 = W, 23 = X).
pageview_dtype = dict(
    wiki_code="category",
    page_id="category",    # will be parsed to int
    daily_total="string",  # will be parsed to int
)

# Lookup table for wiki language codes, indexed by its "code" column.
languages = pd.read_csv("./data/languages.csv", index_col="code")
languages.head()

In [None]:
# Load every sampled daily dump and combine them into one wide table: one row
# per (wiki_code, page_id) and one "<date>_daily_total" column per sampled day.
dfs = None
for date, f in tqdm(list(zip(pageview_samples, pageview_sample_files))):
    daily = pd.read_csv(
        f,
        names=pageview_columns,
        usecols=["wiki_code", "page_id", "daily_total"],
        dtype=pageview_dtype,
        sep=' ',
        engine="c",
        low_memory=False,
        on_bad_lines="skip",
        # nrows=1_000_000,
    )

    # Drop rows without a wiki code, then keep only the part of the code
    # before the first ".".
    daily = daily[daily["wiki_code"].notna()]
    daily["wiki_code"] = daily["wiki_code"].apply(lambda code: code.split(".")[0])

    # Rows whose page id is not numeric are discarded; the rest become int32.
    daily["page_id"] = pd.to_numeric(daily["page_id"], errors='coerce')
    daily = daily[daily["page_id"].notna()]
    daily["page_id"] = daily["page_id"].astype("int32")

    # Totals that cannot be parsed count as zero views.
    daily["daily_total"] = (
        pd.to_numeric(daily["daily_total"], errors='coerce').fillna(0).astype("Int32")
    )

    # Name the value column after the day and sum the rows that share the
    # same (wiki_code, page_id) key.
    day_column = f"{date.year}{str(date.month).zfill(2)}{str(date.day).zfill(2)}_daily_total"
    daily = (
        daily.rename(columns={"daily_total": day_column})
        .set_index(index_cols)
        .groupby(index_cols)
        .sum()
    )

    # Outer join so that pages seen on only some of the days are kept.
    dfs = daily if dfs is None else dfs.join(daily, on=index_cols, how="outer")

    # Release the per-day frame before the next file is read.
    del daily
    gc.collect()

    print(dfs.shape)

dfs.head()

In [None]:
# Number of rows in the combined table; the check below requires that grouping
# by the index columns yields exactly as many groups as there are rows.
total_page_count = len(dfs)
assert total_page_count == dfs.groupby(index_cols).count().shape[0]

In [None]:
# Folder the figures are saved to; created (with parents) when it is missing.
plot_dir = Path("./plots")
plot_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Per-day columns only: "mean" is masked out so that re-running this cell does
# not feed the previous result back into the average.
is_day_column = dfs.columns != "mean"
daily_page_count = dfs.loc[:, is_day_column]

# Row-wise mean over the day columns, stored as int32.
per_page_mean = daily_page_count.mean(axis=1)
dfs["mean"] = per_page_mean.astype("int32")

total_page_views = dfs["mean"].sum()
dfs.head()

In [None]:
# Persist the combined sample table, index columns included, as CSV.
samples_csv_path = pageview_src_complete_dir / "samples.csv"
dfs.to_csv(samples_csv_path, index=True)

In [None]:
# Share of pages whose mean daily views stay below a threshold (can we remove them?)
min_views = 100
below_threshold = dfs["mean"] < min_views
less_than_min_views = below_threshold.sum()
less_than_min_views_percent = 100*less_than_min_views/total_page_count
print("percent of pages with less than %d views per day: %.2f" % (
    min_views, less_than_min_views_percent
))

In [None]:
# What share of all page views comes from the N most viewed pages?
top = dfs.sort_values(by="mean", ascending=False)
for n in (100, 1_000, 10_000, 100_000):
    top_n = top.head(n)
    share_of_pages = n/total_page_count*100
    share_of_views = 100*top_n["mean"].sum()/total_page_views
    print("percent of total page views from top %d (%.5f percent) pages only: %.2f" % (
        n, share_of_pages, share_of_views
    ))

In [None]:
# Mean and standard deviation (absolute) of the per-page mean daily views.
per_page_daily_mean = daily_page_count.mean(axis=1)
print("mean absolute page views per day: %.2f" % (per_page_daily_mean.mean()))
print("stddev absolute page views per day: %.2f" % (per_page_daily_mean.std()))

In [None]:
# mean and std page views per wiki code per day (absolute and relative)
views_by_wiki = daily_page_count.groupby("wiki_code").sum()
relative_views_by_wiki = views_by_wiki / total_page_views

# Compute every statistic from the untouched per-day columns. Adding the
# summary columns one at a time and then calling .mean()/.std() on the same
# frame would include the earlier summaries in the later statistics.
per_country = views_by_wiki.assign(
    abs_mean=views_by_wiki.mean(axis=1).astype("int"),
    rel_mean=relative_views_by_wiki.mean(axis=1),
    abs_std=views_by_wiki.std(axis=1).astype("int"),
    rel_std=relative_views_by_wiki.std(axis=1),
)

per_country = per_country.sort_values(by="abs_mean", ascending=False)
# Inner merge: wiki codes without an entry in `languages` are dropped.
per_country = pd.merge(per_country.reset_index(), languages, left_on='wiki_code', right_on='code')

n = 20
top_languages = per_country.iloc[:n, :]
fig, ax = plt.subplots()
ax.bar(top_languages["name"].astype(str), top_languages["abs_mean"])
plt.xticks(rotation=45)
# The bars run along the x axis by language; the bar height is the view count.
plt.xlabel("language")
plt.ylabel("absolute mean daily page views per language")
plt.title("absolute mean daily page views for top %d languages" % n)
plt.tight_layout()
plt.savefig(plot_dir / ("mean_daily_pageviews_top_%d_languages.pdf" % n))

In [None]:
# Mean and standard deviation (absolute) of the number of pages with a value per day.
pages_seen_per_day = daily_page_count.notna().sum()
print("mean pages visited per day: %.2f" % (pages_seen_per_day.mean()))
print("stddev pages visited per day: %.2f" % (pages_seen_per_day.std()))

In [None]:
# mean and std pages visited per wiki code per day (absolute and relative)
# count() gives the number of non-missing cells per day column and wiki code,
# i.e. how many pages of that wiki have a value on that day.
pages_by_wiki = daily_page_count.groupby("wiki_code").count().astype("int")
relative_pages_by_wiki = pages_by_wiki / total_page_count

# Compute every statistic from the untouched per-day columns so that earlier
# summary columns are not included in later ones. Both relative statistics are
# normalised by the total page count (pages, not views).
per_country = pages_by_wiki.assign(
    abs_mean=pages_by_wiki.mean(axis=1).astype("int"),
    rel_mean=relative_pages_by_wiki.mean(axis=1),
    abs_std=pages_by_wiki.std(axis=1).astype("int"),
    rel_std=relative_pages_by_wiki.std(axis=1),
)

per_country = per_country.sort_values(by="abs_mean", ascending=False)
# Inner merge: wiki codes without an entry in `languages` are dropped.
per_country = pd.merge(per_country.reset_index(), languages, left_on='wiki_code', right_on='code')

n = 20
top_languages = per_country.iloc[:n, :]
fig, ax = plt.subplots()
ax.bar(top_languages["name"].astype(str), top_languages["abs_mean"])
plt.xticks(rotation=45)
# The bars run along the x axis by language; the bar height is the page count.
plt.xlabel("language")
plt.ylabel("absolute mean pages visited per day per language")
plt.title("absolute mean pages visited for top %d languages" % n)
plt.tight_layout()
plt.savefig(plot_dir / ("mean_daily_pages_visited_top_%d_languages.pdf" % n))

In [None]:
# for all pages, in how many other days do they occur? (relative)
# A page counts as shared when none of its columns in dfs is missing.
shared_page_count = dfs[dfs.notna().all(axis=1)].shape[0]
# The label says "percent", so the fraction is scaled by 100.
print("percent of shared pages: %.2f" % (100 * shared_page_count / total_page_count))

In [None]:
# for all days, how many percent of pages are found in at least X% of days
min_days_percent = 0.75

# Number of day columns in which each page has a value ...
days_present = daily_page_count.notna().sum(axis=1)
# ... expressed as a fraction of all sampled days.
share_of_days_present = days_present / daily_page_count.shape[1]

# Pages reaching the threshold (inclusive, hence "at least" in the message).
page_occurences = (share_of_days_present >= min_days_percent).sum()

# The label says "percent", so the fraction is scaled by 100.
print("percent of pages present in at least %d percent of days: %.2f" % (
    int(min_days_percent * 100),
    100 * page_occurences / total_page_count
))

In [None]:
# Average on-disk size of the raw compressed dumps, split into monthly and daily files.
bytes_per_mb = 1024**2

# Everything below "monthly" is a monthly dump; all other archives are daily.
monthly_csv = set((pageview_src_complete_dir / "monthly").rglob("**/*.bz2"))
daily_csv = set(pageview_src_complete_dir.rglob("**/*.bz2")) - monthly_csv

daily_sizes = np.array([archive.stat().st_size for archive in daily_csv])
print("average file size daily csv: %.2f MB (%d files)" % (
    daily_sizes.mean() / bytes_per_mb, len(daily_csv)))

monthly_sizes = np.array([archive.stat().st_size for archive in monthly_csv])
print("average file size monthly csv: %.2f MB (%d files)" % (
    monthly_sizes.mean() / bytes_per_mb, len(monthly_csv)))

In [None]:
# Non-missing cells per day column, averaged over all sampled days.
non_missing_per_day = daily_page_count.notna().sum()
page_occurences = non_missing_per_day.mean()
print("mean number of rows per daily pageview csv: %d" % int(page_occurences))