In [None]:
%load_ext autoreload
%autoreload 2
import pyspark
import lsde2021.download as dl
import lsde2021.aggregate as agg
from lsde2021.types import PathLike
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from functools import partial
from pprint import pprint
from pathlib import Path
from typing import Tuple
import datetime
import pandas as pd
import gc

In [None]:
# One memory budget reused for the executor heap, the driver heap and the
# maximum size of results collected back to the driver.
MAX_MEMORY = "60G"

# Build the session (or reuse an already running one) with the console
# progress bar switched off.
spark = (
    SparkSession.builder
    .appName("download-pageview-complete")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config('spark.driver.maxResultSize', MAX_MEMORY)
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
def download_wikipedia_pageview_complete_handler(
    item: Tuple[datetime.date, str],
    dest: PathLike,
    monthly: bool = False,
    kind: str = "user",
    force: bool = False,
) -> Tuple[datetime.date, PathLike]:
    """Download a single pageview-complete dump and validate the local copy.

    Written to be used as a Spark ``map`` function (with the keyword
    arguments bound via ``functools.partial``).

    Args:
        item: ``(date, url)`` pair; ``url`` is the remote location of the dump
            for ``date``.
        dest: Root directory below which the dump is stored. The relative
            location is built by joining the parts returned by
            ``dl.wikimedia_pageview_complete_local_file`` with ``/``.
        monthly: Forwarded to ``dl.wikimedia_pageview_complete_local_file``
            to select the monthly instead of the daily file layout.
        kind: Forwarded to ``dl.wikimedia_pageview_complete_local_file``.
        force: When true, an existing local file is never accepted as valid,
            and the flag is also forwarded to ``dl.download_file``.

    Returns:
        ``(date, result)`` where ``result`` is whatever ``dl.download_file``
        returns (presumably the local path of the file -- confirm against
        ``lsde2021.download``).
    """
    date, url = item
    filename = Path(
        "/".join(
            dl.wikimedia_pageview_complete_local_file(date, monthly=monthly, kind=kind)
        )
    )
    destination = Path(dest) / filename
    pageview_columns = ["wiki_code", "page_title", "page_id", "user_client", "daily_total", "hourly_count"]
    # Number of rows parsed per chunk during validation; bounds peak memory.
    validation_chunk_rows = 1_000_000

    def validate_file_func(_destination: PathLike) -> bool:
        """Return True if the file at ``_destination`` exists and can be parsed.

        A sibling ``.ok`` marker file caches a successful check so the
        expensive parse is done at most once per file.
        """
        if force:
            # a forced download never trusts what is already on disk
            return False

        # Accept str as well as Path: the original mixed Path(_destination)
        # with direct attribute access on the raw argument.
        target = Path(_destination)
        if not target.exists():
            return False

        ok_file = target.with_suffix(".ok")
        if ok_file.exists():
            return True

        try:
            # Stream the file chunk by chunk instead of materialising the whole
            # dump as one DataFrame: the complete file is still decompressed
            # and parsed, but only one chunk is held in memory at a time.
            for _ in pd.read_csv(
                target,
                sep=' ',
                names=pageview_columns,
                chunksize=validation_chunk_rows,
            ):
                pass
        except Exception:
            # Deliberately broad: any failure to read or parse the file means
            # it is unusable and should not be accepted as valid.
            return False

        ok_file.touch()
        return True

    return date, dl.download_file(
        url, destination=destination, force=force, validate_file_func=validate_file_func
    )

In [None]:
# Root directory (relative to the notebook's working directory) that the
# pageview-complete dumps are downloaded into.
pageview_complete_dest = Path("../hdd/pageview_complete")
# Cut-off date for the download cells.
end_date = datetime.date(year=2021, month=10, day=1)

In [None]:
# Optional maintenance step, disabled by default (flip the condition to run it):
# delete every ".ok" marker file below pageview_complete_dest.
# Without its marker a downloaded file has to be parsed again to be accepted as
# valid, so only do this if you really want to re-check all files.
# Warning: re-checking everything is very slow and resource-intensive!
if False:
    # materialise the matches first so nothing is deleted while still searching
    for ok_file in list(pageview_complete_dest.rglob("**/*.ok")):
        ok_file.unlink()

In [None]:
# Download the daily dumps year by year (one Spark `collect()` per year) and
# remember every requested day in `expected_daily_downloaded`.
expected_daily_downloaded = []
daily_handler = partial(
    download_wikipedia_pageview_complete_handler,
    dest=pageview_complete_dest,
    monthly=False,
    force=False,
)
for year in (2018, 2019, 2020, 2021):
    first_day = datetime.date(year, 1, 1)
    last_day = datetime.date(year, 12, 31)
    # keep only the days that lie strictly before end_date
    daily_range = [
        day
        for day in dl.date_range(first_day, last_day)
        if (end_date - day).total_seconds() > 0
    ]
    expected_daily_downloaded.extend(daily_range)

    print(f"downloading {len(daily_range)} days for year {year} ...")

    daily_urls = sc.parallelize(
        dl.wikimedia_pageview_complete_urls(daily_range),
        numSlices=4,
    )
    downloaded = daily_urls.map(daily_handler).collect()

In [None]:
# Sanity check: every daily dump that was requested must exist below
# pageview_complete_dest.
expected_daily_downloaded_filenames = [
    Path("/".join(dl.wikimedia_pageview_complete_local_file(day, monthly=False)))
    for day in expected_daily_downloaded
]

# Reuse the destination configured earlier instead of repeating the literal
# path, so this check cannot drift away from the download cells.
actual_filenames = [
    f.relative_to(pageview_complete_dest)
    for f in pageview_complete_dest.glob("**/*.bz2")
]

diff = set(expected_daily_downloaded_filenames) - set(actual_filenames)
# Name a few of the missing files so a failure is actionable.
assert not diff, (
    f"{len(diff)} expected daily file(s) are missing, "
    f"e.g. {[str(p) for p in sorted(diff)[:5]]}"
)

In [None]:
# Download the monthly dumps year by year (one Spark `collect()` per year) and
# remember every requested month in `expected_monthly_downloaded`.
expected_monthly_downloaded = []
monthly_handler = partial(
    download_wikipedia_pageview_complete_handler,
    dest=pageview_complete_dest,
    monthly=True,
    force=False,
)
for year in (2018, 2019, 2020, 2021):
    # one date per month: the first day of January through December
    months_of_year = dl.date_range(
        datetime.date(year, 1, 1),
        datetime.date(year, 12, 1),
        interval=relativedelta(months=+1),
    )
    # keep only the months that start strictly before end_date
    monthly_range = [
        month
        for month in months_of_year
        if (end_date - month).total_seconds() > 0
    ]
    expected_monthly_downloaded.extend(monthly_range)

    print(f"downloading {len(monthly_range)} months for year {year} ...")

    monthly_urls = sc.parallelize(
        dl.wikimedia_pageview_complete_urls(monthly_range),
        numSlices=4,
    )
    downloaded = monthly_urls.map(monthly_handler).collect()

In [None]:
# Optional: uncomment to shut down the Spark context once all downloads have finished.
# sc.stop()