In [1]:
%load_ext autoreload
%autoreload 2
import pyspark
import lsde2021.download as dl
import lsde2021.aggregate as agg
from lsde2021.types import PathLike
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from functools import partial
from pprint import pprint
from pathlib import Path
from typing import Tuple
import datetime
import pandas as pd
import gc

In [2]:
# Single memory budget reused for the executor, the driver and the driver's
# result buffer.
MAX_MEMORY = "60G"

# Build (or reuse) the Spark session used to fan out the downloads.
spark = (
    SparkSession.builder
    .appName("download-pageview-complete")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
sc = spark.sparkContext

21/10/10 23:35:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
def download_wikipedia_pageview_complete_handler(
    item: Tuple[datetime.date, str],
    dest: PathLike,
    monthly: bool = False,
    kind: str = "user",
    force: bool = False,
) -> Tuple[datetime.date, PathLike]:
    """Download one pageview-complete dump below ``dest``.

    Args:
        item: ``(date, url)`` pair identifying the dump and where to fetch it.
        dest: Root directory; the relative path returned by
            ``dl.wikimedia_pageview_complete_local_file`` is appended to it.
        monthly: Forwarded to ``dl.wikimedia_pageview_complete_local_file``.
        kind: Forwarded to ``dl.wikimedia_pageview_complete_local_file``.
        force: Forwarded to ``dl.download_file``; when set, the validator
            below never accepts an existing file.

    Returns:
        ``(date, result)`` where ``result`` is whatever ``dl.download_file``
        returns.

    NOTE(review): ``dl.download_file`` presumably skips the download when
    ``validate_file_func`` returns True -- confirm in ``lsde2021.download``.
    """
    date, url = item
    filename = Path(
        "/".join(
            dl.wikimedia_pageview_complete_local_file(date, monthly=monthly, kind=kind)
        )
    )
    destination = Path(dest) / filename
    pageview_columns = ["wiki_code", "page_title", "page_id", "user_client", "daily_total", "hourly_count"]
    # Rows parsed per chunk while validating; keeps memory bounded instead of
    # materialising the whole decompressed dump in one DataFrame.
    validation_chunk_rows = 1_000_000

    def validate_file_func(_destination: PathLike) -> bool:
        """Return True if ``_destination`` exists and parses as a pageview dump.

        A sibling ``.ok`` marker file caches a successful check so the file
        is not parsed again on later runs.
        """
        if force:
            return False
        # Normalise once so plain string paths work as well as Path objects.
        path = Path(_destination)
        if not path.exists():
            return False
        ok_file = path.with_suffix(".ok")
        if ok_file.exists():
            return True
        try:
            # Stream through the whole file; any read/parse error marks it invalid.
            reader = pd.read_csv(
                path, sep=" ", names=pageview_columns, chunksize=validation_chunk_rows
            )
            try:
                for _chunk in reader:
                    pass
            finally:
                reader.close()
        except Exception:
            return False
        ok_file.touch()
        return True

    return date, dl.download_file(
        url, destination=destination, force=force, validate_file_func=validate_file_func
    )

In [4]:
# Root directory that every downloaded pageview dump is written below.
pageview_complete_dest = Path("../hdd/pageview_complete")
# Exclusive upper bound: the download cells keep only dates strictly before it.
end_date = datetime.date(year=2021, month=10, day=1)

In [10]:
# optional: remove all .ok files
# Flip to True to delete the validation markers, so the download handler's
# validator re-reads each file instead of trusting an existing marker.
REMOVE_OK_FILES = False


def remove_ok_files(root: Path) -> list:
    """Delete every ``*.ok`` file below ``root`` and return the removed paths."""
    removed = []
    # rglob is already recursive, so no leading "**/" is needed in the pattern.
    for ok_file in sorted(Path(root).rglob("*.ok")):
        ok_file.unlink()
        removed.append(ok_file)
    return removed


if REMOVE_OK_FILES:
    remove_ok_files(pageview_complete_dest)

In [None]:
# All dates requested so far; the verification cell compares them to the disk.
expected_daily_downloaded = []
for year in range(2018, 2022):
    # Every day of the year that lies strictly before end_date.
    daily_range = [
        d
        for d in dl.date_range(datetime.date(year, 1, 1), datetime.date(year, 12, 31))
        if d < end_date
    ]
    expected_daily_downloaded.extend(daily_range)

    print(f"downloading {len(daily_range)} days for year {year} ...")

    # Map the handler over the (date, url) items in 4 Spark slices.
    # `downloaded` is reassigned every iteration, so after the loop it holds
    # only the results of the last year.
    downloaded = (
        sc.parallelize(dl.wikimedia_pageview_complete_urls(daily_range), numSlices=4)
        .map(
            partial(
                download_wikipedia_pageview_complete_handler,
                dest=pageview_complete_dest,
                monthly=False,
                force=False,
            )
        )
        .collect()
    )

downloading 365 days for year 2018 ...


downloading file ../hdd/pageview_complete/2018/2018-01/pageviews-20180101-user.bz2 ...
  if not validate_file_func or validate_file_func(destination):
using existing file ../hdd/pageview_complete/2018/2018-10/pageviews-20181001-user.bz2 ...
  if not validate_file_func or validate_file_func(destination):
  if not validate_file_func or validate_file_func(destination):
using existing file ../hdd/pageview_complete/2018/2018-07/pageviews-20180702-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-04/pageviews-20180402-user.bz2 ...
  pass
  if not validate_file_func or validate_file_func(destination):
  if not validate_file_func or validate_file_func(destination):
using existing file ../hdd/pageview_complete/2018/2018-07/pageviews-20180703-user.bz2 ...
downloading file ../hdd/pageview_complete/2018/2018-01/pageviews-20180102-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-10/pageviews-20181002-user.bz2 ...
using existing file ../hdd/pageview_complete/2018

using existing file ../hdd/pageview_complete/2018/2018-07/pageviews-20180720-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-01/pageviews-20180115-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-04/pageviews-20180420-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-07/pageviews-20180721-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-10/pageviews-20181019-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-01/pageviews-20180116-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-04/pageviews-20180421-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-07/pageviews-20180722-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-10/pageviews-20181020-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-01/pageviews-20180117-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-04/pageviews-20180422-user.bz2 ...
using exis

using existing file ../hdd/pageview_complete/2018/2018-05/pageviews-20180512-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-08/pageviews-20180812-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-11/pageviews-20181109-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-02/pageviews-20180207-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-05/pageviews-20180513-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-08/pageviews-20180813-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-11/pageviews-20181110-user.bz2 ...
downloading file ../hdd/pageview_complete/2018/2018-08/pageviews-20180814-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-02/pageviews-20180208-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-05/pageviews-20180514-user.bz2 ...
using existing file ../hdd/pageview_complete/2018/2018-11/pageviews-20181111-user.bz2 ...
using existin

In [6]:
# Relative paths the daily download is expected to have produced.
expected_daily_downloaded_filenames = [
    Path("/".join(dl.wikimedia_pageview_complete_local_file(date, monthly=False)))
    for date in expected_daily_downloaded
]

# Every .bz2 currently on disk, relative to the download root. The root comes
# from the configuration cell rather than a second hard-coded copy of the
# path, so the two cannot drift apart.
actual_filenames = [
    f.relative_to(pageview_complete_dest)
    for f in pageview_complete_dest.glob("**/*.bz2")
]

# Expected files that are not present on disk; extra files are ignored.
diff = set(expected_daily_downloaded_filenames) - set(actual_filenames)
assert not diff, (
    f"{len(diff)} expected daily file(s) missing under {pageview_complete_dest}, "
    f"e.g. {sorted(map(str, diff))[:5]}"
)

In [7]:
# All month start dates requested so far, accumulated across the years.
expected_monthly_downloaded = []
for year in range(2018, 2022):
    # First day of each month of the year that lies strictly before end_date.
    monthly_range = [
        d
        for d in dl.date_range(
            datetime.date(year, 1, 1),
            datetime.date(year, 12, 1),
            interval=relativedelta(months=1),
        )
        if d < end_date
    ]
    expected_monthly_downloaded.extend(monthly_range)

    print(f"downloading {len(monthly_range)} months for year {year} ...")

    # Map the handler over the (date, url) items in 4 Spark slices.
    # `downloaded` is reassigned every iteration, so after the loop it holds
    # only the results of the last year.
    downloaded = (
        sc.parallelize(dl.wikimedia_pageview_complete_urls(monthly_range), numSlices=4)
        .map(
            partial(
                download_wikipedia_pageview_complete_handler,
                dest=pageview_complete_dest,
                monthly=True,
                force=False,
            )
        )
        .collect()
    )

downloading 12 months for year 2018 ...
downloading 12 months for year 2019 ...


using existing file ../hdd/pageview_complete/monthly/2018/2018-01/pageviews-201801-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2018/2018-02/pageviews-201802-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2018/2018-03/pageviews-201803-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2018/2018-04/pageviews-201804-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2018/2018-10/pageviews-201810-user.bz2 ...using existing file ../hdd/pageview_complete/monthly/2018/2018-07/pageviews-201807-user.bz2 ...

using existing file ../hdd/pageview_complete/monthly/2018/2018-05/pageviews-201805-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2018/2018-08/pageviews-201808-user.bz2 ...using existing file ../hdd/pageview_complete/monthly/2018/2018-11/pageviews-201811-user.bz2 ...

using existing file ../hdd/pageview_complete/monthly/2018/2018-06/pageviews-201806-user.bz2 ...
using existing file ../hdd/pageview_comp

downloading 12 months for year 2020 ...
downloading 9 months for year 2021 ...


using existing file ../hdd/pageview_complete/monthly/2021/2021-01/pageviews-202101-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2021/2021-05/pageviews-202105-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2021/2021-02/pageviews-202102-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2021/2021-07/pageviews-202107-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2021/2021-06/pageviews-202106-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2021/2021-08/pageviews-202108-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2021/2021-09/pageviews-202109-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2021/2021-03/pageviews-202103-user.bz2 ...
using existing file ../hdd/pageview_complete/monthly/2021/2021-04/pageviews-202104-user.bz2 ...


In [8]:
# Uncomment to shut down the Spark context once all downloads have finished:
# sc.stop()