In [None]:
%load_ext autoreload
%autoreload 2
import pyspark
import lsde2021.download as dl
import lsde2021.aggregate as agg
from lsde2021.types import PathLike
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from functools import partial
from pprint import pprint
from pathlib import Path
from typing import Tuple
import pandas as pd
import datetime

In [None]:
# Single memory budget reused for the executor, the driver and the cap on
# results collected back to the driver.
MAX_MEMORY = "60G"

# Build (or reuse) the notebook-wide Spark session. The console progress bar
# is switched off via spark.ui.showConsoleProgress.
spark = (
    SparkSession.builder
    .appName("EDA")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)

# Handle to the underlying SparkContext, used for the RDD API.
sc = spark.sparkContext

In [None]:
def download_wikipedia_sql_dump_handler(
    item: Tuple[Tuple[datetime.date, str, str], str],
    dest: PathLike,
    force: bool = False,
) -> Tuple[datetime.date, PathLike]:
    """Download a single Wikimedia SQL dump file below ``dest``.

    Args:
        item: ``((date, wiki, table), url)`` - the key identifying the dump
            and the URL to fetch it from.
        dest: Root directory for downloads. The relative path built from
            ``dl.wikimedia_sql_dump_local_file(date, wiki=wiki, table=table)``
            is appended to it.
        force: When true, an already existing local file is never reported
            as valid to ``dl.download_file``.

    Returns:
        ``(date, result)`` where ``result`` is whatever ``dl.download_file``
        returns (presumably the local path - confirm against
        ``lsde2021.download``).
    """
    (date, wiki, table), url = item
    filename = Path(
        "/".join(
            dl.wikimedia_sql_dump_local_file(date, wiki=wiki, table=table)
        )
    )
    # Normalise once so that both ``str`` and ``Path`` roots are accepted.
    destination = Path(dest) / filename

    def validate_file_func(_destination: PathLike) -> bool:
        """Return True when the existing local file may be reused."""
        if force:
            return False
        # The callback argument is PathLike, so it may be a plain string;
        # convert before calling any Path method on it.
        local_file = Path(_destination)
        if not local_file.exists():
            return False
        # A sibling ".ok" file marks a file that was accepted before.
        # NOTE(review): an existing file without a marker is accepted and
        # marked as well, so a truncated download would pass - confirm that
        # this is intended.
        ok_file = local_file.with_suffix(".ok")
        if not ok_file.exists():
            ok_file.touch()
        return True

    return date, dl.download_file(
        url, destination=destination, force=force, validate_file_func=validate_file_func
    )

In [None]:
# Root directory that receives the downloaded dump files.
dest = Path("../hdd/wikipedia_sql_dumps")

# Dump tables requested for every wiki.
tables = [
    "langlinks",
    "page",
    "category",
    "categorylinks",
]

# Language list, indexed by its "code" column.
languages = pd.read_csv("./data/languages.csv", index_col="code")
languages.head()

In [None]:
# Materialise the download items that dl.wikimedia_sql_dump_urls yields for
# the 2021-10-01 dump date, the wikis in the "dbname" column and the
# requested tables (presumably ((date, wiki, table), url) pairs - confirm
# against lsde2021.download).
downloads = [
    *dl.wikimedia_sql_dump_urls(
        [datetime.date(2021, 10, 1)],
        wikis=languages["dbname"],
        tables=tables,
    )
]
# Preview only the first few entries to keep the cell output short.
pprint(downloads[:10])

In [None]:
# Distribute the download items over 4 partitions, run the handler on each
# item and collect the per-item results back on the driver.
downloaded = (
    sc.parallelize(downloads, numSlices=4)
    .map(partial(download_wikipedia_sql_dump_handler, dest=dest, force=False))
    .collect()
)