In [None]:
%load_ext autoreload
%autoreload 2
import lsde2021.download as dl
import lsde2021.aggregate as agg
from lsde2021.types import PathLike
from dateutil.relativedelta import relativedelta
from functools import partial
from pprint import pprint
from pathlib import Path
from typing import Tuple
import itertools
import pandas as pd
from dask import dataframe as dd
import matplotlib.pyplot as plt
import datetime
import gc
import time
import numpy as np
from tqdm import tqdm

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
import pyspark.sql.functions as F

In [None]:
# Default (width, height) applied to every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (8, 4)

In [None]:
# Single memory budget reused for the executor, the driver and the driver's
# maximum result size.
MAX_MEMORY = "60G"

# Spark session for this notebook; the console progress bar is switched off
# through spark.ui.showConsoleProgress.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
sc = spark.sparkContext

# Reusable CSV reader: the first row is treated as the header and column types
# are inferred from the data.
csv_loader = spark.read.format("csv").options(header="True", inferSchema="True")

In [None]:
# Root of the CSV dumps (input) and root of the Parquet output, respectively.
src_dir = Path("../hdd/wikipedia_sql_dumps")
dest = Path("../nvme/wikipedia_sql_dumps")

# SQL dump tables that are processed for every wiki.
tables = [
    "langlinks",
    "page",
    "category",
    "categorylinks",
]

# Language metadata indexed by language code; its "dbname" column is what the
# notebook uses to pick the wikis to process.
languages = pd.read_csv(
    "./data/languages.csv",
    index_col="code",
)
languages.head()

In [None]:
#### Locate the CSV files of the 2021-10-01 SQL dumps for every wiki except enwiki
wikis = [dbname for dbname in languages["dbname"] if dbname != "enwiki"]
assert "enwiki" not in wikis
# wikis = ["enwiki"]

# One (table, csv_path) pair per dump: the local dump file path is resolved
# below src_dir and its suffix is replaced by ".csv".
parsed = [
    (
        table_name,
        (
            src_dir
            / "/".join(dl.wikimedia_sql_dump_local_file(dump_date, wiki=dbname, table=table_name))
        ).with_suffix(".csv"),
    )
    for (dump_date, dbname, table_name), _url in dl.wikimedia_sql_dump_urls(
        [datetime.date(2021, 10, 1)], wikis=wikis, tables=tables
    )
]
pprint(parsed[:5])

In [None]:
def convert_to_parquet(parsed_dumps, force=False):
    """Convert CSV dumps below ``src_dir`` into Parquet datasets below ``dest``.

    Each output keeps the input's path relative to ``src_dir`` with the suffix
    replaced by ``.parquet``.

    Args:
        parsed_dumps: iterable of ``(table, csv_path)`` pairs. ``csv_path`` must
            be a ``Path`` located below ``src_dir``; the table name is not used.
        force: when true, existing outputs are overwritten; otherwise inputs
            whose output path already exists are skipped.
    """
    for _table, file_path in parsed_dumps:
        output_path = (dest / file_path.relative_to(src_dir)).with_suffix(".parquet")

        # Decide whether to skip *before* touching Spark: the reader is configured
        # with inferSchema, so loading a CSV is not free even if it is never written.
        if not force and output_path.exists():
            print(f"using existing {output_path} ...")
            continue

        start = time.time()
        df = csv_loader.load(str(file_path))
        df.write.format("parquet").mode("overwrite").save(str(output_path))
        print(f"wrote {output_path} in {time.time() - start:.2f} seconds")
        gc.collect()

In [None]:
# force=True: rewrite every Parquet output even when it already exists.
convert_to_parquet(parsed_dumps=parsed, force=True)

In [None]:
# Sampling grid for the pageview dumps: one date per (year, month, day) combination.
years = [2018]  # , 2019, 2020, 2021]
months = range(1, 13)
days = [1]  # , 15]
end_date = datetime.date(2021, 10, 1)

# Keep only the sample dates that lie strictly before end_date.
pageview_samples = [
    sample
    for sample in (
        datetime.date(year, month, day)
        for year, month, day in itertools.product(years, months, days)
    )
    if sample < end_date
]

pageview_complete_src_dir = Path("../hdd/pageview_complete")
pageview_complete_dest_dir = Path("../nvme/pageview_complete")

# The original referenced `pageview_complete_dir`, which is not defined in this
# notebook section and raised NameError on a fresh kernel.
# NOTE(review): the source directory is used because these files are read as
# input further down — confirm that is the intended location.
pageview_sample_files = [
    pageview_complete_src_dir / "/".join(dl.wikimedia_pageview_complete_local_file(date, monthly=False))
    for date in pageview_samples
]
print(f"building {len(pageview_sample_files)} files")
pprint(pageview_sample_files)

In [None]:
# Reload the language table from disk (same file and index column as the
# earlier load in this notebook) and preview its first rows.
languages = pd.read_csv(
    "./data/languages.csv",
    index_col="code",
)
languages.head()

In [None]:
%%time
for date, f in tqdm(list(zip(pageview_samples, pageview_sample_files))):
    df = pd.read_csv(
        f,
        sep=' ',
        names=pageview_columns,
        low_memory=False,
        dtype=pageview_dtype,
        on_bad_lines="skip",
        # nrows=1_000_000,
        usecols=["wiki_code", "page_id", "daily_total"], # "page_title"
        engine="c",
    )
    # df = df.compute()
    
    # parse and reduce 
    df = df[df["wiki_code"].notna()]
    df["wiki_code"] = df["wiki_code"].apply(lambda c: c.split(".")[0])
    df["page_id"] = pd.to_numeric(df["page_id"], errors='coerce')
    df = df[df["page_id"].notna()]
    df["page_id"] = df["page_id"].astype("int32")
    df["daily_total"] = pd.to_numeric(df["daily_total"], errors='coerce')
    df["daily_total"] = df["daily_total"].fillna(0).astype("Int32")
    df = df.rename(columns={
        "daily_total": f"{date.year}{str(date.month).zfill(2)}{str(date.day).zfill(2)}_daily_total",
    })
    df = df.set_index(index_cols)
    df = df.groupby(index_cols).sum()
    if dfs is None:
        dfs = df
    else:
        dfs = dfs.join(df, on=index_cols, how="outer") # "page_title"

    del df
    gc.collect()
    
    # print(dfs.dtypes)
    print(dfs.shape)

dfs.head()