In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import bz2
import csv
import io
import json
import re
import time
import random
import requests
import datetime
from pathlib import Path
from pprint import pprint
from typing import List, Dict
from dateutil.relativedelta import relativedelta
import lsde2021.csv as csvutils
import lsde2021.utils as utils
import lsde2021.download as dl
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [None]:
# Single memory ceiling reused for the executor, the driver, and the
# maximum size of results collected back to the driver.
MAX_MEMORY = "30G"

# Build (or reuse) the notebook's Spark session; the console progress
# bar is switched off to keep cell output readable.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
sc = spark.sparkContext

# Pre-configured readers: CSV with a header row and schema inference,
# and a parquet reader carrying the same inferSchema option.
csv_loader = (
    spark.read.format("csv")
    .option("header", "True")
    .option("inferSchema", "True")
)
parquet_reader = spark.read.format("parquet").option("inferSchema", "True")

In [None]:
# Root directory that holds the processed daily pageview parquet files.
pageview_complete_processed_src = Path("../nvme/pageview_complete_processed")
# pageview_complete_per_topic_dest = Path("../nvme/pageview_complete_per_topic")

# Exclusive cut-off: only days strictly before this date are kept.
end_date = datetime.date(2021, 10, 1)

# Step 1: collect every day of the selected years that precedes end_date.
daily_pageview_files = []
for year in [2019]:  # 2019, 2020, 2021]:
    year_start = datetime.date(year, 1, 1)
    year_end = datetime.date(year, 12, 31)
    daily_pageview_files.extend(
        d
        for d in dl.date_range(year_start, year_end)
        if (end_date - d).total_seconds() > 0
    )

# Step 2: turn each day into a (parquet path, date) pair. The helper's
# return value is joined with "/" into a relative path whose suffix is
# replaced by ".parquet" before it is appended to the source root.
dated_paths = []
for day in daily_pageview_files:
    relative = Path("/".join(dl.wikimedia_pageview_complete_local_file(day, monthly=False)))
    dated_paths.append((pageview_complete_processed_src / relative.with_suffix(".parquet"), day))
daily_pageview_files = dated_paths

# Show the first few entries as a sanity check.
pprint(daily_pageview_files[:10])

In [None]:
# Key columns of the per-topic aggregates. "topic" is produced by exploding
# the per-level topic columns; the other keys are read from the source file.
group_cols = ["topic", "dbname", "wiki_code", "group", "language"]

def prepare(daily_processed_file, date):
    """Aggregate one day's processed pageviews into per-topic totals.

    Reads the parquet file at ``daily_processed_file`` and, for each of the
    columns ``topics1`` .. ``topics4``, explodes the column into one row per
    topic and sums ``daily_total`` per ``group_cols`` key. The four per-level
    results are tagged with ``level`` (1-4) and unioned.

    Args:
        daily_processed_file: Path (or string) of the day's parquet file.
        date: Object exposing ``year``, ``month`` and ``day`` (e.g. a
            ``datetime.date``); it names the column holding the sums.

    Returns:
        A Spark DataFrame with the columns ``group_cols``, then a sum column
        named ``"<year>-<month>-<day>"`` (not zero-padded), then ``level``.
        Rows whose ``group`` is null are excluded.
    """
    # Not zero-padded on purpose: this string becomes a column name in the
    # combined output, so its format is kept exactly as before.
    day = f"{date.year}-{date.month}-{date.day}"

    # "topic" does not come from the source file: it is created by the
    # explode below. Selecting it from the source as well either fails to
    # resolve or yields two "topic" columns and an ambiguous groupBy.
    source_cols = [c for c in group_cols if c != "topic"]

    df = spark.read.format("parquet").load(str(daily_processed_file))
    # "group" is a grouping key that is carried through unchanged, so dropping
    # null groups before aggregating gives the same rows as dropping them after.
    df = df.filter(F.col("group").isNotNull())

    topic_counts = None
    for level in range(1, 5):
        level_counts = (
            df.select(*source_cols, "daily_total", F.explode(f"topics{level}").alias("topic"))
            .groupBy(group_cols)
            .agg(F.sum("daily_total").alias(day))
            .withColumn("level", F.lit(level))
        )
        # union() matches columns by position; every level has the same layout.
        topic_counts = level_counts if topic_counts is None else topic_counts.union(level_counts)

    # test for pizza
    # topic_counts.filter(F.lower("topic") == "pizza").limit(100).show()

    return topic_counts

In [None]:
# Combine all daily per-topic aggregates into one wide table with one
# column per day, keyed by group_cols + "level", and write it as parquet.
if not daily_pageview_files:
    raise ValueError("daily_pageview_files is empty: nothing to combine")

# start with the first date
total_start = time.time()
daily_pageview_file, date = daily_pageview_files[0]
daily_pageview_combined = prepare(daily_pageview_file, date)

# iteratively join other days
for daily_pageview_file, date in daily_pageview_files[1:]:
    start = time.time()
    daily_pageview_combined = daily_pageview_combined.join(
        prepare(daily_pageview_file, date).repartition("group", "level"),
        on=group_cols + ["level"],
        how="outer",
    )
    daily_pageview_combined = daily_pageview_combined.repartition("group", "level")
    # Materialize the joined result and cut its lineage. Without this, the
    # count() below (and every later one) re-executes the whole chain of
    # joins built so far, so total work grows quadratically with the days.
    # NOTE: localCheckpoint() keeps the data on the executors and is not
    # fault tolerant; use sc.setCheckpointDir(...) + checkpoint() if
    # executors may be lost during the run.
    daily_pageview_combined = daily_pageview_combined.localCheckpoint()
    print("processed %s (%d rows) in %.2f minutes" % (daily_pageview_file, daily_pageview_combined.count(), (time.time() - start) / (60)))

# 6M * 4 * langs X 365

# daily_pageview_combined.limit(10).show()
daily_pageview_combined.write.format("parquet").mode("overwrite").partitionBy("group", "level").save("../nvme/pageview_complete_per_topic_combined/2019.parquet")
print("done in %.2f hours" % ((time.time() - total_start) / (60**2)))

In [None]:
# Shut down the SparkContext now that the notebook's work is finished.
sc.stop()