In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import bz2
import csv
import io
import json
import re
import time
import random
import requests
import datetime
from pathlib import Path
from pprint import pprint
from typing import List, Dict
from dateutil.relativedelta import relativedelta
import lsde2021.csv as csvutils
import lsde2021.utils as utils
import lsde2021.download as dl
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [3]:
# One memory budget, applied to the executor, the driver and the driver's
# maximum result size.
MAX_MEMORY = "30G"

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
sc = spark.sparkContext

# Pre-configured readers: CSV with a header row and schema inference, and parquet.
csv_loader = (
    spark.read.format("csv")
    .option("header", "True")
    .option("inferSchema", "True")
)
parquet_reader = spark.read.format("parquet").option("inferSchema", "True")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/28 14:03:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/10/28 14:03:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Root directories: processed daily pageviews (input) and the per-topic
# aggregates derived from them (output).
pageview_complete_processed_src = Path("../nvme/pageview_complete_processed")
pageview_complete_per_topic_dest = Path("../nvme/pageview_complete_per_topic")

# Exclusive upper bound: only days strictly before this date are processed.
end_date = datetime.date(2021, 10, 1)

# Collect the days to process under their own name, so that
# `daily_pageview_files` only ever holds (source, destination) path pairs.
daily_dates = []
for year in [2018]:  # extend with 2019, 2020, 2021 to process more years
    daily_range = [
        day
        for day in dl.date_range(
            datetime.date(year, 1, 1),
            datetime.date(year, 12, 31),
        )
        if day < end_date
    ]
    daily_dates.extend(daily_range)

# The source and destination trees share the same relative layout, so the
# relative parquet path is computed once per day and joined onto both roots.
daily_pageview_files = [
    (
        pageview_complete_processed_src / relative_path,
        pageview_complete_per_topic_dest / relative_path,
    )
    for relative_path in (
        Path(
            "/".join(dl.wikimedia_pageview_complete_local_file(date, monthly=False))
        ).with_suffix(".parquet")
        for date in daily_dates
    )
]
pprint(daily_pageview_files[:10])

[(PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180101-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2018/2018-01/pageviews-20180101-user.parquet')),
 (PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180102-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2018/2018-01/pageviews-20180102-user.parquet')),
 (PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180103-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2018/2018-01/pageviews-20180103-user.parquet')),
 (PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180104-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2018/2018-01/pageviews-20180104-user.parquet')),
 (PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180105-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2018/2018-01/pageviews-20180105-user.parquet')),
 (PosixPath('../nvme/page

In [6]:
# Explicit schema for a pageview record; every field is nullable.
# NOTE(review): not referenced by the visible cells of this notebook.
pageview_schema = (
    T.StructType()
    .add("wiki_code", T.StringType(), True)
    .add("page_title", T.StringType(), True)
    .add("page_id", T.IntegerType(), True)
    .add("user_client", T.StringType(), True)
    .add("daily_total", T.IntegerType(), True)
    .add("hourly_count", T.StringType(), True)
)

# Key columns of one aggregated output row.
group_cols = ["topic", "dbname", "wiki_code", "group", "language"]
# "topic" is produced by exploding a `topics<N>` array, so it must not be
# selected from the input as well: that either fails to resolve or yields two
# columns named "topic", which makes the later groupBy ambiguous.
input_key_cols = [name for name in group_cols if name != "topic"]

# Topic hierarchy levels; level N is read from the input column `topics<N>`.
topic_levels = [1, 2, 3, 4]
levels_daily_counts = ["level%d_daily_total" % level for level in topic_levels]

for daily_processed_file, daily_processed_per_topics_output_file in daily_pageview_files:
    start = time.time()

    df = spark.read.format("parquet").load(str(daily_processed_file))

    # For each level: one row per (page, topic), then per key the summed daily
    # views and the number of contributing rows. The per-level results are
    # outer-joined so that a key present at any level is kept.
    topic_counts = None
    for level in topic_levels:
        level_counts = (
            df.select(
                *input_key_cols,
                "daily_total",
                F.explode("topics%d" % level).alias("topic"),
            )
            .groupBy(group_cols)
            .agg(
                F.sum("daily_total").alias("level%d_daily_total" % level),
                F.count(F.lit(1)).alias("level%d_page_count" % level),
            )
        )
        if topic_counts is None:
            topic_counts = level_counts
        else:
            topic_counts = topic_counts.join(level_counts, on=group_cols, how="outer")

    # Number of levels with a non-null daily total for this key. The builtin
    # sum() starts from 0 and adds the 0/1 flag columns into one Column.
    topic_counts = topic_counts.withColumn(
        "num_levels",
        sum(topic_counts[name].isNotNull().cast("int") for name in levels_daily_counts),
    )
    # Drop keys without any level total and keys with no group to partition by.
    topic_counts = topic_counts.filter(
        (F.col("num_levels") > 0) & (F.col("group").isNotNull())
    )

    # Write out to parquet, partitioned by the `group` column.
    # NOTE(review): the original comment called this "the country code" - confirm.
    topic_counts.write.format("parquet").mode("overwrite").partitionBy("group").save(
        str(daily_processed_per_topics_output_file)
    )
    print("wrote %s in %.2f minutes" % (daily_processed_per_topics_output_file, (time.time() - start) / 60))

AnalysisException: Path does not exist: file:/home/jovyan/nvme/pageview_complete_processed/2018/2018-01/pageviews-20180101-user.parquet