In [1]:
# %pip install --quiet --no-cache --force git+https://github.com/romnn/lsde2021

In [1]:
%load_ext autoreload
%autoreload 2
import pyspark
import lsde2021.download as dl
import lsde2021.aggregate as agg
from lsde2021.types import PathLike
from pyspark.sql import SparkSession
from functools import partial
from pathlib import Path
import datetime

In [3]:
MAX_MEMORY = "60G"

# The same memory budget is applied to the executors, the driver, and the
# maximum size of results collected back to the driver.
spark = (
    SparkSession.builder
    .appName("EDA")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config('spark.driver.maxResultSize', MAX_MEMORY)
    .getOrCreate()
)
sc = spark.sparkContext

21/10/02 22:05:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Timestamps from 2019-01-01 00:00 up to and including 2019-01-03 00:00.
date_range = [
    timestamp
    for timestamp in dl.datetime_range(
        datetime.datetime(2019, 1, 1, hour=0),
        datetime.datetime(2019, 1, 3, hour=0),
    )
]
# Two days of hourly steps plus the end point itself.
assert len(date_range) == 2 * 24 + 1
# assert len(date_range) == 365 * 24 + 1

In [5]:
# Local download root used by the rest of the notebook.
# On Databricks, use Path("/dbfs/mnt/group29") here instead; the previous
# version assigned that path and immediately overwrote it.
dest = Path("./wikimedia_data")

# Single-file smoke test, kept for debugging:
# dl.download_handler(list(dl.wikimedia_files(date_range))[10], dest=dest, force=False)

# Distribute the downloads over the Spark workers and collect the handler results.
# NOTE(review): force=False presumably reuses files that already exist on disk
# (the captured log prints "using existing file ...") - confirm in lsde2021.download.
download_one = partial(dl.download_handler, dest=dest, force=False)
downloaded = (
    sc.parallelize(dl.wikimedia_files(date_range))
    .map(download_one)
    .collect()
)

print(downloaded[:3])

downloading wikimedia_data/2019/2019-01/pageviews-20190102-060000.gz + 16) / 16]
downloading wikimedia_data/2019/2019-01/pageviews-20190101-090000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190102-150000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190101-000000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190101-150000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190102-090000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190101-180000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190101-030000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190102-000000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190102-030000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190102-180000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190101-120000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190101-060000.gz
downloading wikimedia_data/2019/2019-01/pageviews-20190102-120000.gz
downloading wikimedia_

[(datetime.datetime(2019, 1, 1, 0, 0), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-000000.gz')), (datetime.datetime(2019, 1, 1, 1, 0), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-010000.gz')), (datetime.datetime(2019, 1, 1, 2, 0), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-020000.gz'))]


using existing file wikimedia_data/2019/2019-01/pageviews-20190103-000000.gz ...
                                                                                

In [6]:
# Every .gz file found (recursively) under the January 2019 download directory.
january_dir = dest / "2019/2019-01"
downloaded_files = list(january_dir.glob("**/*.gz"))

In [7]:
# The number of files on disk must match the number of results the download step returned.
n_on_disk, n_reported = len(downloaded_files), len(downloaded)
print(n_on_disk, n_reported)
assert n_on_disk == n_reported
print(downloaded_files[:10])

49 49
[PosixPath('wikimedia_data/2019/2019-01/pageviews-20190102-000000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-140000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-060000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190102-100000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-190000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-090000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-200000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190102-020000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190101-110000.gz'), PosixPath('wikimedia_data/2019/2019-01/pageviews-20190102-230000.gz')]


In [8]:
# todo: aggregate days here and store to parquet
# Start and end are the same day and the step is one day, so exactly one entry is expected.
daily_date_range = [
    *dl.datetime_range(
        datetime.datetime(2019, 1, 1, hour=0),
        datetime.datetime(2019, 1, 1, hour=0),
        interval=datetime.timedelta(days=1),
    )
]
print(len(daily_date_range))
assert len(daily_date_range) == 1

1


In [9]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
import pyspark.sql.functions as F
import traceback

# see https://stackoverflow.com/questions/51217168/wikipedia-pageviews-analysis
# domain_code
# page_title
# count_views
# total_response_size (no longer maintained)

def aggregate_daily_pageviews(date: datetime.date, src: PathLike, dest: PathLike) -> PathLike:
    """Sum the hourly pageview dumps of one day into a daily parquet dataset.

    For each of the 24 hours of ``date`` the matching hourly file below ``src``
    is loaded as a space-separated CSV. All hours that could be loaded are
    stacked, and ``view_count`` is summed per ``(domain_code, page_title)``.
    The result is written as parquet, partitioned by ``domain_code``, and
    overwrites any previous output at the same location.

    The output always has the columns ``domain_code``, ``page_title`` and
    ``view_count``, regardless of how many hourly files were available.

    Uses the notebook-level ``spark`` session.

    Args:
        date: Day to aggregate.
        src: Root directory holding the hourly files.
        dest: Root directory below which the daily output is written.

    Returns:
        The output path for the day. It is returned even when no hourly file
        could be loaded or the write failed; failures are printed, not raised.
    """
    schema = StructType([
        StructField("domain_code", StringType(), True),
        StructField("page_title", StringType(), True),
        StructField("view_count", LongType(), True),
        StructField("total_response_size", IntegerType(), True)
    ])

    csv_loader = spark.read.format("csv").option("sep", ' ')

    daily_out = dest / Path("/".join(dl.wikimedia_daily_local_file(date)))
    midnight = datetime.datetime.combine(date, datetime.time.min)

    # Stack the hourly frames instead of chaining one full outer join per hour:
    # a single aggregation handles repeated keys and null keys uniformly and
    # needs one shuffle rather than one per hour.
    hourly = None
    for hour in range(24):
        current = midnight + datetime.timedelta(hours=hour)
        file = src / Path("/".join(dl.wikimedia_local_file(current)))
        try:
            df = csv_loader.load(str(file), schema=schema) \
                .select("domain_code", "page_title", "view_count")
        except Exception as e:
            # Best effort: an hour that cannot be loaded is reported and skipped.
            print(f"failed to load {file}: {e}")
            print(traceback.format_exc())
            continue
        # union() matches columns by position; every frame has the same three columns.
        hourly = df if hourly is None else hourly.union(df)

    # Explicit None check: do not rely on the truthiness of a DataFrame.
    if hourly is not None:
        try:
            # coalesce keeps view_count non-null (0) when every hourly value was null.
            # NOTE(review): repartition() shuffles the rows again, so the preceding
            # sort does not give a global ordering in the output - confirm whether
            # the ordering is needed at all.
            daily = hourly \
                .groupBy("domain_code", "page_title") \
                .agg(F.coalesce(F.sum("view_count"), F.lit(0)).alias("view_count")) \
                .sort(F.col("view_count").desc()) \
                .repartition(F.col("domain_code"))
            daily.show()
            daily_out.parent.mkdir(parents=True, exist_ok=True)
            daily.write.format("parquet").partitionBy("domain_code").mode("overwrite").save(str(daily_out))
            print(f"wrote {daily_out}")
        except Exception as e:
            print(f"failed to save daily data {daily_out}: {e}")
            print(traceback.format_exc())
    return daily_out
    
# Calls the packaged implementation from lsde2021.aggregate, which takes the
# Spark session as its first argument.
for day in daily_date_range:
    agg.aggregate_daily_pageviews(spark, day.date(), src=dest, dest=dest / "daily")

                                                                                

wrote wikimedia_data/daily/2019/2019-1-1.parquet


In [29]:
# sc.stop()

21/10/02 20:08:18 ERROR FileFormatWriter: Aborting job 9ce22dc4-ccf1-488f-bf3f-343ddfb3647a.
org.apache.spark.SparkException: Job 41 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1(DAGScheduler.scala:1085)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1$adapted(DAGScheduler.scala:1083)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:1083)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2463)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2369)
	at org.apache.spark.SparkContext.$anonfun$stop$12(SparkContext.scala:2069)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1419)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:2069)
	at or

In [None]:
# Space-separated reader with no schema, header or inferSchema option set.
csv_loader = spark.read.format("csv").option("sep", ' ')

# Show one raw row of the first downloaded file only.
for date, file in downloaded[:1]:
    df = csv_loader.load(str(file))
    df.show(1)

In [None]:
# All four columns are read as nullable strings.
schema = StructType([
    StructField(column, StringType(), True)
    for column in ("domain", "pagename", "count", "responsebytes")
])

# Read a single test file with the explicit schema and show its first row.
df = spark.read.option("sep", " ").csv("/mnt/group29/test.gz", schema=schema)
df.show(1)