In [None]:
# %pip install --quiet --no-cache --force git+https://github.com/romnn/lsde2021

In [None]:
%load_ext autoreload
%autoreload 2
import pyspark
import lsde2021.download as dl
import lsde2021.aggregate as agg
from lsde2021.types import PathLike
from pyspark.sql import SparkSession
from functools import partial
from pathlib import Path
import datetime

In [None]:
# One memory budget, applied to the executor, the driver and the driver's
# maximum collected-result size alike.
MAX_MEMORY = "60G"

_builder = SparkSession.builder.appName("EDA")
for _memory_key in (
    "spark.executor.memory",
    "spark.driver.memory",
    "spark.driver.maxResultSize",
):
    _builder = _builder.config(_memory_key, MAX_MEMORY)

# Reuse an already running session if there is one, otherwise start it.
spark = _builder.getOrCreate()
sc = spark.sparkContext

In [None]:
# Local directory that receives the downloaded hourly dumps.
dest = Path("./wikimedia_data")

for year in (2018, 2019, 2020, 2021):
    # Hourly timestamps from Jan 1st 00:00 to Jan 8th 00:00 of this year.
    week_start = datetime.datetime(year, 1, 1, hour=0)
    week_end = datetime.datetime(year, 1, 8, hour=0)
    hour_range = list(dl.datetime_range(week_start, week_end))
    # Seven full days plus one extra entry (the range looks end-inclusive).
    assert len(hour_range) == 7 * 24 + 1

    # Fan the downloads out over Spark; force=False is passed to the handler.
    fetch = partial(dl.download_handler, dest=dest, force=False)
    downloaded = sc.parallelize(dl.wikimedia_files(hour_range)).map(fetch).collect()

    # Sanity check: list a few of the .gz files found below this year's January folder.
    downloaded_files = list((dest / f"{year}/{year}-01").glob("**/*.gz"))
    print(downloaded_files[:10])

In [None]:
for year in (2018, 2019, 2020, 2021):
    # Start and end are the same instant, so exactly one day is expected.
    new_years_day = datetime.datetime(year, 1, 1, hour=0)
    daily_date_range = list(
        dl.datetime_range(
            new_years_day,
            new_years_day,
            interval=datetime.timedelta(days=1),
        )
    )
    print(len(daily_date_range))
    assert len(daily_date_range) == 1

    # Aggregate each day's hourly files and store the result as parquet.
    for date in daily_date_range:
        agg.aggregate_daily_pageviews(
            date.date(),
            spark=spark,
            src=dest,
            dest=dest / "daily",
        )

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
import pyspark.sql.functions as F
import traceback

# Column layout of the space-separated hourly pageview dumps, in file order
# (see https://stackoverflow.com/questions/51217168/wikipedia-pageviews-analysis):
#   domain_code
#   page_title
#   count_views
#   total_response_size (no longer maintained)

def aggregate_daily_pageviews(
    date: datetime.date,
    src: PathLike,
    dest: PathLike,
    spark: "Optional[SparkSession]" = None,
) -> PathLike:
    """Sum the hourly pageview dumps of one day and store the totals as parquet.

    The 24 hourly files of ``date`` are read as space-separated CSV, stacked,
    and summed per ``(domain_code, page_title)``. Hours whose file cannot be
    loaded are reported on stdout and skipped. If at least one hour loaded,
    the result is shown and written to parquet partitioned by ``domain_code``
    (overwriting previous output); write failures are reported on stdout
    instead of being raised.

    Args:
        date: The day to aggregate.
        src: Root directory containing the hourly dumps.
        dest: Root directory below which the daily parquet output is written.
        spark: Session used for reading. When omitted, the active session is
            obtained via ``SparkSession.builder.getOrCreate()``.

    Returns:
        The output path for ``date`` below ``dest``. It is returned even when
        nothing was written.
    """
    session = spark if spark is not None else SparkSession.builder.getOrCreate()

    schema = StructType([
        StructField("domain_code", StringType(), True),
        StructField("page_title", StringType(), True),
        StructField("view_count", LongType(), True),
        StructField("total_response_size", IntegerType(), True)
    ])
    csv_loader = session.read.format("csv").option("sep", ' ')

    daily_out = dest / Path("/".join(dl.wikimedia_daily_local_file(date)))
    midnight = datetime.datetime.combine(date, datetime.time.min)

    # Collect one frame per hour that could be loaded; only the key columns
    # and the counter are kept so every frame has the same shape.
    hourly_frames = []
    for hour in range(24):
        current = midnight + datetime.timedelta(hours=hour)
        hourly_file = src / Path("/".join(dl.wikimedia_local_file(current)))
        try:
            hourly_frames.append(
                csv_loader
                .load(str(hourly_file), schema=schema)
                .select("domain_code", "page_title", "view_count")
            )
        except Exception as e:
            # Best effort: a missing or unreadable hour must not abort the day.
            print(f"failed to load {hourly_file}: {e}")
            print(traceback.format_exc())

    if not hourly_frames:
        return daily_out

    try:
        stacked = hourly_frames[0]
        for frame in hourly_frames[1:]:
            stacked = stacked.union(frame)

        # A single grouped sum replaces the chain of full outer joins: it
        # cannot multiply rows when a key repeats within an hour, and it
        # yields the same columns no matter how many hours were loaded.
        # Missing counts are treated as zero, as before.
        daily = (
            stacked
            .fillna(0, subset=["view_count"])
            .groupBy("domain_code", "page_title")
            .agg(F.sum("view_count").alias("view_count"))
            # Sort after the shuffle: a repartition does not keep the order
            # established by a preceding sort. Leading with the partition
            # column keeps rows of one domain together, most viewed first.
            .repartition(F.col("domain_code"))
            .sortWithinPartitions(F.col("domain_code"), F.col("view_count").desc())
        )
        daily.show()
        daily_out.parent.mkdir(parents=True, exist_ok=True)
        daily.write.format("parquet").partitionBy("domain_code").mode("overwrite").save(str(daily_out))
        print(f"wrote {daily_out}")
    except Exception as e:
        print(f"failed to save daily data {daily_out}: {e}")
        print(traceback.format_exc())
    return daily_out
    
# Aggregate every day in the range. The date is the first positional
# argument and the Spark session is passed by keyword.
for date in daily_date_range:
    agg.aggregate_daily_pageviews(date.date(), spark=spark, src=dest, dest=dest / "daily")

In [None]:
%reload_ext autoreload
%autoreload 2
import lsde2021.aggregate as agg
# Run the aggregation from the driver rather than inside an RDD `map`:
# the aggregation reads its input through the SparkSession, and a session
# cannot be used from within Spark tasks.
# NOTE: despite its name, `hourly` holds the values returned per *day*.
hourly = [
    agg.aggregate_daily_pageviews(day, spark=spark, src=dest, dest=dest / "daily")
    for day in [d.date() for d in daily_date_range][:1]
]
print(len(hourly))
print(hourly[:10])

In [None]:
# sc.stop()

In [None]:
# Reader for space-separated files; the header / delimiter / inferschema
# options stay disabled.
csv_loader = spark.read.format("csv").option("sep", ' ')

# Peek at the first downloaded file only.
for date, file in downloaded[:1]:
    df = csv_loader.load(str(file))
    df.show(1)

In [None]:
# Every column is declared as a nullable string.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("domain", "pagename", "count", "responsebytes")
])

# Read a single space-separated test dump with the explicit schema.
df = spark.read.csv("/mnt/group29/test.gz", schema=schema, sep=" ")
df.show(1)