In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import bz2
import csv
import io
import json
import re
import time
import random
import requests
import datetime
from pathlib import Path
from pprint import pprint
from typing import List, Dict
from dateutil.relativedelta import relativedelta
import lsde2021.csv as csvutils
import lsde2021.utils as utils
import lsde2021.download as dl
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [2]:
MAX_MEMORY = "30G"

spark = SparkSession \
    .builder \
    .appName("parse-wikipedia-sql-dumps") \
    .config("spark.executor.memory", MAX_MEMORY) \
    .config("spark.driver.memory", MAX_MEMORY) \
    .config('spark.driver.maxResultSize', MAX_MEMORY) \
    .config('spark.ui.showConsoleProgress', 'false') \
    .getOrCreate()
sc = spark.sparkContext

csv_loader = spark.read.format("csv").options(header='True', inferSchema='True')
parquet_reader = spark.read.format("parquet").options(inferSchema='True')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/28 12:23:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
pageview_complete_per_topic = Path("../nvme/pageview_complete_per_topic")
end_date = datetime.date(2021, 10, 1)

daily_pageview_files = []
for year in [2019]: # 2019, 2020, 2021]:
    daily_range = list(dl.date_range(
        datetime.date(year, 1, 1),
        datetime.date(year, 12, 31),
    ))
    
    daily_range = [d for d in daily_range if (end_date - d).total_seconds() > 0]
    daily_pageview_files += daily_range
    
daily_pageview_files = [
    (
        pageview_complete_processed_src / Path("/".join(dl.wikimedia_pageview_complete_local_file(date, monthly=False))).with_suffix(".parquet"),
        pageview_complete_per_topic_dest / Path("/".join(dl.wikimedia_pageview_complete_local_file(date, monthly=False))).with_suffix(".parquet"),
    )
    for date in daily_pageview_files
]
pprint(daily_pageview_files[:10])

[(PosixPath('../nvme/pageview_complete_processed/2019/2019-01/pageviews-20190101-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2019/2019-01/pageviews-20190101-user.parquet')),
 (PosixPath('../nvme/pageview_complete_processed/2019/2019-01/pageviews-20190102-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2019/2019-01/pageviews-20190102-user.parquet')),
 (PosixPath('../nvme/pageview_complete_processed/2019/2019-01/pageviews-20190103-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2019/2019-01/pageviews-20190103-user.parquet')),
 (PosixPath('../nvme/pageview_complete_processed/2019/2019-01/pageviews-20190104-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2019/2019-01/pageviews-20190104-user.parquet')),
 (PosixPath('../nvme/pageview_complete_processed/2019/2019-01/pageviews-20190105-user.parquet'),
  PosixPath('../nvme/pageview_complete_per_topic/2019/2019-01/pageviews-20190105-user.parquet')),
 (PosixPath('../nvme/page

In [108]:
pageview_complete_per_topic = Path("../nvme/pageview_complete_per_topic")

changepoint = datetime.date(2020, 1, 15)
control_changepoint = changepoint.replace(year=2019)

print(changepoint, changepoint_control)

window_size = relativedelta(days=10)
group = "de"
group = "nl"
# group = "en"

def to_date(d):
    d = pre_day.timetuple()
    return f"{d.tm_year}-{d.tm_mon}-{d.tm_mday}"
    
# load pre data
# pre_days = list(dl.date_range(changepoint - window_size, changepoint))

control_start, control_end = changepoint_control - window_size, changepoint_control + window_size
target_start, target_end = changepoint - window_size, changepoint + window_size

control_days = list(dl.date_range(control_start, control_end))
target_days = list(dl.date_range(target_start, target_end))
all_days = control_days + target_days
# pprint(all_days)

changepoint_df = None
# pprint(pre_days)
for day in all_days:
    data = pageview_complete_per_topic / Path("/".join(dl.wikimedia_pageview_complete_local_file(day, monthly=False))).with_suffix(".parquet")
    df = spark.read.format("parquet").load(str(data / f"group={group}"))
    df = df.filter(F.col("level4_daily_total").isNotNull())
    # df = df.select("topic", "dbname", "wiki_code", "language", "level4_daily_total")
    # df = df.withColumn("date", F.lit(to_date(day)))
    df = df.withColumn("date", F.lit(day))
    
    if changepoint_df is None:
        changepoint_df = df
    else:
        # pre_df = pre_df.join(df, on=["topic", "dbname"], how="outer")
        changepoint_df = changepoint_df.union(df)
        pass
    
    # daily_pageview_combined = daily_pageview_combined.join(prepare(daily_pageview_file, date).repartition("group", "level"), on=group_cols + ["level"], how="outer")

2020-01-15 2019-01-15


In [91]:
# pre_df = pre_df.groupBy("topic", "dbname", "wiki_code", "language").agg(F.sum("level4_daily_total"))
# changepoint_df.show()
# changepoint_df.printSchema()
# start simple, compute the mean pre and post changepoint and take largest relative difference
print(target_start, changepoint)
level = 1

pre_pageviews = changepoint_df \
    .filter((F.lit(target_start) <= F.col("date")) & (F.col("date") <= F.lit(changepoint))) \
    .groupBy("topic", "dbname", "wiki_code", "language") \
    .agg(F.mean(f"level{level}_daily_total").alias("mean_pageviews_pre"))
# pre_pageviews.select("date").distinct().show()
# pre_pageviews.show()

post_pageviews = changepoint_df \
    .filter((F.lit(changepoint) <= F.col("date")) & (F.col("date") <= F.lit(target_end))) \
    .groupBy(f"topic", "dbname", "wiki_code", "language") \
    .agg(F.mean(f"level{level}_daily_total").alias("mean_pageviews_post"))
# post_pageviews.select("date").distinct().show()
# post_pageviews.show()

diff = pre_pageviews.join(post_pageviews, on=["topic", "dbname", "wiki_code", "language"], how="inner")
diff = diff.withColumn('diff', ( F.col("mean_pageviews_post") / F.col("mean_pageviews_pre") ) )
# diff = diff.sort(F.col("diff").desc())
# diff.show()

diff.filter(F.col("mean_pageviews_post") > 1_000).sort(F.col("diff").desc()).show()
# diff.sort(F.col("mean_pageviews_post").desc()).show()
# dd1 = dd1.withColumn('Result', ( dd1['A'] - dd1['B'] ) / dd1['A'] )
# 
# diff = pre_pageviews

# .select("date").distinct().show()
# print(changepoint_df.select("topic").distinct().count())

2020-01-05 2020-01-15
+--------------------+------+------------+--------+------------------+-------------------+------------------+
|               topic|dbname|   wiki_code|language|mean_pageviews_pre|mean_pageviews_post|              diff|
+--------------------+------+------------+--------+------------------+-------------------+------------------+
|      Virus families|dewiki|de.wikipedia|  German|1737.4545454545455|  49305.09090909091| 28.37777312683131|
| Japan culture stubs|dewiki|de.wikipedia|  German| 97.36363636363636|             1854.7| 19.04920634920635|
|  Network management|dewiki|de.wikipedia|  German| 295.8181818181818| 2331.7272727272725| 7.882298709280884|
|            Plateaus|dewiki|de.wikipedia|  German|251.27272727272728| 1201.5454545454545| 4.781837916063676|
|     Animal virology|dewiki|de.wikipedia|  German| 632.0909090909091| 2264.5454545454545| 3.582626204516036|
|             Housing|dewiki|de.wikipedia|  German| 608.7272727272727| 2093.7272727272725|3.439516

In [110]:
# diff in diff
level = 4
level = 1

pre_target_mean = changepoint_df \
    .filter((F.lit(target_start) <= F.col("date")) & (F.col("date") <= F.lit(changepoint))) \
    .fillna(0) \
    .groupBy("topic", "dbname", "wiki_code", "language") \
    .agg(F.mean(f"level{level}_daily_total").alias("pre_target_mean"))

pre_control_mean = changepoint_df \
    .filter((F.lit(control_start) <= F.col("date")) & (F.col("date") <= F.lit(control_changepoint))) \
    .fillna(0) \
    .groupBy("topic", "dbname", "wiki_code", "language") \
    .agg(F.mean(f"level{level}_daily_total").alias("pre_control_mean"))

post_target_mean = changepoint_df \
    .filter((F.lit(changepoint) <= F.col("date")) & (F.col("date") <= F.lit(target_end))) \
    .fillna(0) \
    .groupBy(f"topic", "dbname", "wiki_code", "language") \
    .agg(F.mean(f"level{level}_daily_total").alias("post_target_mean"))

post_control_mean = changepoint_df \
    .filter((F.lit(control_changepoint) <= F.col("date")) & (F.col("date") <= F.lit(control_end))) \
    .fillna(0) \
    .groupBy(f"topic", "dbname", "wiki_code", "language") \
    .agg(F.mean(f"level{level}_daily_total").alias("post_control_mean"))

# post_pageviews.select("date").distinct().show()
# post_pageviews.show()

diff = pre_target_mean.join(pre_control_mean, on=["topic", "dbname", "wiki_code", "language"], how="inner")
diff = diff.join(post_target_mean, on=["topic", "dbname", "wiki_code", "language"], how="inner")
diff = diff.join(post_control_mean, on=["topic", "dbname", "wiki_code", "language"], how="inner")
# .alias("pre_target_mean"), F.round("pre_control_mean"), F.round("post_target_mean"), F.round("post_control_mean"))

# diff = diff.withColumn('diff', ( F.col("mean_pageviews_post") / F.col("mean_pageviews_pre") ) )
diff = diff.withColumn('diff', ( F.col("post_target_mean") / F.col("pre_target_mean") ) )
diff = diff.withColumn('control_diff', ( F.col("pre_control_mean") - F.col("post_control_mean") ) )
diff = diff.withColumn('target_diff', ( F.col("post_target_mean") - (F.col("pre_target_mean") + F.col("control_diff")) ) )

diff.sort(F.col("target_diff").desc()).select(
    "topic", "dbname", "wiki_code", "language",
    F.round("target_diff", 0).alias("target_diff"),
    F.round("pre_target_mean", 0).alias("pre_target_mean"),
    F.round("pre_control_mean", 0).alias("pre_control_mean"),
    F.round("post_target_mean", 0).alias("post_target_mean"),
    F.round("post_control_mean", 0).alias("post_control_mean"),
).show()

diff.sort(F.col("diff").desc()).select(
    "topic", "dbname", "wiki_code", "language",
    F.round("diff", 0).alias("diff"),
    F.round("pre_target_mean", 0).alias("pre_target_mean"),
    # F.round("pre_control_mean", 0).alias("pre_control_mean"),
    F.round("post_target_mean", 0).alias("post_target_mean"),
    # F.round("post_control_mean", 0).alias("post_control_mean"),
).show()

# diff.filter(F.col("mean_pageviews_post") > 1_000).sort(F.col("diff").desc()).show()
# diff.sort(F.col("mean_pageviews_post").desc()).show()
# dd1 = dd1.withColumn('Result', ( dd1['A'] - dd1['B'] ) / dd1['A'] )
# 
# diff = pre_pageviews

# .select("date").distinct().show()
# print(changepoint_df.select("topic").distinct().count())

+--------------------+------+------------+--------+-----------+---------------+----------------+----------------+-----------------+
|               topic|dbname|   wiki_code|language|target_diff|pre_target_mean|pre_control_mean|post_target_mean|post_control_mean|
+--------------------+------+------------+--------+-----------+---------------+----------------+----------------+-----------------+
|     Animal virology|nlwiki|nl.wikipedia|   Dutch|    28434.0|          332.0|           170.0|         28774.0|            162.0|
|      United kingdom|nlwiki|nl.wikipedia|   Dutch|     1848.0|         5538.0|          6401.0|          4807.0|           8980.0|
|             Alberta|nlwiki|nl.wikipedia|   Dutch|     1360.0|          398.0|           322.0|           427.0|           1653.0|
|      Euroscepticism|nlwiki|nl.wikipedia|   Dutch|     1307.0|          665.0|          1655.0|           569.0|           3057.0|
|            Villages|nlwiki|nl.wikipedia|   Dutch|     1169.0|         1839

In [60]:
pageview_complete_per_topic = Path("../nvme/pageview_complete_per_topic")
group = "nl"

days_2019 = list(dl.date_range(control_start, control_end))
target_days = list(dl.date_range(target_start, target_end))
all_days = list(dl.date_range(datetime.date(2019, 1, 1), datetime.date(2019, 2, 1))) + list(dl.date_range(datetime.date(2020, 1, 1), datetime.date(2020, 2, 1)))

changepoint_df = None
for day in all_days:
    data = pageview_complete_per_topic / Path("/".join(dl.wikimedia_pageview_complete_local_file(day, monthly=False))).with_suffix(".parquet")
    df = spark.read.format("parquet").load(str(data / f"group={group}"))
    df = df.filter(F.col("level4_daily_total").isNotNull())
    df = df.select("topic", "dbname", "wiki_code", "language", "level1_daily_total", "level2_daily_total", "level3_daily_total", "level4_daily_total")
    df = df.withColumn("date", F.lit(day))
    
    if changepoint_df is None:
        changepoint_df = df
    else:
        changepoint_df = changepoint_df.union(df)
        pass
    
# changepoint_df.limit(10).show()
changepoint_df.write.csv("../nvme/yannick-nl.csv")