In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import bz2
import csv
import io
import json
import re
import time
import random
import requests
import datetime
from pathlib import Path
from pprint import pprint
from typing import List, Dict
from dateutil.relativedelta import relativedelta
import lsde2021.csv as csvutils
import lsde2021.utils as utils
import lsde2021.download as dl
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [3]:
# Single memory budget applied to the executor, the driver and the
# driver's result-size limit.
MAX_MEMORY = "30G"

# Build (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config('spark.driver.maxResultSize', MAX_MEMORY)
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)
sc = spark.sparkContext

# Pre-configured readers for CSV (with header + schema inference) and parquet.
csv_loader = (
    spark.read.format("csv")
    .option("header", 'True')
    .option("inferSchema", 'True')
)
parquet_reader = spark.read.format("parquet").option("inferSchema", 'True')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/28 14:05:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/10/28 14:05:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/10/28 14:05:11 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# Language lookup table: one row per wiki, extended with the "<code>.wikipedia"
# identifier and reduced to the columns used for joining.
languages = (
    spark.read.format("csv")
    .options(header='True')
    .load("./data/languages.csv")
    .withColumn("wiki_code", F.concat(F.col("code"), F.lit(".wikipedia")))
    .select(F.col("name").alias("language"), "dbname", "group", "code", "wiki_code")
)
languages.limit(100).show()

# Flatten the single-column rows into a plain Python list of database names.
language_dbnames = (
    languages
    .select(F.col("dbname"))
    .distinct()
    .rdd
    .flatMap(lambda row: row)
    .collect()
)
print(language_dbnames)

+-----------------+-------+-----+----+-------------+
|         language| dbname|group|code|    wiki_code|
+-----------------+-------+-----+----+-------------+
|           Arabic| arwiki|   ar|  ar| ar.wikipedia|
|  Moroccan Arabic|arywiki|   ar| ary|ary.wikipedia|
|  Egyptian Arabic|arzwiki|   ar| arz|arz.wikipedia|
|      Azerbaijani| azwiki|   az|  az| az.wikipedia|
|South Azerbaijani|azbwiki|   az| azb|azb.wikipedia|
|        Bulgarian| bgwiki|   bg|  bg| bg.wikipedia|
|          Bosnian| bswiki|   bs|  bs| bs.wikipedia|
|          Catalan| cawiki|   ca|  ca| ca.wikipedia|
|            Czech| cswiki|   cs|  cs| cs.wikipedia|
|           Danish| dawiki|   da|  da| da.wikipedia|
|           German| dewiki|   de|  de| de.wikipedia|
|            Greek| elwiki|   el|  el| el.wikipedia|
|          English| enwiki|   en|  en| en.wikipedia|
|          Spanish| eswiki|   es|  es| es.wikipedia|
|         Estonian| etwiki|   et|  et| et.wikipedia|
|          Finnish| fiwiki|   fi|  fi| fi.wiki

In [5]:
# Schema describing the topics parquet: a page id plus four
# non-nullable string-array columns topics1..topics4.
# Note: the schema is declared here but not passed to the reader below.
raw_english_topics_schema = T.StructType(
    [T.StructField('page_id', T.IntegerType(), False)]
    + [
        T.StructField(f'topics{level}', T.ArrayType(T.StringType()), False)
        for level in range(1, 5)
    ]
)


raw_english_topics = (
    spark.read
    .format("parquet")
    .load("../nvme/en_topics/topics_final.parquet")
)
raw_english_topics.limit(20).show()
print(
    "page ids with topics:",
    raw_english_topics.select("page_id").distinct().count(),
)

+-------+--------------------+--------------------+--------------------+--------------------+
|page_id|             topics1|             topics2|             topics3|             topics4|
+-------+--------------------+--------------------+--------------------+--------------------+
|1467395|[Argonaut games g...|[Bandai namco gam...|[Computer-related...|[Fiction-writing ...|
|1467417|[Place name disam...|[Wikipedia disamb...|[Wikipedia conten...|[Wikipedia admini...|
|1467429|[History, Norther...|[Ireland, Unionis...|[United kingdom, ...|[Towns, Villages,...|
|1467430|[Female character...|[Introduction, Te...|[Occupation, Fict...|[American film st...|
|1467440|[Tunnels complete...|[Tunnels, Complet...|[Transport buildi...|[Transport buildi...|
|1467444|[Indie rock music...|[Rock music group...|[Establishment, P...|[Music companies,...|
|1467450|[Dc comics extrat...|[Extraterrestrial...|[Comics character...|[Multiple births,...|
|1467451|[Lists, Germany, ...|[Political office...|[Legislat

In [6]:
# Target layout of the `page` table; every column is nullable.
# Note: the schema is declared here but not passed to the reader below.
page_schema = T.StructType([
    T.StructField(column, dtype, True)
    for column, dtype in [
        ("page_id", T.IntegerType()),
        ("page_namespace", T.IntegerType()),
        ("page_title", T.StringType()),
        ("page_restrictions", T.StringType()),
        ("page_is_redirect", T.BooleanType()),
        ("page_is_new", T.BooleanType()),
        ("page_random", T.FloatType()),
        ("page_touched", T.TimestampType()),
        ("page_links_updated", T.TimestampType()),
        ("page_latest", T.StringType()),
        ("page_len", T.IntegerType()),
        ("page_content_model", T.StringType()),
        ("page_lang", T.StringType()),
    ]
])

raw_english_pages = (
    spark.read.format("parquet")
    .options(inferSchema='True')
    .option("mergeSchema", "true")
    .load("../nvme/wikipedia_sql_dumps/enwiki/20211001/enwiki-20211001-page.sql.parquet")
)

# Cast the raw columns to booleans / numbers and parse the
# yyyyMMddHHmmss timestamps.
english_pages = (
    raw_english_pages
    .withColumn("page_is_redirect", F.col("page_is_redirect").cast(T.BooleanType()))
    .withColumn("page_is_new", F.col("page_is_new").cast(T.BooleanType()))
    .withColumn("page_random", F.col("page_random").cast(T.FloatType()))
    .withColumn("page_touched", F.to_timestamp("page_touched", 'yyyyMMddHHmmss'))
    .withColumn("page_links_updated", F.to_timestamp("page_links_updated", 'yyyyMMddHHmmss'))
    .withColumn("page_len", F.col("page_len").cast(T.IntegerType()))
)
english_pages.limit(20).show()

+-------+--------------+--------------------+-----------------+----------------+-----------+-----------+-------------------+-------------------+-----------+--------+------------------+---------+
|page_id|page_namespace|          page_title|page_restrictions|page_is_redirect|page_is_new|page_random|       page_touched| page_links_updated|page_latest|page_len|page_content_model|page_lang|
+-------+--------------+--------------------+-----------------+----------------+-----------+-----------+-------------------+-------------------+-----------+--------+------------------+---------+
|1874202|             0|            Freedows|             null|           false|      false| 0.12535934|2021-09-29 09:55:10|2021-09-10 09:04:19|  932832600|     171|          wikitext|     NULL|
|1874204|             3|         70.48.68.86|             null|           false|       true| 0.48354313|2013-08-29 08:44:30|2019-08-14 12:08:06|   17510721|      94|          wikitext|     NULL|
|1874206|             0| 

In [7]:
# Full outer join of pages and topics on page_id, so rows present on only
# one side are kept (with nulls for the other side's columns).
# NOTE(review): this cell rebinds `english_pages`, so re-running it joins
# the topic columns a second time.
english_pages = english_pages.join(
    other=raw_english_topics,
    on="page_id",
    how="outer",
)
english_pages.limit(20).show()

+-------+--------------+--------------------+-----------------+----------------+-----------+-----------+-------------------+-------------------+-----------+--------+------------------+---------+--------------------+--------------------+--------------------+--------------------+
|page_id|page_namespace|          page_title|page_restrictions|page_is_redirect|page_is_new|page_random|       page_touched| page_links_updated|page_latest|page_len|page_content_model|page_lang|             topics1|             topics2|             topics3|             topics4|
+-------+--------------+--------------------+-----------------+----------------+-----------+-----------+-------------------+-------------------+-----------+--------+------------------+---------+--------------------+--------------------+--------------------+--------------------+
|    737|             0|         Afghanistan|             null|           false|      false|  0.9998048|2021-10-01 05:54:07|2021-10-01 06:03:57| 1047511396|  26136

In [15]:
# Report how many pages lack topics, overall and for the
# namespace-0 / non-redirect subset.
print(
    "total page ids",
    english_pages.select("page_id").distinct().count(),
)

# Keep only non-redirect pages in namespace 0.
english_pages_ns0_non_redirect = (
    english_pages
    .filter(F.col("page_is_redirect") == 0)
    .filter(F.col("page_namespace") == 0)
)
print(
    "total page ids in namespace 0 which are not redirects",
    english_pages_ns0_non_redirect.select("page_id").distinct().count(),
)
print(
    "page ids without topics:",
    english_pages.where(F.col("topics1").isNull()).count(),
)
print(
    "page ids in namespace 0 without topics:",
    english_pages_ns0_non_redirect.where(F.col("topics1").isNull()).count(),
)

total page ids 54286374
total page ids in namespace 0 which are not redirects 5998354
page ids without topics: 48287971
page ids in namespace 0 without topics: 73


In [16]:
# Keep only the namespace-0, non-redirect pages for which all four
# topic columns are populated.
english_pages_with_topics = (
    english_pages_ns0_non_redirect
    .filter(F.col("topics1").isNotNull())
    .filter(F.col("topics2").isNotNull())
    .filter(F.col("topics3").isNotNull())
    .filter(F.col("topics4").isNotNull())
)
print("writing %d pages with topics" % (english_pages_with_topics.count()))
# english_pages_with_topics.write.format("parquet").mode("overwrite").save(f"../nvme/wikipedia_sql_dumps/enwiki/20211001/enwiki-20211001-page-topics-ns0-nonredirect.sql.parquet")

writing 5998281 pages with topics


In [12]:
# Layout of a per-wiki langlinks dump: source page, target language, target title.
langlinks_schema = T.StructType([
    T.StructField(column, dtype, True)
    for column, dtype in [
        ("page_id", T.IntegerType()),
        ("lang", T.StringType()),
        ("lang_title", T.StringType()),
    ]
])

# For every wiki, load its langlinks, keep only the links that point to the
# English edition, tag them with the source wiki and stack them together.
en_langlinks = None
for dbname in language_dbnames:
    langlinks = (
        spark.read.format("parquet")
        .schema(langlinks_schema)
        .load(f"../nvme/wikipedia_sql_dumps/{dbname}/20211001/{dbname}-20211001-langlinks.sql.parquet")
        .filter(F.col("lang") == "en")
        .withColumn('dbname', F.lit(dbname))
    )
    en_langlinks = langlinks if en_langlinks is None else en_langlinks.union(langlinks)

# Drop links without a title and expose the title as `en_title`.
en_langlinks = (
    en_langlinks
    .filter(F.col("lang_title").isNotNull())
    .select("page_id", 'dbname', F.col("lang_title").alias("en_title"))
)
en_langlinks.limit(20).show()
# Shows the (possibly empty) set of links whose source wiki is enwiki itself.
en_langlinks.filter(F.col("dbname") == "enwiki").limit(20).show()

+-------+------+--------------------+
|page_id|dbname|            en_title|
+-------+------+--------------------+
|1093372|cswiki|                 !!!|
|1027440|cswiki|                   @|
| 726217|cswiki|Country Johnny Ma...|
|   4298|cswiki|Hello, World! pro...|
| 844052|cswiki|Heroes (David Bow...|
|1217305|cswiki|Heroes (David Bow...|
| 236777|cswiki|       I AM Activity|
|  68142|cswiki|The Spaghetti Inc...|
| 277060|cswiki|   Weird Al Yankovic|
| 616583|cswiki|     $pringfield (or|
|1299039|cswiki|                 &Me|
|1521485|cswiki|                 '39|
|1435359|cswiki|             '64–'95|
| 231208|cswiki|       '74 Jailbreak|
|  61932|cswiki|   '98 Live Meltdown|
| 612231|cswiki|             'Akbara|
| 325536|cswiki|      'Alawi dynasty|
| 108418|cswiki|        'Allo 'Allo!|
| 481526|cswiki|      'Asir Province|
|1186212|cswiki|'Deed I Do (Matt ...|
+-------+------+--------------------+

+-------+------+--------+
|page_id|dbname|en_title|
+-------+------+--------+
+-------+

In [39]:
# en_langlinks.write.format("parquet").mode("overwrite").save("../nvme/wikipedia_sql_dumps/en_langlinks.parquet")

In [8]:
# Source (compressed daily dumps) and destination (processed parquet) roots.
pageview_complete_src = Path("../hdd/pageview_complete")
pageview_complete_dest = Path("../nvme/pageview_complete_processed")
# Only days strictly before this date are processed.
end_date = datetime.date(2021, 10, 1)

# Step 1: collect the days to process.
daily_pageview_files = []
for year in [2018]:  # 2019, 2020, 2021]:
    daily_range = [
        d
        for d in dl.date_range(datetime.date(year, 1, 1), datetime.date(year, 12, 31))
        if d < end_date
    ]
    daily_pageview_files.extend(daily_range)

# Step 2: turn each day into an (input file, output parquet path) pair; both
# share the same relative path, the output just swaps the suffix.
daily_pageview_files = [
    (
        pageview_complete_src / relative_path,
        pageview_complete_dest / relative_path.with_suffix(".parquet"),
    )
    for relative_path in (
        Path("/".join(dl.wikimedia_pageview_complete_local_file(date, monthly=False)))
        for date in daily_pageview_files
    )
]
pprint(daily_pageview_files[:10])

[(PosixPath('../hdd/pageview_complete/2018/2018-01/pageviews-20180101-user.bz2'),
  PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180101-user.parquet')),
 (PosixPath('../hdd/pageview_complete/2018/2018-01/pageviews-20180102-user.bz2'),
  PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180102-user.parquet')),
 (PosixPath('../hdd/pageview_complete/2018/2018-01/pageviews-20180103-user.bz2'),
  PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180103-user.parquet')),
 (PosixPath('../hdd/pageview_complete/2018/2018-01/pageviews-20180104-user.bz2'),
  PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180104-user.parquet')),
 (PosixPath('../hdd/pageview_complete/2018/2018-01/pageviews-20180105-user.bz2'),
  PosixPath('../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180105-user.parquet')),
 (PosixPath('../hdd/pageview_complete/2018/2018-01/pageviews-20180106-user.bz2'),
  PosixPath('../nv

In [9]:
# Maps a lowercase letter to its position in the alphabet (a -> 0 ... z -> 25).
# Only the first 24 positions (a..x) are valid hours.
alphabet = {c: i for i, c in enumerate("abcdefghijklmnopqrstuvwxyz")}
assert len(alphabet) == 26

def hourly_pageviews_handler(s):
    """Decode a letter-coded hourly pageview string into 24 integer counts.

    Hours 0 to 23 are written as letters (0 = A, 1 = B ... 22 = W, 23 = X)
    followed by the count for that hour, e.g. ``"F1I1"``.

    Args:
        s: the encoded string, or a missing value (``None`` or a float NaN).

    Returns:
        A list of 24 ints, index = hour; hours not mentioned are 0.

    Raises:
        KeyError: if a token in front of a count is not a single letter.
        IndexError: if the letter is beyond ``x`` (there are only 24 hours).
        TypeError: if ``s`` is neither a string nor a missing value.
    """
    ans = [0] * 24
    # Treat None and *any* float NaN as missing. Comparing by value (NaN != NaN)
    # instead of identity with np.nan also catches NaN objects such as
    # float("nan"), which would otherwise crash in re.sub below.
    if s is None or (isinstance(s, float) and s != s):
        return ans
    # Raw strings avoid the invalid "\s" / "\d" escape sequences. The character
    # class removes whitespace as well as literal "+" characters.
    s = re.sub(r'[\s+]', '', s)
    # Splitting on a captured digit run yields [letter, count, letter, count, ..., tail].
    parts = re.split(r'(\d+)', s)
    for i in range(0, len(parts) - 1, 2):
        ans[alphabet[parts[i].lower()]] = int(parts[i + 1])
    return ans

def hourly_coding(**coded):
    """Build the 24-slot hourly count list from ``LETTER=count`` keyword arguments.

    This is a best-effort helper: entries whose key is not a valid hour letter
    or whose value cannot be converted with ``int()`` are skipped.

    Returns:
        A list of 24 ints, index = hour.
    """
    ans = [0] * 24
    for c, val in coded.items():
        try:
            ans[alphabet[c.lower()]] = int(val)
        except (KeyError, IndexError, TypeError, ValueError, OverflowError):
            # Unknown letter, hour out of range, or non-numeric count: skip it.
            continue
    return ans

assert hourly_pageviews_handler("F234I12") == hourly_coding(F=234, I=12)

# Spark UDF wrapper around hourly_pageviews_handler, returning an array of integers.
hourly_pageviews_udf = F.udf(
    f=hourly_pageviews_handler,
    returnType=T.ArrayType(elementType=T.IntegerType()),
)

In [None]:
# Column layout of one line of a daily pageview dump (space separated, no header).
pageview_schema = T.StructType([
    T.StructField("wiki_code", T.StringType(), True),
    T.StructField("page_title",T.StringType(), True),
    T.StructField("page_id", T.IntegerType(), True),
    T.StructField("user_client", T.StringType(), True),
    T.StructField("daily_total", T.IntegerType(), True),
    T.StructField("hourly_count", T.StringType(), True),
])

# For every daily dump: aggregate the views per page, attach the language
# metadata, resolve the English title / page id / topics and write the result
# as parquet partitioned by language group.
for daily_file, daily_processed_output_file in daily_pageview_files:
    start = time.time()

    df = spark.read.format("csv").options(delimiter=" ", header="false").schema(pageview_schema).load(str(daily_file))

    # combine daily pageviews for different user clients
    df = (
        df
        .filter((F.col("page_id").isNotNull()) & (F.col("wiki_code").isNotNull()))
        .groupBy(["page_id", "page_title", "wiki_code"])
        .agg(F.sum("daily_total").alias("daily_total"))
    )

    # keep only the wikis listed in `languages` and attach dbname / group / code
    df = df.join(languages, on="wiki_code", how="inner")

    # join the english lang title
    # A LEFT join is used on purpose: an outer join would also keep langlinks
    # entries that have no pageviews on this day, and those rows would reach the
    # output with null wiki_code / daily_total / group (the null filter above
    # shows such rows are not wanted, and `group` is the partition column).
    df = df.join(en_langlinks, on=["page_id", "dbname"], how="left")

    # English pages are their own English title; every other wiki uses the
    # title from its langlink. Pages without any English title are dropped.
    df = df.withColumn(
        "en_title",
        F.when(F.col("dbname") == "enwiki", F.col("page_title")).otherwise(F.col("en_title")),
    )
    df = df.filter(F.col("en_title").isNotNull())

    # join the english page id using the english page title
    # NOTE(review): the langlinks titles shown earlier contain spaces; if the
    # page table stores titles with underscores, multi-word titles will not
    # match here — confirm how both sides are normalised.
    df = df.join(
        english_pages_with_topics.select(
            F.col("page_title").alias("en_title"),
            F.col("page_id").alias("en_page_id"),
            "topics1",
            "topics2",
            "topics3",
            "topics4",
        ),
        on="en_title",
        how="inner",
    )

    # write out to parquet file, partitioned by the language group
    df.write.format("parquet").mode("overwrite").partitionBy("group").save(str(daily_processed_output_file))
    print("wrote %s in %.2f minutes" % (daily_processed_output_file, (time.time() - start) / 60))

    # the only thing missing now is high level categories from ORES and maybe in a second run custom category mappings

wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180101-user.parquet in 1.75 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180102-user.parquet in 1.70 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180103-user.parquet in 1.73 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180104-user.parquet in 1.62 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180105-user.parquet in 1.58 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180106-user.parquet in 1.72 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180107-user.parquet in 1.66 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180108-user.parquet in 1.67 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180109-user.parquet in 1.70 minutes
wrote ../nvme/pageview_complete_processed/2018/2018-01/pageviews-20180110

In [None]:
# Collect the first day of every month of the selected years that lies
# strictly before end_date.
expected_monthly_downloaded = []
for year in [2018, 2019, 2020, 2021]:
    monthly_range = [
        d
        for d in dl.date_range(
            datetime.date(year, 1, 1),
            datetime.date(year, 12, 1),
            interval=relativedelta(months=+1),
        )
        if d < end_date
    ]
    expected_monthly_downloaded.extend(monthly_range)

    print(f"processing {len(monthly_range)} months for year {year} ...")

In [None]:
# Relative file path of the daily dump for every expected day.
# NOTE(review): `expected_daily_downloaded` is not defined in any cell visible
# here — confirm where it is supposed to come from before running this cell.
expected_daily_downloaded_filenames = list(
    Path("/".join(path_parts))
    for path_parts in (
        dl.wikimedia_pageview_complete_local_file(date, monthly=False)
        for date in expected_daily_downloaded
    )
)
pprint(expected_daily_downloaded_filenames[:10])