In [3]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import bz2
import csv
import io
import json
import time
import urllib
import random
import requests
import concurrent.futures
import pickle as pkl
import numpy as np
from pathlib import Path
from pprint import pprint
from typing import List, Dict
import lsde2021.csv as csvutil
import lsde2021.utils as utils
import lsde2021.download as dl
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# One memory budget string, reused for the executor heap, the driver heap and
# the maximum size of results collected back to the driver.
MAX_MEMORY = "60G"

# Build (or re-attach to) the Spark session used by the whole notebook.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config('spark.driver.maxResultSize', MAX_MEMORY)
    .config('spark.dynamicAllocation.maxExecutors', 4)
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)
sc = spark.sparkContext

# Pre-configured readers: CSV with a header row and schema inference, and parquet.
csv_loader = spark.read.format("csv").options(header='True', inferSchema='True')
parquet_reader = spark.read.format("parquet").options(inferSchema='True')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/22 19:45:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Wiki whose 2021-10-01 `page` table dump (already converted to parquet) is loaded.
wiki = "enwiki"
pages = parquet_reader.load(
    f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page.sql.parquet"
)
# Preview the first ten rows to check the columns look as expected.
pages.limit(10).show()

+-------+--------------+--------------------+-----------------+----------------+-----------+--------------+--------------+------------------+-----------+--------+------------------+---------+
|page_id|page_namespace|          page_title|page_restrictions|page_is_redirect|page_is_new|   page_random|  page_touched|page_links_updated|page_latest|page_len|page_content_model|page_lang|
+-------+--------------+--------------------+-----------------+----------------+-----------+--------------+--------------+------------------+-----------+--------+------------------+---------+
|1874202|             0|            Freedows|             null|               0|          0|0.125359336589|20210929095510|    20210910090419|  932832600|     171|          wikitext|     NULL|
|1874204|             3|         70.48.68.86|             null|               0|          1|0.483543119631|20130829084430|    20190814120806|   17510721|      94|          wikitext|     NULL|
|1874206|             0|             Yun

In [6]:
# Collect every distinct, non-null page id in namespace 0 to the driver as a
# sorted list of Python ints.
en_page_ids = (
    pages
    .filter(F.col("page_id").isNotNull() & (F.col("page_namespace") == 0))
    .select("page_id")
    .distinct()
    .withColumn("page_id", F.col("page_id").cast(T.IntegerType()))
    .sort('page_id', ascending=True)
    # each Row holds a single column, so flattening the rows yields the bare ids
    .rdd.flatMap(lambda row: row)
    .collect()
)

In [8]:
# Sanity check: how many ids were collected, what the first ones look like,
# and that they really are Python ints.
print(len(en_page_ids))
pprint(en_page_ids[:20])
assert isinstance(en_page_ids[0], int)

# Output directory for everything derived from the ORES topic lookups.
ores_topics_dir = Path("../nvme/ores_topics")
ores_topics_dir.mkdir(parents=True, exist_ok=True)

# Persist the sorted id list so later runs can reuse it without Spark.
with (ores_topics_dir / "en_page_ids_sorted.pkl").open('wb') as f:
    pkl.dump(en_page_ids, f, protocol=pkl.HIGHEST_PROTOCOL)

16123981
[10, 12, 13, 14, 15, 18, 19, 20, 21, 23, 24, 25, 27, 29, 30, 35, 36, 39, 40, 42]


In [23]:
def get_page_rev_ids(page_ids: List[int], timeout: float = 60.0) -> Dict[int, int]:
    """Look up one revision id per page id via the English Wikipedia API.

    Sends a single ``action=query&prop=revisions`` request to
    ``https://en.wikipedia.org/w/api.php`` for all ``page_ids`` and reads the
    ``revid`` of the first revision listed for every page in the response.

    Args:
        page_ids: Page ids to look up in one request.
            NOTE(review): the API caps how many ids one request may carry
            (callers in this notebook send batches of 50) -- confirm the limit
            before sending larger batches.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        Mapping ``page id -> revision id`` (both ``int``). Pages without a
        ``revisions`` entry, or whose first revision has no usable ``revid``,
        are left out. An empty ``page_ids`` yields an empty dict without any
        network traffic.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx status (via ``raise_for_status``).
        ValueError: if the response body is not valid JSON.
    """
    revids: Dict[int, int] = {}
    if not page_ids:
        # nothing to ask for -- avoid a pointless request
        return revids

    # Let requests build and escape the query string instead of formatting the URL by hand.
    params = {
        "action": "query",
        "prop": "revisions",
        "pageids": "|".join(map(str, page_ids)),
        "format": "json",
    }
    with requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=timeout) as r:
        r.raise_for_status()
        content = json.loads(r.content)

    query = content.get("query") if isinstance(content, dict) else None
    pages = query.get("pages") if isinstance(query, dict) else None
    if not isinstance(pages, dict):
        return revids

    for page_id, metadata in pages.items():
        revisions = metadata.get("revisions") if isinstance(metadata, dict) else None
        if not revisions:
            continue
        revid = revisions[0].get("revid")
        if revid is None:
            # previously int(None) raised a TypeError that a blanket except hid
            continue
        try:
            revids[int(page_id)] = int(revid)
        except (TypeError, ValueError):
            # best effort: skip entries whose ids are not integer-like
            continue
    return revids
    
# Smoke test against the live API with two known page ids.
pprint(get_page_rev_ids(page_ids=[604727, 604728]))

{604727: 1050929646, 604728: 16433551}


In [24]:
class ORESException(ValueError):
    """Error raised when an ORES response carries a top-level ``error`` entry."""

    def __init__(self, message):
        # Hand the single message straight to ValueError; no extra state is kept.
        ValueError.__init__(self, message)

def get_ores_articletopics(context: str, models: List[str], rev_ids: List[int], timeout: float = 60.0) -> List[Dict]:
    """Fetch ORES score documents for a batch of revisions.

    Issues one GET to ``https://ores.wikimedia.org/v3/scores/<context>/`` with
    the revision ids and model names joined by ``|``.

    Args:
        context: Wiki identifier used both in the URL path and as the top-level
            key of the response (e.g. ``"enwiki"``).
        models: Model names to request (e.g. ``["articletopic"]``).
        rev_ids: Revision ids to score; ints or numeric strings both work
            because each id is passed through ``str``.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list with one entry per element of ``rev_ids``, in the same order;
        each entry is ``content[context]["scores"][str(rev_id)]`` from the
        response. An empty ``rev_ids`` returns ``[]`` without any request.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx status (via ``raise_for_status``).
        ORESException: if the response contains a top-level ``error``.
        KeyError: if the response lacks the context or one of the revisions.
    """
    rev_ids = list(rev_ids)
    if not rev_ids:
        # nothing to score -- skip the round trip entirely
        return []

    url = "https://ores.wikimedia.org/v3/scores/{0}/".format(urllib.parse.quote(context))

    # requests percent-encodes params itself; quoting the model names here as
    # well would double-encode any special character.
    params = {'revids': "|".join(str(rid) for rid in rev_ids),
              'models': "|".join(models)}

    headers = {"User-Agent": random.choice(dl.USER_AGENTS)}
    with requests.get(url, params=params, headers=headers, timeout=timeout) as r:
        r.raise_for_status()
        content = json.loads(r.content)

    if 'error' in content:
        raise ORESException(content['error'])
    for warning in content.get('warnings', []):
        print(warning)

    scores = content[context]['scores']
    return [scores[str(rev_id)] for rev_id in rev_ids]
    
# Smoke test against the live ORES service; the cell displays the returned score documents.
get_ores_articletopics("enwiki", ["articletopic"], ["1050929646"])

[{'articletopic': {'score': {'prediction': ['Culture.Food and drink'],
    'probability': {'Culture.Biography.Biography*': 0.047557313869491004,
     'Culture.Biography.Women': 0.0031489940080748884,
     'Culture.Food and drink': 0.9971147752416152,
     'Culture.Internet culture': 0.00437180839799476,
     'Culture.Linguistics': 0.0017995195188879982,
     'Culture.Literature': 0.008673609494913475,
     'Culture.Media.Books': 0.002224219766073729,
     'Culture.Media.Entertainment': 0.0025838345777250632,
     'Culture.Media.Films': 0.000669437334015632,
     'Culture.Media.Media*': 0.01517950066651844,
     'Culture.Media.Music': 0.0008790073559047007,
     'Culture.Media.Radio': 5.577500110109286e-05,
     'Culture.Media.Software': 0.0035506169388284377,
     'Culture.Media.Television': 0.0005666059572808978,
     'Culture.Media.Video games': 0.0001700071583484312,
     'Culture.Performing arts': 0.0012035972447950057,
     'Culture.Philosophy and religion': 0.02743998519166171,
 

In [28]:
# Module-level knobs read by get_ores_articletopics_for_page_ids at call time.
min_prob=0.6    # a topic is kept only if its probability is strictly above this
max_topics=5    # at most this many topics are stored per page
retries=10      # attempts per batch before the batch is abandoned

def get_ores_articletopics_for_page_ids(page_ids, batch_size=50, retry_delay=1.0):
    """Resolve page ids to revision ids and fetch their ORES article topics.

    The ids are processed in consecutive batches of ``batch_size``. For each
    batch the revision ids are looked up with ``get_page_rev_ids`` and scored
    with ``get_ores_articletopics`` (context ``"enwiki"``, model
    ``"articletopic"``). A batch is repeated, up to the module-level
    ``retries`` times, until at least one page in it yielded a topic list;
    between attempts the function sleeps ``retry_delay`` seconds so a failing
    endpoint is not hit in a tight loop.

    Args:
        page_ids: Sliceable sequence of page ids.
        batch_size: Number of page ids per request (must be >= 1).
        retry_delay: Seconds to wait before repeating a failed batch.

    Returns:
        ``(all_rev_ids, all_topics)`` where ``all_rev_ids`` maps page id to the
        revision id that was scored and ``all_topics`` maps page id to the
        topic names whose probability exceeds ``min_prob``, ordered by
        descending probability and truncated to ``max_topics`` entries.
        Pages whose batch never succeeded are missing from ``all_topics``.
    """
    all_rev_ids = dict()
    all_topics = dict()
    for i in range(0, len(page_ids), batch_size):
        batch = page_ids[i:i + batch_size]
        attempts, success = 0, False
        while not success and attempts < retries:
            attempts += 1
            try:
                # materialise once so ids and scores are zipped from the same ordering
                rev_ids = list(get_page_rev_ids(batch).items())
                scores = list(get_ores_articletopics(context="enwiki", models=["articletopic"], rev_ids=[rid for _, rid in rev_ids]))
                for (page_id, rev_id), score in zip(rev_ids, scores):
                    all_rev_ids[page_id] = rev_id
                    if "articletopic" not in score:
                        continue
                    response = score["articletopic"]
                    if "error" in response or "score" not in response:
                        print("bad response", response)
                        continue
                    if "probability" not in response["score"]:
                        continue
                    ranked = sorted(response["score"]["probability"].items(), key=lambda t: t[1], reverse=True)
                    all_topics[page_id] = [t for t, prob in ranked if prob > min_prob][:max_topics]
                    success = True
            except Exception as e:
                # best effort: report and retry rather than abort the whole worker
                print("error", e)
            if not success and attempts < retries:
                # back off before hitting the services again
                time.sleep(retry_delay)
        if not success:
            print("giving up on batch starting at index", i, "after", attempts, "attempts")
    return all_rev_ids, all_topics

In [None]:
# %%time
# Fetch ORES topics for the window en_page_ids[start:start+count], split
# evenly across `n_parallel` worker processes, then pickle the merged results.
n_parallel = 2
start = 0 * 100_000
count = 100_000
# count = 100
chunk_size = int(np.ceil(count / n_parallel))
# Exclusive end of this window. Clamping every worker slice to it keeps the
# last worker from spilling into the next window when `count` is not a
# multiple of `n_parallel` (ceil() rounds the chunk size up).
stop = min(start + count, len(en_page_ids))
all_revids = dict()
all_topics = dict()
procs = []

with concurrent.futures.ProcessPoolExecutor(max_workers=n_parallel) as executor:
    for worker_id in range(n_parallel):
        chunk_start = start + worker_id * chunk_size
        chunk_stop = min(chunk_start + chunk_size, stop)
        worker_page_ids = en_page_ids[chunk_start:chunk_stop]
        print(worker_page_ids[:10])
        print("worker %d got assigned %d page ids" % (worker_id, len(worker_page_ids)))
        procs.append(executor.submit(get_ores_articletopics_for_page_ids, worker_page_ids))

# collect the results (leaving the `with` block above waits for all workers)
for i, proc in enumerate(procs):
    cur_revids, cur_topics = proc.result()
    all_revids.update(cur_revids)
    all_topics.update(cur_topics)
    print("worker %d done" % i)

# save result to pickle
revids_file = ores_topics_dir / ("revids_%d_to_%d.pkl" % (start, start+count))
topics_file = ores_topics_dir / ("topics_%d_to_%d.pkl" % (start, start+count))
print(revids_file)
print(topics_file)

with open(revids_file, 'wb') as f:
    pkl.dump(all_revids, f, protocol=pkl.HIGHEST_PROTOCOL)
with open(topics_file, 'wb') as f:
    pkl.dump(all_topics, f, protocol=pkl.HIGHEST_PROTOCOL)

print(len(all_revids))
# https://ores.wikimedia.org/v3/scores/enwiki?models=articletopic&revids=421063984

[10, 12, 13, 14, 15, 18, 19, 20, 21, 23]
worker 0 got assigned 50000 page ids
[76381, 76383, 76384, 76385, 76387, 76389, 76390, 76391, 76393, 76394]
worker 1 got assigned 50000 page ids
error HTTPSConnectionPool(host='ores.wikimedia.org', port=443): Max retries exceeded with url: /v3/scores/enwiki/?revids=1048751610%7C1044176009%7C1002075422%7C1044121026%7C1045267382%7C1028238828%7C1023543058%7C1021472419%7C1038798607%7C1012948550%7C1030399177%7C1049134824%7C1047811055%7C1043525869%7C1016354739%7C1048621416%7C1022117295%7C1042453259%7C1037109151%7C1048832646%7C1045345526%7C1025424394%7C1042448679%7C1035966210%7C1047239554%7C1023126423%7C1042944396%7C1019546603%7C1049258645%7C1019545288%7C1018858613%7C1038418120%7C1020153178%7C1017395073%7C1050287468%7C1035441936%7C1008519508%7C1038534807%7C1012819546%7C1045486795%7C1044229166%7C1004729053%7C1045617763%7C1049221604%7C1047058451%7C1049791369%7C1048587863%7C1001513962%7C1002078510%7C1034627844&models=articletopic (Caused by NewConnectionE

In [6]:
# deprecated
# Marked deprecated by the author. Re-assigns the module-level `max_topics`
# (5); a later deprecated cell reads it to build the `ores_topic<N>` schema columns.

max_topics = 5

def query_topics(page_id, min_prob=0.6, max_topics=5, retries=10):
    """Return up to ``max_topics`` ORES topic names for a single page.

    The result is a list of exactly ``max_topics`` slots: the topics whose
    probability is above ``min_prob`` come first (highest probability first),
    the remaining slots stay ``None``. The lookup is attempted up to
    ``retries`` times, sleeping 5 seconds after every exception, and a
    ``ValueError`` is raised when no attempt succeeded. After a successful
    lookup the function sleeps 0.1 seconds before returning.

    NOTE(review): ``session`` is not defined anywhere in the visible notebook;
    it has to exist in the kernel for this function to work -- confirm.
    """
    import time

    topics = [None] * max_topics
    success = False
    attempts = 0
    while attempts < retries and not success:
        attempts += 1
        try:
            rev_ids = get_page_rev_ids([page_id])
            scores = session.score("enwiki", ["articletopic"], revids=rev_ids.values())

            for (page_id, rev_id), score in zip(rev_ids.items(), scores):
                response = score["articletopic"]
                if "error" in response:
                    print(response)
                    continue
                ranked = sorted(response["score"]["probability"].items(), key=lambda t: t[1], reverse=True)
                kept = [t for t, prob in ranked if prob > min_prob][:max_topics]
                for slot, topic in enumerate(kept):
                    topics[slot] = topic
                success = True
        except Exception as e:
            # wait before reporting and retrying
            time.sleep(5)
            print(e)

    if not success:
        raise ValueError("failed to get ores topics")

    time.sleep(0.1)
    return topics

# Lower bound on the runtime of the UDF approach: 0.1 s per page (the pause
# at the end of query_topics), divided by 8.
# NOTE(review): the 8 is presumably the number of parallel tasks -- confirm.
total_pages = 54286374
min_sec = total_pages / 8 * 0.1
print("%.2f hours" % (min_sec / (60 * 60)))

# Expose query_topics to Spark as a UDF that returns an array of strings.
query_topics_udf = F.udf(query_topics, returnType=T.ArrayType(T.StringType()))

188.49 hours


16123981
[10, 12, 13, 14, 15, 18, 19, 20, 21, 23, 24, 25, 27, 29, 30, 35, 36, 39, 40, 42]


CPU times: user 7 µs, sys: 4 µs, total: 11 µs
Wall time: 15 µs


[10, 12, 13, 14, 15, 18, 19, 20, 21, 23]
worker 0 got assigned 50000 page ids
[76381, 76383, 76384, 76385, 76387, 76389, 76390, 76391, 76393, 76394]
worker 1 got assigned 50000 page ids


TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [107]:
# Estimated hours for all collected page ids at 40 minutes per 100,000 pages.
# NOTE(review): the 40-minute figure is presumably a measured batch time -- confirm.
total_pages = len(en_page_ids)
mins = total_pages / 100_000 * 40
hours = mins / 60
print(hours)

107.49320666666668


In [7]:
# deprecated
# UDF-based variant: attach the topic array to every page row, then spread it
# over `max_topics` separate string columns and join those back by page id.

pages_with_topics = pages.withColumn("ores_topics", query_topics_udf(pages['page_id']))

# page_id plus one nullable string column per topic rank (ores_topic1 .. ores_topicN)
topic_schema = T.StructType(
    [T.StructField("page_id", T.IntegerType(), True)]
    + [T.StructField(f"ores_topic{i+1}", T.StringType(), True) for i in range(max_topics)]
)

# Turn each (page_id, [topic, ...]) row into one flat tuple matching topic_schema.
ores_topics = (
    pages_with_topics
    .select("page_id", "ores_topics")
    .rdd.map(lambda row: tuple([row[0]] + row[1]))
    .toDF(topic_schema)
)

pages_with_topics = pages_with_topics.join(ores_topics, on="page_id", how="left")
# pages_with_topics.select("page_id", "page_title", "ores_topic1", "ores_topic2").show()

In [8]:
# Write the enriched page table as parquet, replacing any previous output and
# partitioning the files by the first topic column.
(
    pages_with_topics.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("ores_topic1")
    .save(f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page-ores-topics.sql.parquet")
)

HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?action=query&prop=revisions&pageids=20443741&format=json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb1935b1bb0>: Failed to establish a new connection: [Errno 113] No route to host'))
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?action=query&prop=revisions&pageids=11966763&format=json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb1935b11f0>: Failed to establish a new connection: [Errno 113] No route to host'))
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?action=query&prop=revisions&pageids=13830361&format=json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb1935b1bb0>: Failed to establish a new connection: [Errno 113] No route to host'))
HTTPSConnectionPool(host='en.wikipedia.org'

KeyboardInterrupt: 

HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?action=query&prop=revisions&pageids=20443805&format=json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb193569580>: Failed to establish a new connection: [Errno 113] No route to host'))
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?action=query&prop=revisions&pageids=23550036&format=json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb193575820>: Failed to establish a new connection: [Errno 113] No route to host'))
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?action=query&prop=revisions&pageids=11966850&format=json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fb1935740a0>: Failed to establish a new connection: [Errno 113] No route to host'))
HTTPSConnectionPool(host='en.wikipedia.org'