In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import bz2
import csv
import io
import json
import random
import requests
from pathlib import Path
from pprint import pprint
from typing import List, Dict
import ores.api
from ores.utilities import score_revisions
import lsde2021.csv as csvutil
import lsde2021.utils as utils
import lsde2021.download as dl
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
import pyspark.sql.functions as F

In [None]:
# Single memory budget applied to the executor, the driver, and the driver's
# maximum result size.
MAX_MEMORY = "60G"

# Build (or reuse an already running) Spark session for this notebook.
# Console progress output is switched off.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
sc = spark.sparkContext

# Pre-configured readers: CSV files are read with a header row and schema
# inference enabled.
csv_loader = (
    spark.read.format("csv")
    .option("header", "True")
    .option("inferSchema", "True")
)
# NOTE(review): `inferSchema` is a CSV/JSON-style option; presumably it is
# ignored by the parquet source (parquet files carry their own schema) - confirm.
parquet_reader = spark.read.format("parquet").option("inferSchema", "True")

In [None]:
# Inputs for joining categories onto the English Wikipedia page table:
# the `page` and `categorylinks` tables of the 20211001 dump, read from parquet.
wiki = "enwiki"
pages = parquet_reader.load(
    f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page.sql.parquet"
)
categorylinks = parquet_reader.load(
    f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-categorylinks.sql.parquet"
)

In [None]:
# Quick sanity check: print ten rows of each input table (pages first,
# then categorylinks).
for preview_source in (pages, categorylinks):
    preview_source.limit(10).show()

In [None]:
# Collapse categorylinks to one row per page, gathering every category name
# linked to that page into a single array column `category_names`.
# Parenthesised chaining is used instead of backslash continuations: the
# previous version ended the chain with a dangling `\` that only parsed
# because a blank line happened to follow it.
categorylinks_array = (
    categorylinks
    .groupBy("page_id")
    .agg(F.collect_list("category_name").alias("category_names"))
)

# Keep only the page columns needed downstream, then attach the category
# arrays. The left join keeps pages that have no category link at all; their
# `category_names` value is null.
pages_with_categories = (
    pages
    .select("page_is_redirect", "page_id", "page_namespace", "page_title")
    .join(categorylinks_array, on="page_id", how="left")
)

In [None]:
# Persist the joined pages/categories table as parquet next to the input
# dumps, replacing any output left by a previous run.
(
    pages_with_categories.write
    .format("parquet")
    .mode("overwrite")
    .save(f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page-low-level-categories.sql.parquet")
)

In [None]:
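# Uncomment to stop the Spark context and release its resources once the
# output above has been written.
# sc.stop()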