In [1]:
%load_ext autoreload
%autoreload 2
import bz2
import csv
import io
import json
import pickle
import random
from pathlib import Path
from pprint import pprint
from typing import List, Dict

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType

import lsde2021.csv as csvutil
import lsde2021.download as dl
import lsde2021.utils as utils

In [2]:
# One memory budget reused for the executors, the driver and the result-size limit.
MAX_MEMORY = "60G"

# Build (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.driver.maxResultSize", MAX_MEMORY)
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
sc = spark.sparkContext

# Pre-configured readers shared by the cells below.
csv_loader = spark.read.format("csv").options(header="True", inferSchema="True")
parquet_reader = spark.read.format("parquet").options(inferSchema="True")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/24 13:58:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the English-wiki page and categorylinks dumps that are joined further down.
wiki = "enwiki"
pages_parquet = f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page.sql.parquet"
categorylinks_parquet = f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-categorylinks.sql.parquet"
raw_pages = parquet_reader.load(pages_parquet)
raw_categorylinks = parquet_reader.load(categorylinks_parquet)

AnalysisException: Path does not exist: file:/home/jovyan/nvme/wikipedia_sql_dumps/enwiki/20211001/enwiki-20211001-page.sql.parquet

In [None]:
# Show the first ten rows of each raw frame.
for raw_frame in (raw_pages, raw_categorylinks):
    raw_frame.limit(10).show()

In [None]:
# Keep rows that are not redirects and whose namespace is 0 or 14.
not_redirect = F.col("page_is_redirect") == 0
namespace_0_or_14 = (F.col("page_namespace") == 0) | (F.col("page_namespace") == 14)
pages = (
    raw_pages
    .where(not_redirect)
    .where(namespace_0_or_14)
    .select("page_id", "page_namespace", "page_title")
)

# Only the two link columns are needed for the joins below.
categorylinks = raw_categorylinks.select("page_id", "category_name")

# Namespace-14 rows, renamed so they can be joined on `category_name`.
category_pages = (
    pages
    .where(F.col("page_namespace") == 14)
    .select(
        F.col("page_id").alias("category_page_id"),
        F.col("page_title").alias("category_name"),
    )
)

print(pages.count())

In [None]:
# Step 1 (inner join): attach every category link to its page.
# Step 2 (left join): look up the page id of each linked category; links whose
# category has no row in `category_pages` keep a null `category_page_id`.
page_cats = (
    pages
    .join(categorylinks, on="page_id", how="inner")
    .join(category_pages, on="category_name", how="left")
)

page_cats.limit(10).show()

In [None]:
# Count how many rows (category links) each page has in `page_cats` and attach
# that number to every row as `count`, sorted with the highest counts first.
#
# Dropping a previous `count` column first makes the cell safe to re-run:
# DataFrame.drop is a no-op when the column does not exist, and without it a
# second run would join a second, ambiguous `count` column.
page_cats = page_cats.drop("count")

duplicate_counts = page_cats.groupby(["page_id"]).count()

page_cats = (
    page_cats
    .join(duplicate_counts, on="page_id", how="inner")
    .sort("count", ascending=False)
)

page_cats.limit(10).show()

In [None]:
# Persist the joined page/category/count frame, replacing any earlier export.
page_category_count_path = f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page-category-count.sql.parquet"
page_cats.write.mode("overwrite").parquet(page_category_count_path)

In [None]:
%%time
# Build a directed graph with one node per page / category page and one edge
# page -> category for every row of `page_cats`.
graph = nx.DiGraph()

max_size = None  # e.g. 100_000 to build the graph from a sample of rows only

# Print progress about ten times per run. Integer step (the original float
# division only hit exact multiples by luck) and never zero for tiny samples.
progress_step = max(1, (max_size or 20_000_000) // 10)

for i, row in enumerate(page_cats.rdd.toLocalIterator()):
    # Stop before processing row `max_size`, so exactly `max_size` rows are used.
    if max_size is not None and i >= max_size:
        break
    if i % progress_step == 0:
        print("row", i)

    node = row["page_id"]
    node_count = row["count"]
    category_node = row["category_page_id"]

    # A missing or non-numeric namespace is treated as "not a category".
    try:
        is_category = int(row["page_namespace"]) == 14
    except (TypeError, ValueError):
        is_category = False

    # `x == x` is False only for NaN; an identity test against np.nan would
    # miss NaN objects that are not the np.nan singleton.
    valid_node = node is not None and node == node
    valid_category_node = category_node is not None and category_node == category_node

    # Add the page node. add_node() also refreshes the attributes of a node that
    # already exists (for example one created earlier as a category placeholder).
    # The previous `graph.update(nodes={node: {...}})` iterated only the dict
    # keys, so the attributes were silently left unchanged.
    if valid_node:
        graph.add_node(node, is_category=is_category, title=row["page_title"], node_count=node_count)

    # Add a placeholder for the category unless the node is already known.
    if valid_category_node and category_node not in graph:
        graph.add_node(category_node, is_category=True, title=row["category_name"], node_count=0)

    # Add the edge between them.
    if valid_node and valid_category_node:
        graph.add_edge(node, category_node)

In [None]:
# save the graph for reuse
# nx.write_gpickle is deprecated since networkx 2.6 and removed in 3.0; pickling
# the graph directly writes the same kind of file and works on every version.
with open("../nvme/en-category-tree.pkl", "wb") as graph_pickle_file:
    pickle.dump(graph, graph_pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Second export of the same graph, this time as GraphML.
graphml_path = "../nvme/en-category-tree.graphml"
nx.write_graphml_lxml(graph, graphml_path)

In [None]:
# Pull up to 1,000 category names to the driver and eyeball the first hundred,
# as a first look at how categories are named before trying to split them.
sampled_category_rows = page_cats.select("category_name").limit(1_000).collect()
example_categories = [value for sampled in sampled_category_rows for value in sampled]
pprint(example_categories[:100])

In [None]:
# Heuristic mapping from category-name regexes (to be used with `re.search`)
# to a coarse topic. The regexes are a first translation of the notes in the
# trailing comments; the original cell left the assignment unfinished.
# TODO(review): confirm the topics, especially `_established_in_` -> "Music".
patterns = {
    r"^\d+_births$": "People",            # NUMBER_births -> People
    r"_based_in_": "Organization",        # XX_based_in_PLACE -> Organization
    r"_established_in_": "Music",         # XX_established_in_PLACE -> Music
    r"(?i)compositions_by_": "Music",     # compositions_by_ARTIST -> Music
}

In [None]:
# Draw the graph: nodes flagged `is_category` in light blue, all others in
# orange, each node labelled with its `title` attribute.
labels = nx.get_node_attributes(graph, "title")

# Walk graph.nodes directly so the colour list lines up one-to-one with the
# node order used by draw_networkx_nodes, even if a node lacks the attribute.
colors = [
    "lightblue" if is_cat else "orange"
    for _, is_cat in graph.nodes(data="is_category")
]

plt.figure(figsize=(12, 12))
pos = nx.spring_layout(graph, seed=42)  # fixed seed -> same layout on every run
_ = nx.draw_networkx_edges(graph, pos, alpha=0.2)
# `label=` on draw_networkx_nodes is a single legend string, not per-node text,
# so the titles are passed to draw_networkx_labels instead.
_ = nx.draw_networkx_nodes(graph, pos, node_size=1000, node_color=colors)
_ = nx.draw_networkx_labels(graph, pos, labels=labels)

In [None]:
# Debug helper: show the categories of one hand-picked page.
# This was a pasted loop body (stray indentation, `break` outside a loop, a
# dependency on the leftover `row` variable), so the cell could not run.
# NOTE(review): the original filtered `regular_pages`, which is not defined in
# this notebook; `pages` is used instead - confirm that is the intended frame.
page_id = 3306201

# find the categories of the page
example_page_cats = (
    pages
    .filter(F.col("page_id") == page_id)
    .join(categorylinks, on="page_id", how="inner")
)
example_page_cats.limit(10).show()

# find the category pages for the categories
# (separate name so the full `page_cats` frame built earlier is not clobbered)
example_page_cats = example_page_cats.join(category_pages, on="category_name", how="left")
example_page_cats.limit(10).show()

In [None]:
%%time
# Read a 10,000-row window (skipping the first 10,000 rows) of one pageview dump.
pageview_dump_path = "../hdd/pageview_complete/2020/2020-02/pageviews-20200207-user.bz2"
test = csvutil.read_pageview_csv(pageview_dump_path, engine="python", skiprows=10_000, nrows=10_000)
test.head()

In [None]:
# Load the categorylinks CSV through the shared Spark CSV reader.
wiki = "enwiki"
# Path.resolve() already yields an absolute path.
test = Path(f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-categorylinks.sql.csv").resolve()
df = csv_loader.load(str(test))

In [None]:
# Inspect the loaded CSV: sample rows, inferred schema, distinct `type` values.
df.limit(100).show()
df.printSchema()
df.select(F.col("type")).distinct().show()

In [None]:
# Number of rows whose page_id is not null.
df.where(df["page_id"].isNotNull()).count()

In [None]:
# Total number of rows in `df`.
df.count()

In [None]:
%%time
# Read the first 100,000 rows of the converted categorylinks CSV with the
# project's CSV helper. Note that this rebinds `categorylinks`.
wiki = "enwiki"
categorylinks_csv_path = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-categorylinks-converted.sql.csv"
categorylinks_read_options = dict(
    names=None,
    engine="c",
    header=0,
    index_col=False,
    nrows=100_000,
)
categorylinks = csvutil.read_categorylinks_csv(categorylinks_csv_path, **categorylinks_read_options)
categorylinks.head()

In [None]:
def process_page_chunk(chunk: pd.DataFrame, min_prob: float = 0.6, top_k: int = 5) -> pd.DataFrame:
    """Clean one chunk of the page table and annotate it with topics and categories.

    Relies on three names defined elsewhere in the notebook:
    ``get_page_rev_ids`` (assumed to return a ``{page_id: rev_id}`` dict, as
    the ``.items()`` unpacking below implies - TODO confirm), ``session``
    (an object with a ``score(context, models, revids=...)`` method) and
    ``categorylinks`` (a DataFrame with a ``page_id`` column).

    Args:
        chunk: Rows of the page table with the ``page_*`` columns converted below.
        min_prob: A topic is kept only if its probability is strictly greater.
        top_k: Number of ``ores_topic_<n>`` columns to add (1-based suffix).

    Returns:
        A new DataFrame: the converted chunk plus ``ores_topic_1..top_k`` columns
        (``None`` where there is no topic), left-merged with ``categorylinks`` on
        ``page_id``. The input ``chunk`` is not modified.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    chunk = chunk.copy()

    # convert the dtypes
    chunk["page_id"] = pd.to_numeric(chunk["page_id"], errors='coerce')
    chunk["page_namespace"] = pd.to_numeric(chunk["page_namespace"], errors='coerce')
    chunk["page_title"] = chunk["page_title"].astype("string")
    chunk["page_restrictions"] = chunk["page_restrictions"].astype("category")

    # NOTE(review): astype("bool") maps every non-empty string (including "0")
    # to True - this assumes the flags were parsed as numbers; confirm.
    chunk["page_is_redirect"] = chunk["page_is_redirect"].astype("bool")
    chunk["page_is_new"] = chunk["page_is_new"].astype("bool")

    chunk["page_random"] = pd.to_numeric(chunk["page_random"], errors='coerce')
    chunk["page_touched"] = pd.to_datetime(chunk["page_touched"], errors='coerce')
    chunk["page_links_updated"] = pd.to_datetime(chunk["page_links_updated"], errors='coerce')

    chunk["page_len"] = chunk["page_len"].astype("int32")
    chunk["page_content_model"] = chunk["page_content_model"].astype("category")
    chunk["page_lang"] = chunk["page_lang"].astype("category")

    # find the revisions (ids that failed numeric coercion are NaN and skipped)
    page_ids = chunk["page_id"].dropna().unique().tolist()
    rev_ids = get_page_rev_ids(page_ids)

    # find the articletopic
    # Pass the revision ids (the dict values); iterating the dict itself would
    # hand the page ids to the scorer.
    topics = dict()
    scores = session.score("enwiki", ["articletopic"], revids=list(rev_ids.values()))
    for (page_id, rev_id), score in zip(rev_ids.items(), scores):
        response = score["articletopic"]
        if "error" not in response:
            topic_probs = response["score"]["probability"]
            topic_probs = sorted(topic_probs.items(), key=lambda t: t[1], reverse=True)
            topic_probs = [t for t, prob in topic_probs if prob > min_prob]
            # pad with None so every rank below can be indexed
            topics[page_id] = topic_probs[:top_k] + [None] * top_k

    # add the top-k articletopics to the chunk dataframe
    # The lookup must use the row's own page id; the previous lambda read the
    # leaked loop variable `page_id`, giving every row the last page's topics.
    no_topics = [None] * top_k
    for rank in range(top_k):
        chunk[f"ores_topic_{rank + 1}"] = chunk["page_id"].map(
            lambda pid, rank=rank: topics.get(pid, no_topics)[rank]
        )

    # add the original wikipedia category
    chunk = pd.merge(chunk, categorylinks, on="page_id", how="left")
    return chunk

In [None]:
%%time
# Stream the page CSV in chunks and run the pipeline on the first chunk only.
chunksize = 50 # the limit for wikipedia api queries is 50
header = True
page_csv_path = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page.sql.csv"
with csvutil.read_page_csv(page_csv_path, engine="c", low_memory=True, chunksize=chunksize) as reader:
    for chunk in reader:
        processed = process_page_chunk(chunk)
        print(processed.head())
        header = False
        # only the first chunk is processed for now
        break

In [None]:
# NOTE(review): when cells are run top to bottom, `pages` is still the Spark
# frame built earlier (the pandas `pages` is only created in the next cell),
# so this presumably relies on out-of-order execution - confirm.
pages.shape

In [None]:
# Load the full page CSV with Spark and collect it into a pandas DataFrame.
# Note that this rebinds `pages`.
wiki = "enwiki"
page_table_csv = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page.sql.csv"
pages = csv_loader.load(page_table_csv).toPandas()
pages.head()

In [None]:
# (rows, columns) of the pandas `pages` frame collected in the previous cell.
print(pages.shape)

In [None]:
# Score one revision with the "articletopic" model and print its topic
# probabilities in ascending order.
# NOTE(review): `session` is not created anywhere in this notebook - confirm
# where it comes from.
revids = [1050929646]  # coffee

# Materialise the scores first: if `score` returns a one-shot iterator, the
# zip() below would exhaust it and the final print would show an empty list.
scores = list(session.score("enwiki", ["articletopic"], revids=revids))
for revid, score in zip(revids, scores):
    print(revid)
    topic_probs = score["articletopic"]["score"]["probability"]
    topic_probs = sorted(topic_probs.items(), key=lambda t: t[1])
    print(topic_probs)
print(scores)

In [None]:
# try out ores: score one revision with the enwiki "damaging" model
# NOTE(review): `score_revisions` is not imported anywhere in this notebook,
# and `user_agent` is empty - confirm both before relying on this cell.
request_line = json.dumps({"rev_id": 456789})  # -> '{"rev_id": 456789}'
f = io.StringIO()
out = score_revisions.run(
    ores_host="https://ores.wikimedia.org",
    user_agent="",
    context="enwiki",
    model_names=["damaging"],
    batch_size=1,
    parallel_requests=1,
    retries=10,
    input=[request_line],
    output=f,
    verbose=0)
print(out)
# getvalue() returns everything written to the buffer; read() would start at
# the current (end) position and return an empty string.
print(f.getvalue())

In [None]:
# Sniff the CSV dialect of the French categorylinks dump and print its settings,
# first as a pretty-printed mapping, then one `key = repr(value)` line each.
wiki = "frwiki"
frwiki_categorylinks_csv = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-categorylinks.sql.csv"
dialect = csvutil.sniff_csv_dialect(frwiki_categorylinks_csv)
pprint(csvutil.inspect_csv_dialect(dialect))
for setting, value in csvutil.inspect_csv_dialect(dialect).items():
    print(f"{setting} = {value!r}")

In [None]:
# Read the first million categorylinks rows of the currently selected wiki.
categorylinks_path = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-categorylinks.sql.csv"
categorylinks = csvutil.read_categorylinks_csv(categorylinks_path, nrows=1_000_000)
categorylinks.head()

In [None]:
# Read the first million rows of the category table of the currently selected wiki.
category_path = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-category.sql.csv"
categories = csvutil.read_category_csv(category_path, nrows=1_000_000)
categories.head()