In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import bz2
import csv
import io
import re
import time
import json
import random
import requests
from tqdm import tqdm
import multiprocessing
import concurrent.futures
import pickle as pkl
import numpy as np
import networkx as nx
from functools import partial, reduce
from collections import Counter
from pathlib import Path
from pprint import pprint
from typing import List, Dict
import matplotlib.pyplot as plt
import lsde2021.csv as csvutil
import lsde2021.utils as utils
from lsde2021.lang import singularize, pluralize
import lsde2021.download as dl
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [None]:
# Single memory budget reused for executor memory, driver memory and the
# maximum size of results collected to the driver.
MAX_MEMORY = "10G"

# Build (or reuse) the Spark session; the console progress bar is disabled.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config('spark.driver.maxResultSize', MAX_MEMORY)
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)
sc = spark.sparkContext

# Pre-configured readers for the two input formats used in this notebook.
csv_loader = (
    spark.read.format("csv")
    .option("header", 'True')
    .option("inferSchema", 'True')
)
parquet_reader = spark.read.format("parquet").option("inferSchema", 'True')

In [None]:
# Load the page/category table of the English wiki (20211001 dump).
wiki = "enwiki"
page_category_path = (
    f"../nvme/wikipedia_sql_dumps/{wiki}/20211001/"
    f"{wiki}-20211001-page-category-count.sql.parquet"
)
raw_pages = parquet_reader.load(page_category_path)

In [None]:
# Take a small sample of category names to see what they look like,
# so we can decide how to split them later on.
sampled_rows = raw_pages.select("category_name").limit(1_000).collect()
example_categories = [value for row in sampled_rows for value in row]

In [None]:
# Show the first 100 sampled category names.
first_examples = example_categories[:100]
pprint(first_examples)

In [None]:
# Load the pickled category graph.
# `nx.read_gpickle` was deprecated in NetworkX 2.6 and removed in 3.0; for a
# plain `.pkl` file it was only a wrapper around `pickle.load`, so reading the
# file directly works on both old and new NetworkX versions.
# NOTE: unpickling can execute arbitrary code - only load files you created.
with open("../nvme/en-category-tree.pkl", "rb") as graph_file:
    graph = pkl.load(graph_file)

In [None]:
# filter out all hidden categories by removing nodes that have an edge to the hidden category
# Step 1: rows in namespace 14 whose category name is "Hidden_categories".
is_hidden_root = (F.col("category_name") == "Hidden_categories") & (F.col("page_namespace") == 14)
hidden_category = raw_pages.where(is_hidden_root)
hidden_category.limit(100).show()

# Step 2: count the matching rows per category page id.
hidden_category = hidden_category.groupBy("category_page_id").count()
hidden_category.show()

In [None]:
# Node id used as the root of the hidden-category search.
hidden_category_node = 15961454

# Breadth-first tree over reversed edges, limited to depth 1, starting at the
# hidden root; the resulting node list includes the root itself.
hidden_tree = nx.bfs_tree(
    graph,
    source=hidden_category_node,
    reverse=True,
    depth_limit=1,
)
hidden_sub_categories = list(hidden_tree)

# 30259 for depth_limit=1
# 6_838_612 for depth_limit=2
print(len(hidden_sub_categories))
last_hidden = hidden_sub_categories[-25:]
pprint([graph.nodes[n] for n in last_hidden])

In [None]:
# remove hidden topics and their edges from the graph
# (removal happens in place, so `graph` is modified by this cell)
size_before = (graph.number_of_edges(), graph.number_of_nodes())
print("edges before: %d | nodes before: %d" % size_before)
graph.remove_nodes_from(hidden_sub_categories)
size_after = (graph.number_of_edges(), graph.number_of_nodes())
print("edges after: %d | nodes after: %d" % size_after)

# edges before: 74832061 | nodes before: 7987708
# edges after: 38634343 | nodes after: 7957449

In [None]:
# Ratio of edges / nodes after the removal to the counts before it
# (numbers copied from the recorded output of the removal step).
edges_before, edges_after = 74832061, 38634343
nodes_before, nodes_after = 7987708, 7957449
print("edges reduced", edges_after / edges_before)
print("nodes reduced", nodes_after / nodes_before)

In [None]:
# save the graph for reuse
# nx.write_gpickle(graph, f"../nvme/en-category-tree-without-hidden.pkl")

In [None]:
# Example case: find the COVID 19 wikipedia article
is_covid_page = F.col("page_title") == "COVID-19"
covid_article = raw_pages.where(is_covid_page).limit(100)
covid_article.show()

In [None]:
# find the content category
is_content_root = (F.col("category_name") == "Content") & (F.col("page_namespace") == 14)
root_category = raw_pages.where(is_content_root).limit(100)
root_category.show()

In [None]:
# find sinks in the graph (there should only be one)
# A sink here is a node with no outgoing edges and at least one incoming edge.
sinks = list(filter(
    lambda n: graph.out_degree(n) == 0 and graph.in_degree(n) > 0,
    graph.nodes,
))
print(len(sinks))

In [None]:
# Titles of the first 20 sinks.
sink_titles = [graph.nodes[n]["title"] for n in sinks[:20]]
pprint(sink_titles)

In [None]:
# get the average node degree of the graph
# "leafs" are nodes without incoming edges, "inners" have at least one.
leafs = list(filter(lambda n: graph.in_degree(n) < 1, graph.nodes))
inners = list(filter(lambda n: graph.in_degree(n) > 0, graph.nodes))

def avg_out_degree(nodes):
    """Return the mean out-degree (in the global ``graph``) of ``nodes``."""
    total_out_edges = 0
    for n in nodes:
        total_out_edges += graph.out_degree(n)
    return total_out_edges / len(nodes)

print("average node degree of leafs", avg_out_degree(leafs))
print("average node degree of inners", avg_out_degree(inners))

In [None]:
# Matches strings that consist only of digits and whitespace (e.g. a bare year).
numeric = re.compile(r'^([\s\d]+)$')

# Ordinal century prefix. Accepts every ordinal suffix so that titles such as
# "1st-century_...", "2nd-century_..." and "21st-century_..." are recognised
# as well, not only the "...th-century" ones.
_CENTURY = r"\d+(?:st|nd|rd|th)-century"

# Category-title patterns, tried in order: the first full match wins and its
# capture groups become the extracted parts. Exact duplicates of earlier
# entries were removed because they could never be reached.
# NOTE: `\w` also matches digits and underscores, so generic patterns shadow
# more specific ones that appear later (e.g. `(\w+)_by_country` is tried
# before `\d+_(\w+)_by_country`).
_pattern_sources = [
    rf"^{_CENTURY}_(\w+)_in_the_(\w+)$",
    rf"^{_CENTURY}_(\w+)_in_(\w+)$",

    r"^\d+s_in_the_(\w+)$",
    r"^\d+s_in_(\w+)$",
    r"^\d+_in_the_(\w+)$",
    r"^\d+_in_(\w+)$",

    r"^(\w+)_based_in_(\w+)_by_subject$",

    r"^(\w+)_established_in_the_(\w+)$",
    r"^(\w+)_established_in_(\w+)$",

    r"^(\w+)_in_the_(\w+)$",
    r"^(\w+)_in_(\w+)$",

    r"^(\w+)_and_the_(\w+)$",
    r"^(\w+)_and_(\w+)$",

    r"^(\w+)_of_the_(\w+)_by_country$",
    r"^(\w+)_of_(\w+)_by_country$",
    r"^(\w+)_of_the_(\w+)$",
    r"^(\w+)_of_(\w+)$",

    r"^(\w+)_by_country$",
    r"^(\w+)_by_region$",
    r"^(\w+)_by_location$",
    r"^(\w+)_by_field$",
    r"^(\w+)_by_type$",

    r"^\d+_(\w+)_by_legal_status$",
    r"^\d+_(\w+)_by_year$",
    r"^\d+_(\w+)_by_date$",
    r"^\d+_(\w+)_by_year_and_country$",
    r"^\d+_(\w+)_by_country_and_year$",
    r"^\d+_(\w+)_by_country$",
    r"^\d+_(\w+)_by_continent$",
    r"^\d+_(\w+)_by_decade$",
    r"^\d+_(\w+)_by_(\w+)$",

    r"^(\w+)_by_legal_status$",
    r"^(\w+)_by_year$",
    r"^(\w+)_by_date$",
    r"^(\w+)_by_year_and_country$",
    r"^(\w+)_by_country_and_year$",
    r"^(\w+)_by_continent$",
    r"^(\w+)_by_decade$",
    r"^(\w+)_by_(\w+)$",

    r"^\d+_(\w+)$",
]

# Each entry keeps the original `(compiled_pattern, extra_words)` shape; the
# `extra_words` list is currently always empty.
patterns = [(re.compile(source), []) for source in _pattern_sources]
print(len(patterns))

# Quick smoke test: print the groups of the first pattern matching the example.
test_str = 'Companies_by_date'
for pattern, extra_words in patterns:
    match = pattern.fullmatch(test_str)
    if match:
        print(list(match.groups()))
        break

In [None]:
# English stopword list, built up incrementally in roughly alphabetical order.
# The list contains a repeated entry ('three'); this is harmless because it is
# converted to a set below.
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']
stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']
stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']
stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']
stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
stopwords += ['yours', 'yourself', 'yourselves']

# Set of words excluded from extracted topics. All entries are lowercase and
# the set is used for exact membership, so capitalised words are not excluded.
# The connector words in the explicit union are all present in `stopwords`
# already, so the union does not add anything new.
EXCLUDE = set(stopwords).union({"by","or","and","with","the","of","in","without","a","on"})
print(len(EXCLUDE))

In [None]:
def flatten(l):
    """Concatenate the items of every iterable in ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def unique(l, key):
    """Return the items of ``l`` in their original order, keeping only the
    first item seen for each distinct value of ``key(item)``."""
    seen = set()
    kept = []
    for item in l:
        marker = key(item)
        if marker in seen:
            continue
        seen.add(marker)
        kept.append(item)
    return kept

def union(dfs):
    """Fold ``dfs`` left to right with ``DataFrame.unionAll`` and return the
    combined result. ``reduce`` raises ``TypeError`` for an empty input."""
    combine = DataFrame.unionAll
    return reduce(combine, dfs)

def is_uppercase(s: str) -> bool:
    """Return True if the first character of ``s`` is an uppercase letter.

    The empty string yields False instead of raising ``IndexError``.
    """
    # Slicing (rather than indexing) is safe for the empty string, and
    # "".isupper() is False.
    return s[:1].isupper()

def split_by_pattern(s: str) -> tuple:
    """Try the global ``patterns`` in order against the whole string ``s``.

    Returns a ``(parts, matched)`` tuple:
    - on the first full match: the list of that pattern's capture groups and True
    - if nothing matches: ``[s]`` and False
    """
    # The second tuple element of every pattern entry is not used here.
    for pattern, _ in patterns:
        match = pattern.fullmatch(s)
        if match:
            return list(match.groups()), True
    return [s], False

def split(s: str, split_unmatched=False, singularize=False, pluralize=False, recursive=False):
    """Split a category title into a set of topic strings.

    Args:
        s: the category title (words separated by underscores).
        split_unmatched: if no pattern matches ``s``, additionally split it on
            spaces, commas and underscores (the full title is kept as well).
            When False, an unmatched title is returned as a single topic.
        singularize: add the singular form of every topic (replaces the
            originals unless ``pluralize`` is also set, in which case both
            singular and plural forms are returned).
        pluralize: same as ``singularize`` but for plural forms.
        recursive: keep re-applying the patterns to the extracted parts until
            the set of parts no longer changes.

    Returns:
        A set of topics with underscores replaced by spaces. Empty strings,
        purely numeric parts and entries of ``EXCLUDE`` are dropped.
    """
    # first, test for common patterns
    parts, matched = split_by_pattern(s)

    if recursive:
        # Re-split every part until a fixed point is reached.
        while True:
            refined = flatten([split_by_pattern(part)[0] for part in parts])
            if set(refined) == set(parts):
                break
            parts = refined

    if not matched:
        if split_unmatched:
            # if no pattern is found, split into single words
            # (stopwords are removed via EXCLUDE below)
            parts = parts + re.split(' |,|_', s)
        else:
            parts = [s]

    # `part` guards against empty tokens produced by consecutive separators.
    topics = {
        part.replace("_", " ")
        for part in parts
        if part and numeric.match(part) is None
    }

    if singularize or pluralize:
        # The boolean parameters shadow the helper functions of the same name
        # imported at the top of the notebook, so import them under aliases
        # instead of calling the flags themselves.
        from lsde2021.lang import singularize as to_singular, pluralize as to_plural

        variants = set()
        if singularize:
            variants |= {to_singular(topic) for topic in topics}
        if pluralize:
            variants |= {to_plural(topic) for topic in topics}
        topics = variants

    return topics - EXCLUDE

def split_all(s: str):
    """Split ``s`` with ``split``, also breaking titles that match no pattern
    into their individual words."""
    topics = split(s, split_unmatched=True)
    return topics

def bfs_tree(g, node, depth_limit=None):
    """Breadth-first traversal of ``g`` starting at ``node``.

    Args:
        g: graph-like object exposing ``neighbors(v)``.
        node: the start node (depth 0).
        depth_limit: maximum depth to descend to; ``None`` means unlimited.
            (Previously this argument was accepted but ignored.)

    Returns:
        A list of ``(node, depth)`` tuples, level by level. The order of the
        nodes within one level is not specified.
    """
    ans = []
    visited = set()
    level = [(node, 0)]
    while level:
        for v, depth in level:
            ans.append((v, depth))
            visited.add(v)
        next_level = set()
        for v, depth in level:
            # Do not expand nodes that already sit at the maximum depth.
            if depth_limit is not None and depth >= depth_limit:
                continue
            for w in g.neighbors(v):
                if w not in visited:
                    next_level.add((w, depth + 1))
        level = next_level
    return ans

def freq_bfs_tree(g, node, depth_limit=None):
    """Level-wise breadth-first traversal of ``g`` from ``node`` with a
    per-node counter.

    Every visited node starts with a count of 1. While a node is expanded, its
    count is increased by one for each of its neighbors that has already been
    visited. Unvisited neighbors are queued for the next level unless that
    would exceed ``depth_limit`` (``None`` means unlimited).

    Returns:
        ``{depth: [(node, count), ...]}`` with each level sorted by count in
        descending order.
    """
    order = []
    counts = dict()
    visited = set()
    frontier = [(node, 0)]
    while frontier:
        # Register the whole level first so siblings count as visited.
        for current, current_depth in frontier:
            order.append((current, current_depth))
            visited.add(current)
            counts[current] = 1

        upcoming = set()
        for current, current_depth in frontier:
            for neighbor in g.neighbors(current):
                if neighbor in visited:
                    counts[current] += 1
                    continue
                if depth_limit is None or current_depth + 1 <= depth_limit:
                    upcoming.add((neighbor, current_depth + 1))
        frontier = upcoming

    by_depth = dict()
    for n, n_depth in order:
        by_depth.setdefault(n_depth, []).append((n, counts[n]))

    return {
        n_depth: sorted(members, key=lambda pair: pair[1], reverse=True)
        for n_depth, members in by_depth.items()
    }

In [None]:
def find_topics(node, g, depth_limit: int = 4, max_categories: int = 5) -> Dict[int, List[str]]:
    """Collect topic words for ``node`` from the categories reachable from it.

    Args:
        node: id of the start node in ``g``.
        g: graph whose nodes carry a ``"title"`` attribute.
        depth_limit: maximum traversal depth passed to ``freq_bfs_tree``.
        max_categories: maximum number of topics kept per depth.

    Returns:
        ``{depth: [topic, ...]}`` for every depth > 0 (the start node itself is
        skipped). Topics are capitalised, de-duplicated and listed in the order
        in which ``freq_bfs_tree`` ranks the categories they come from.
    """
    categories = freq_bfs_tree(g, node, depth_limit=depth_limit)

    topics = {}
    for depth, nodes in categories.items():
        if depth <= 0:
            continue
        words = flatten([
            # `split` returns a set; sort it so the result does not depend on
            # the hash seed of the running interpreter.
            [w.capitalize() for w in sorted(split(g.nodes[n]["title"], recursive=True))]
            for n, _count in nodes
        ])
        # De-duplicate on the whole word. The previous key `x[0]` compared only
        # the first character and dropped distinct topics that happened to
        # start with the same letter.
        topics[depth] = unique(words, key=lambda word: word)[:max_categories]
    return topics

In [None]:
%%time
depth_limit = 4
n_categories = 5
page_id = 63030231 # covid 19
# page_id = 11867 # germany
# page_id = 24365 # porsche

pprint(find_topics(page_id, g=graph, depth_limit=4, max_categories=5))

In [None]:
# Collect the distinct page ids in namespace 0, sort them and pair each one
# with its position in the sorted list: [(index, page_id), ...].
article_id_rows = (
    raw_pages
    .filter(F.col("page_namespace") == 0)
    .select("page_id")
    .distinct()
    .collect()
)
all_page_ids = list(enumerate(sorted(row[0] for row in article_id_rows)))
print(len(all_page_ids))

# Make sure the output directory exists; otherwise `open(..., 'wb')` raises
# FileNotFoundError on a fresh checkout.
all_page_ids_path = Path("../nvme/en_topics/all_page_ids.pkl")
all_page_ids_path.parent.mkdir(parents=True, exist_ok=True)
with open(all_page_ids_path, 'wb') as f:
    pkl.dump(all_page_ids, f, protocol=pkl.HIGHEST_PROTOCOL)