In [1]:
%load_ext autoreload
%autoreload 2
import traceback
import pyspark
import datetime
import csv
import time
import gzip
import gc
import sys
import multiprocessing
import pandas as pd
from pprint import pprint
from functools import partial
from pathlib import Path
from typing import Tuple, Optional, List
import lsde2021.utils as utils
import lsde2021.csv as csvutils
import lsde2021.download as dl
import lsde2021.aggregate as agg
from lsde2021.types import PathLike
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType
import pyspark.sql.functions as F

In [2]:
# Memory budget handed to the executors, the driver and the driver's result buffer.
MAX_MEMORY = "60G"

# Create (or reuse) the Spark session of this notebook; the console progress bar is
# switched off so that it does not flood the cell output.
spark = (
    SparkSession.builder
    .appName("parse-wikipedia-sql-dumps")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config('spark.driver.maxResultSize', MAX_MEMORY)
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)
sc = spark.sparkContext

# Shared CSV reader: first line is the header, column types are inferred by Spark.
csv_loader = spark.read.format("csv").options(header='True', inferSchema='True')

21/10/21 02:53:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/21 02:53:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
def _clean_sql_tuple(fields: List[str]) -> Tuple[str, ...]:
    """Remove the SQL tuple delimiters from one group of raw ``VALUES`` fields.

    After the VALUES list has been split on commas, the first field of every group
    still starts with ``(`` and the last field still ends with ``)`` -- or with
    ``);`` for the final tuple of the statement.
    """
    cleaned = list(fields)
    if not cleaned:
        return ()
    if cleaned[0].startswith("("):
        cleaned[0] = cleaned[0][1:]
    # Only the last tuple of a statement ends in ";", so checking ");" first is
    # unambiguous even for values that themselves end in ")".
    if cleaned[-1].endswith(");"):
        cleaned[-1] = cleaned[-1][:-2]
    elif cleaned[-1].endswith(")"):
        cleaned[-1] = cleaned[-1][:-1]
    return tuple(cleaned)


def parse_wikipedia_sql_dump(
    sql_input_path: PathLike,
    columns: List[str],
    dest: Optional[PathLike] = None,
    encoding: str = "utf-8",
    nrows: Optional[int] = None,
    log: bool = False,
    write: bool = True,
    force: bool = False
) -> PathLike:
    """Convert the ``INSERT INTO ... VALUES`` statements of a SQL dump into a CSV file.

    Args:
        sql_input_path: dump to read; it is opened through ``utils.fopen``.
        columns: column names, written as the CSV header. Their number also
            determines how many fields make up one row.
        dest: optional output directory; by default the CSV is placed next to the
            input, with the input's last suffix replaced by ``.csv``.
        encoding: text encoding used for reading and writing; undecodable input
            bytes are ignored.
        nrows: stop after this many rows (the CSV is then truncated accordingly).
        log: pretty-print every parsed row to stdout.
        write: when ``False`` nothing is written, the dump is only parsed.
        force: re-parse even if the output file already exists.

    Returns:
        The path of the CSV file (returned even when ``write`` is ``False``).

    Raises:
        AssertionError: if an INSERT line has no ``(``-prefixed VALUES list.
        csv.Error: if the VALUES list cannot be tokenised.
    """
    start = time.time()
    # Dialect of the VALUES list: single-quoted strings with backslash escapes.
    csv_config = dict(
        delimiter=',',
        doublequote=False,
        escapechar='\\',
        quotechar="'",
        strict=True,
    )
    ncols = len(columns)

    input_path = Path(sql_input_path)
    output_path = input_path.with_suffix(".csv")
    if dest is not None:
        # Path(dest) so that a plain string works as well as a Path.
        output_path = Path(dest) / output_path.name

    if not force and output_path.exists():
        print(f"using existing {output_path} ...")
        return output_path

    with utils.fopen(input_path, mode="rt", encoding=encoding, errors="ignore") as input_file:
        output_file = None
        if write:
            # newline="" is what the csv module asks for on files it writes to.
            output_file = open(output_path, mode="w", encoding=encoding, newline="")
        entries = 0
        try:
            writer = None
            if output_file:
                writer = csv.writer(output_file, quoting=csv.QUOTE_MINIMAL)
                # write the header
                writer.writerow(columns)

            # Iterate lazily: a single INSERT line can be very long, so the dump
            # is never held in memory as a whole.
            for line in input_file:
                # Look for INSERT statement and parse it
                if not line.startswith('INSERT INTO'):
                    continue
                values = line.partition('` VALUES ')[2]
                assert values
                assert values[0] == '('
                for row in csv.reader([values], **csv_config):
                    # The reader returns every field of the statement in one flat
                    # list; consecutive groups of `ncols` fields form one SQL tuple.
                    for cidx in range(0, len(row), ncols):
                        raw_fields = row[cidx:cidx + ncols]
                        try:
                            # Use the already-unquoted fields directly; joining and
                            # re-parsing them would split values that contain commas.
                            entry = _clean_sql_tuple(raw_fields)
                            if log:
                                pprint(entry)
                                sys.stdout.flush()
                            if writer:
                                writer.writerow(entry)
                            entries += 1
                        except Exception:
                            print("entry:", raw_fields)
                            print("context:", row[cidx:cidx+10])
                            raise
                        if nrows is not None and entries >= nrows:
                            return output_path
                gc.collect()
        finally:
            if output_file:
                output_file.close()
                print(f"wrote {entries} rows to {output_path} in {time.time() - start:.2f} seconds ...")
    return output_path

In [None]:
# Quick look at the beginning of the German categorylinks dump
# (the first 100 units returned by `read`).
with utils.fopen(
    "../hdd/wikipedia_sql_dumps/dewiki/20211001/dewiki-20211001-categorylinks.sql.gz"
) as dump_file:
    dump_head = dump_file.read(100)
    print(str(dump_head))

In [None]:
# Print what the project's `detect_encoding` helper reports for the same dump.
detected_encoding = utils.detect_encoding(
    "../hdd/wikipedia_sql_dumps/dewiki/20211001/dewiki-20211001-categorylinks.sql.gz"
)
print(detected_encoding)

In [None]:
# Dry run of the parser on the English dumps: log the first 10 rows of each
# listed table without writing any CSV (write=False) and without reusing an
# existing output (force=True).
wiki = "enwiki"
dry_run_tables = [
    ("page", csvutils.PAGE_COLUMNS),
    # ("categorylinks", csvutils.CATEGORYLINKS_COLUMNS)
]
for table, cols in dry_run_tables:
    dump_path = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-{table}.sql.gz"
    parse_wikipedia_sql_dump(
        dump_path, columns=cols, log=True, nrows=10, force=True, write=False
    )

In [3]:
# Language table of the project (indexed by language code), the SQL tables that
# are processed for every wiki, and the root folder of the local dumps.
languages = pd.read_csv("./data/languages.csv", index_col="code")
tables = ["langlinks", "page", "category", "categorylinks"]
dest = Path("../hdd/wikipedia_sql_dumps")

languages.head()

Unnamed: 0_level_0,name,dbname,group
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ar,Arabic,arwiki,ar
ary,Moroccan Arabic,arywiki,ar
arz,Egyptian Arabic,arzwiki,ar
az,Azerbaijani,azwiki,az
azb,South Azerbaijani,azbwiki,az


In [None]:
# (table, local path) of every dump of the 2021-10-01 snapshot, for all wikis of
# the language table and all configured tables.
downloads = [
    (tbl, dest / "/".join(dl.wikimedia_sql_dump_local_file(day, wiki=dbname, table=tbl)))
    for (day, dbname, tbl), _ in dl.wikimedia_sql_dump_urls(
        [datetime.date(2021, 10, 1)], wikis=languages["dbname"], tables=tables
    )
]
pprint(downloads[:3])

# Compare the number of `*sql.gz` archives found below `dest` with the number expected.
archives_on_disk = len(list(dest.rglob("**/*sql.gz")))
print("downloaded %d of %d" % (archives_on_disk, len(downloads)))

In [None]:
# CSV header (column names) to use for each supported SQL table.
table_csv_columns = {
    "page": csvutils.PAGE_COLUMNS,
    "category": csvutils.CATEGORY_COLUMNS,
    "categorylinks": csvutils.CATEGORYLINKS_COLUMNS,
    "langlinks": csvutils.LANGLINKS_COLUMNS,
}

def parse_wikipedia_sql_dump_handler(item: Tuple[str, PathLike], force: bool = False) -> PathLike:
    """Parse a single ``(table, dump file)`` work item into a CSV file.

    The table name selects the CSV header from ``table_csv_columns`` (unknown
    tables raise ``KeyError``); the actual work is done by
    ``parse_wikipedia_sql_dump``, whose result is returned unchanged.
    """
    table_name, dump_path = item
    header = table_csv_columns[table_name]
    return parse_wikipedia_sql_dump(dump_path, columns=header, force=force)

# Parse all dumps on the Spark workers, using one slice per CPU core of this
# machine. Existing CSV files are reused (force=False).
parse_dump = partial(parse_wikipedia_sql_dump_handler, force=False)
downloads_rdd = sc.parallelize(downloads, numSlices=multiprocessing.cpu_count())
parsed = downloads_rdd.map(parse_dump).collect()

In [None]:
# Sanity check of the parsed files: load up to 10,000 rows of the English page
# table and display the head.
wiki = "enwiki"
pages_csv = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-page.sql.csv"
pages = csv_loader.load(pages_csv).limit(10_000)
pages.toPandas().head()

In [None]:
# Same check for the category table of the wiki selected above.
categories_csv = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-category.sql.csv"
categories = csv_loader.load(categories_csv).limit(10_000)
print(categories.count(), "rows")
categories.toPandas().head()

In [None]:
# Same check for the categorylinks table of the wiki selected above.
categorylinks_csv = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-categorylinks.sql.csv"
categorylinks = csv_loader.load(categorylinks_csv).limit(10_000)
categorylinks.toPandas().head()

In [None]:
# Same check for the langlinks table of the wiki selected above.
langlinks_csv = f"../hdd/wikipedia_sql_dumps/{wiki}/20211001/{wiki}-20211001-langlinks.sql.csv"
langlinks = csv_loader.load(langlinks_csv).limit(10_000)
langlinks.toPandas().head()

In [19]:
#### Parse CSV files and convert to correct data types using chunking
# Work list for the conversion step: (table, parsed CSV path) for every wiki
# except enwiki, which is excluded here.
wikis = [dbname for dbname in languages["dbname"] if dbname != "enwiki"]
assert "enwiki" not in wikis
# wikis = ["enwiki"]
parsed = [
    (tbl, (dest / "/".join(dl.wikimedia_sql_dump_local_file(day, wiki=dbname, table=tbl))).with_suffix(".csv"))
    for (day, dbname, tbl), _ in dl.wikimedia_sql_dump_urls(
        [datetime.date(2021, 10, 1)], wikis=wikis, tables=tables
    )
]
pprint(parsed[:5])

[('langlinks',
  PosixPath('../hdd/wikipedia_sql_dumps/arwiki/20211001/arwiki-20211001-langlinks.sql.csv')),
 ('page',
  PosixPath('../hdd/wikipedia_sql_dumps/arwiki/20211001/arwiki-20211001-page.sql.csv')),
 ('category',
  PosixPath('../hdd/wikipedia_sql_dumps/arwiki/20211001/arwiki-20211001-category.sql.csv')),
 ('categorylinks',
  PosixPath('../hdd/wikipedia_sql_dumps/arwiki/20211001/arwiki-20211001-categorylinks.sql.csv')),
 ('langlinks',
  PosixPath('../hdd/wikipedia_sql_dumps/arywiki/20211001/arywiki-20211001-langlinks.sql.csv'))]


In [None]:
def try_to_int(val):
    """Convert ``val`` with ``int()``; return NaN if it cannot be converted.

    Values that ``int()`` rejects with ``ValueError`` (e.g. ``"abc"``) or
    ``TypeError`` (e.g. ``None``) yield a float NaN instead of raising.
    """
    try:
        return int(val)
    except (ValueError, TypeError):
        # A plain float NaN: same value as numpy's `nan`, but it does not depend
        # on numpy, which this notebook never imports.
        return float("nan")
    
def try_to_float(val):
    """Convert ``val`` with ``float()``; return NaN if it cannot be converted.

    Values that ``float()`` rejects with ``ValueError`` (e.g. ``"abc"``) or
    ``TypeError`` (e.g. ``None``) yield a float NaN instead of raising.
    """
    try:
        return float(val)
    except (ValueError, TypeError):
        # A plain float NaN: same value as numpy's `nan`, but it does not depend
        # on numpy, which this notebook never imports.
        return float("nan")

In [47]:
def prepare_page_sql_dump(parsed_dump_file: PathLike, output_path: PathLike, chunksize: int = 10 ** 4, max_chunks: Optional[int] = None) -> PathLike:
    """Convert the columns of a parsed ``page`` CSV chunk by chunk and append them to ``output_path``.

    Args:
        parsed_dump_file: CSV to read, opened through ``csvutils.read_page_csv``.
        output_path: CSV that every converted chunk is appended to (``mode='a'``),
            so the caller must remove a stale file first. The header is written
            with the first chunk only; the DataFrame index is written as well
            (pandas' ``to_csv`` default).
        chunksize: number of rows per chunk.
        max_chunks: convert at most this many chunks; ``None`` converts all.

    Returns:
        ``output_path`` as passed in.
    """
    header = True
    with csvutils.read_page_csv(
        parsed_dump_file,
        engine="c",
        dtype=csvutils.RAW_PAGE_DTYPE,
        chunksize=chunksize,
    ) as reader:
        for n, chunk in enumerate(reader):
            # n counts the chunks already written, so stopping here yields
            # exactly `max_chunks` chunks (checking after the write gave one more).
            if max_chunks is not None and n >= max_chunks:
                break

            # Unparseable numbers / timestamps become NaN / NaT (errors='coerce').
            chunk["page_id"] = pd.to_numeric(chunk["page_id"], errors='coerce', downcast="unsigned")
            chunk["page_namespace"] = pd.to_numeric(chunk["page_namespace"], errors='coerce', downcast="unsigned")
            chunk["page_title"] = chunk["page_title"].astype("string")
            chunk["page_restrictions"] = chunk["page_restrictions"].astype("string")

            # NOTE(review): if RAW_PAGE_DTYPE reads these flags as strings,
            # astype("bool") turns every non-empty string (also "0") into True -- confirm.
            chunk["page_is_redirect"] = chunk["page_is_redirect"].astype("bool")
            chunk["page_is_new"] = chunk["page_is_new"].astype("bool")

            chunk["page_random"] = pd.to_numeric(chunk["page_random"], errors='coerce', downcast="unsigned")
            chunk["page_touched"] = pd.to_datetime(chunk["page_touched"], errors='coerce')
            chunk["page_links_updated"] = pd.to_datetime(chunk["page_links_updated"], errors='coerce')

            chunk["page_len"] = pd.to_numeric(chunk["page_len"], errors='coerce', downcast="unsigned")
            chunk["page_content_model"] = chunk["page_content_model"].astype("category")
            chunk["page_lang"] = chunk["page_lang"].astype("category")

            chunk.to_csv(output_path, header=header, mode='a')
            header = False
    return output_path

In [48]:
def prepare_categorylinks_sql_dump(parsed_dump_file: PathLike, output_path: PathLike, chunksize: int = 10 ** 2, max_chunks: Optional[int] = None) -> PathLike:
    """Convert the columns of a parsed ``categorylinks`` CSV chunk by chunk and append them to ``output_path``.

    Args:
        parsed_dump_file: CSV to read, opened through ``csvutils.read_categorylinks_csv``.
        output_path: CSV that every converted chunk is appended to (``mode='a'``),
            so the caller must remove a stale file first. The header is written
            with the first chunk only; the DataFrame index is written as well
            (pandas' ``to_csv`` default).
        chunksize: number of rows per chunk.
        max_chunks: convert at most this many chunks; ``None`` converts all.

    Returns:
        ``output_path`` as passed in.
    """
    header = True
    with csvutils.read_categorylinks_csv(
        parsed_dump_file,
        engine="c",
        dtype=csvutils.RAW_CATEGORYLINKS_DTYPE,
        chunksize=chunksize,
    ) as reader:
        for n, chunk in enumerate(reader):
            # n counts the chunks already written, so stopping here yields
            # exactly `max_chunks` chunks (checking after the write gave one more).
            if max_chunks is not None and n >= max_chunks:
                break

            # Unparseable ids / timestamps become missing values (errors='coerce').
            chunk["page_id"] = pd.to_numeric(chunk["page_id"].astype("string"), errors='coerce', downcast="unsigned")
            chunk["category_name"] = chunk["category_name"].astype("string")
            chunk["sortkey"] = chunk["sortkey"].astype("category")
            chunk["timestamp"] = pd.to_datetime(chunk["timestamp"], errors='coerce')
            chunk["sortkey_prefix"] = chunk["sortkey_prefix"].astype("category")
            chunk["collation"] = chunk["collation"].astype("category")
            chunk["type"] = chunk["type"].astype("category")

            chunk.to_csv(output_path, header=header, mode='a')
            header = False
    return output_path

In [49]:
def prepare_category_sql_dump(parsed_dump_file: PathLike, output_path: PathLike, chunksize: int = 10 ** 4, max_chunks: Optional[int] = None) -> PathLike:
    """Convert the columns of a parsed ``category`` CSV chunk by chunk and append them to ``output_path``.

    Args:
        parsed_dump_file: CSV to read, opened through ``csvutils.read_category_csv``.
        output_path: CSV that every converted chunk is appended to (``mode='a'``),
            so the caller must remove a stale file first. The header is written
            with the first chunk only; the DataFrame index is written as well
            (pandas' ``to_csv`` default).
        chunksize: number of rows per chunk.
        max_chunks: convert at most this many chunks; ``None`` converts all.

    Returns:
        ``output_path`` as passed in.
    """
    header = True
    with csvutils.read_category_csv(
        parsed_dump_file,
        engine="c",
        dtype=csvutils.RAW_CATEGORY_DTYPE,
        chunksize=chunksize,
    ) as reader:
        for n, chunk in enumerate(reader):
            # n counts the chunks already written, so stopping here yields
            # exactly `max_chunks` chunks (checking after the write gave one more).
            if max_chunks is not None and n >= max_chunks:
                break

            # Unparseable numbers become NaN (errors='coerce').
            chunk["cat_id"] = pd.to_numeric(chunk["cat_id"], errors='coerce', downcast="unsigned")
            chunk["cat_title"] = chunk["cat_title"].astype("string")
            chunk["cat_pages"] = pd.to_numeric(chunk["cat_pages"], errors='coerce', downcast="unsigned")
            chunk["cat_subcats"] = pd.to_numeric(chunk["cat_subcats"], errors='coerce', downcast="unsigned")
            chunk["cat_files"] = pd.to_numeric(chunk["cat_files"], errors='coerce', downcast="unsigned")

            chunk.to_csv(output_path, header=header, mode='a')
            header = False
    return output_path

In [50]:
def prepare_langlinks_sql_dump(parsed_dump_file: PathLike, output_path: PathLike, chunksize: int = 10 ** 4, max_chunks: Optional[int] = None) -> PathLike:
    """Convert the columns of a parsed ``langlinks`` CSV chunk by chunk and append them to ``output_path``.

    Args:
        parsed_dump_file: CSV to read, opened through ``csvutils.read_langlinks_csv``.
        output_path: CSV that every converted chunk is appended to (``mode='a'``),
            so the caller must remove a stale file first. The header is written
            with the first chunk only; the DataFrame index is written as well
            (pandas' ``to_csv`` default).
        chunksize: number of rows per chunk.
        max_chunks: convert at most this many chunks; ``None`` converts all.

    Returns:
        ``output_path`` as passed in.
    """
    header = True
    with csvutils.read_langlinks_csv(
        parsed_dump_file,
        engine="c",
        dtype=csvutils.RAW_LANGLINKS_DTYPE,
        chunksize=chunksize,
    ) as reader:
        for n, chunk in enumerate(reader):
            # n counts the chunks already written, so stopping here yields
            # exactly `max_chunks` chunks (checking after the write gave one more).
            if max_chunks is not None and n >= max_chunks:
                break

            # Unparseable ids become NaN (errors='coerce').
            chunk["page_id"] = pd.to_numeric(chunk["page_id"], errors='coerce', downcast="unsigned")
            chunk["lang"] = chunk["lang"].astype("category")
            chunk["lang_title"] = chunk["lang_title"].astype("string")

            chunk.to_csv(output_path, header=header, mode='a')
            header = False
    return output_path

In [51]:
%reload_ext autoreload
%autoreload 2
%aimport lsde2021.utils
%aimport lsde2021.csv
import lsde2021.utils as utils
import lsde2021.csv as csvutils

def prepare_wikipedia_sql_dump_handler(item: Tuple[str, PathLike], force: bool = False, max_chunks: Optional[int] = None) -> PathLike:
    """Convert one parsed ``(table, CSV file)`` work item into its ``-converted`` CSV.

    The output is written next to the input as ``<name>-converted.sql.csv``.

    Args:
        item: table name (``page``, ``categorylinks``, ``category`` or
            ``langlinks``) and the parsed CSV file of that table.
        force: convert again even if the output file already exists.
        max_chunks: forwarded to the table-specific ``prepare_*_sql_dump`` function.

    Returns:
        The path of the converted CSV file.

    Raises:
        ValueError: if a conversion is needed for a table that is not supported.
    """
    table, parsed_sql_dump_file = item
    output_path = utils.strip_extension(parsed_sql_dump_file)
    output_path = (output_path.parent / (output_path.stem + "-converted")).with_suffix(".sql.csv")
    print(table, parsed_sql_dump_file, output_path)

    if not force and output_path.exists():
        print(f"using existing {output_path} ...")
        return output_path

    # Pick the converter before touching any file, so that an unsupported table
    # fails loudly instead of being reported as "created".
    if table == "page":
        prepare = prepare_page_sql_dump
    elif table == "categorylinks":
        prepare = prepare_categorylinks_sql_dump
    elif table == "category":
        prepare = prepare_category_sql_dump
    elif table == "langlinks":
        prepare = prepare_langlinks_sql_dump
    else:
        raise ValueError(f"unsupported table {table!r} for {parsed_sql_dump_file}")

    # since we parse in chunks and append to the file, we have to remove it first
    output_path.unlink(missing_ok=True)
    print(f"creating {output_path} ...")

    start = time.time()
    try:
        prepare(parsed_sql_dump_file, output_path, max_chunks=max_chunks)
    except BaseException:
        # A half-written file would be taken for a finished result by the next
        # non-forced run, so remove it before propagating the error.
        output_path.unlink(missing_ok=True)
        raise

    print(f"created {output_path} in {time.time() - start:.2f} seconds ...")
    return output_path

# Convert all parsed CSV files on the Spark workers, one slice per CPU core;
# already converted files are reused (force=False).
parallel = multiprocessing.cpu_count()
convert_dump = partial(
    prepare_wikipedia_sql_dump_handler,
    force=False,
    # max_chunks=10,
)
parsed_rdd = sc.parallelize(parsed, numSlices=parallel)
converted = parsed_rdd.map(convert_dump).collect()

langlinkslanglinkslanglinkslanglinks   ../hdd/wikipedia_sql_dumps/zhwiki/20211001/zhwiki-20211001-langlinks.sql.csv../hdd/wikipedia_sql_dumps/kuwiki/20211001/kuwiki-20211001-langlinks.sql.csv  ../hdd/wikipedia_sql_dumps/frwiki/20211001/frwiki-20211001-langlinks.sql.csv../hdd/wikipedia_sql_dumps/zhwiki/20211001/zhwiki-20211001-langlinks-converted.sql.csv ../hdd/wikipedia_sql_dumps/kuwiki/20211001/kuwiki-20211001-langlinks-converted.sql.csv

../hdd/wikipedia_sql_dumps/frwiki/20211001/frwiki-20211001-langlinks-converted.sql.csv
langlinks ../hdd/wikipedia_sql_dumps/bewiki/20211001/bewiki-20211001-langlinks.sql.csv ../hdd/wikipedia_sql_dumps/bewiki/20211001/bewiki-20211001-langlinks-converted.sql.csv
using existing ../hdd/wikipedia_sql_dumps/frwiki/20211001/frwiki-20211001-langlinks-converted.sql.csv ...using existing ../hdd/wikipedia_sql_dumps/kuwiki/20211001/kuwiki-20211001-langlinks-converted.sql.csv ...using existing ../hdd/wikipedia_sql_dumps/zhwiki/20211001/zhwiki-20211001-langlinks-c

In [10]:
# Look at the distinct page namespaces in more detail.
# Needs `pages` from the sanity-check cell that loads the page table.
all_namespaces = pages.select("page_namespace").distinct()
all_namespaces.show()
namespace_count = all_namespaces.count()
print(namespace_count, "page_namespace")

NameError: name 'pages' is not defined

In [None]:
# NOTE(review): scratch cell -- `load()` is called without a path and its result
# is discarded; presumably a leftover that should be removed or given a file.
csv_loader.load()

In [None]:
# find the german wikipedia articles with no translation entry
# Load at most 10,000 rows of the parsed German page table.
# NOTE(review): `german_pages` is not used anywhere below in this cell; the join
# works on `pages` and `langlinks`, which the sanity-check cells load from the
# enwiki files -- presumably the German frames were intended here; confirm.
german_pages = csv_loader.load(
    f"../hdd/wikipedia_sql_dumps/dewiki/20211001/dewiki-20211001-page.sql.csv"
).limit(10_000)

# Full outer join of the pages with the renamed langlinks columns on "id".
# NOTE(review): the select renames `lang` to "id", `lang_title` to
# "translation_lang" and `page_id` to "english_page_id"; joining on a language
# code named "id" looks like a mix-up of the aliases, and it is not visible here
# whether `pages` has an "id" column at all -- verify before relying on the counts.
page_langlinks = pages.join(
    langlinks.select([
        F.col("lang").alias("id"),
        F.col("lang_title").alias("translation_lang"),
        F.col("page_id").alias("english_page_id")
    ]), on="id", how="outer")
    # (F.col("id") == F.col("from")), how="outer")
page_langlinks.show()


# Rows of the join that have no value in "translation_lang".
without_translation = page_langlinks.filter(F.col("translation_lang").isNull())

print("total:", page_langlinks.count())
print("without translation:", without_translation.count())

In [None]:
# Build one wide table with a `<YYYYMMDD>_daily_total` column per sampled day,
# indexed by `index_cols`.
# NOTE(review): `tqdm`, `pageview_samples`, `pageview_sample_files`,
# `pageview_columns`, `pageview_dtype` and `index_cols` are not defined in the
# visible part of this notebook -- presumably copied from another notebook;
# they have to be imported / defined before this cell can run.

# Start from an empty accumulator on every run of the cell: the loop below tests
# `dfs is None`, which fails on a fresh kernel if `dfs` was never assigned and
# would keep joining onto stale results when the cell is re-run.
dfs = None
for date, f in tqdm(list(zip(pageview_samples, pageview_sample_files))):
    df = pd.read_csv(
        f,
        sep=' ',
        names=pageview_columns,
        low_memory=False,
        dtype=pageview_dtype,
        on_bad_lines="skip",
        # nrows=1_000_000,
        usecols=["wiki_code", "page_id", "daily_total"], # "page_title"
        engine="c",
    )
    # df = df.compute()

    # parse and reduce
    # keep only rows with a wiki code and reduce the code to the part before the first "."
    df = df[df["wiki_code"].notna()]
    df["wiki_code"] = df["wiki_code"].apply(lambda c: c.split(".")[0])
    # drop rows whose page id is not numeric
    df["page_id"] = pd.to_numeric(df["page_id"], errors='coerce')
    df = df[df["page_id"].notna()]
    df["page_id"] = df["page_id"].astype("int32")
    # non-numeric totals count as 0
    df["daily_total"] = pd.to_numeric(df["daily_total"], errors='coerce')
    df["daily_total"] = df["daily_total"].fillna(0).astype("Int32")
    df = df.rename(columns={
        "daily_total": f"{date.year}{str(date.month).zfill(2)}{str(date.day).zfill(2)}_daily_total",
    })
    # sum the totals of rows that share the same index values
    df = df.set_index(index_cols)
    df = df.groupby(index_cols).sum()
    if dfs is None:
        dfs = df
    else:
        dfs = dfs.join(df, on=index_cols, how="outer") # "page_title"

    # free the per-day frame before the next file is loaded
    del df
    gc.collect()

    # print(dfs.dtypes)
    print(dfs.shape)

dfs.head()

In [None]:
# Spot check: show and count the joined rows whose title is "Pendel".
pendel_rows = page_langlinks.filter(F.col("title") == "Pendel")
pendel_rows.show()
print(pendel_rows.count())
# Pendel

In [None]:
# copy the csv files to the nvme
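# join all countries' wiki langlinks with the pageviews so that we know the English titles and can drop the pages that have no English article (TODO: confirm -- the original note was cut off after "drop the")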
# to do this, we have to join with the pages first?

In [None]:
# join the pages with their title in english and their english category
# drop all the pages that have no english article

In [None]:
#daily = df \
#    .select("domain_code", "page_title", F.col("view_count").alias("view_count2")) \
#    .join(daily, on=["domain_code", "page_title"], how="outer") \
#    .fillna(value=0)
#daily = daily \
#    .withColumn('view_count_sum', sum([daily["view_count"], daily["view_count2"]])) \
#    .select("domain_code", "page_title", F.col("view_count_sum").alias("view_count"))