In [1]:
%load_ext autoreload
%autoreload 2
import pyspark
import lsde2021.download as dl
from lsde2021.types import PathLike
from pyspark.sql import SparkSession
from functools import partial
from pathlib import Path
import re
import hashlib
from pprint import pprint
import datetime
import logging
import logging.config
import requests
from io import StringIO, BytesIO
import pandas as pd
from bs4 import BeautifulSoup
from typing import Union, List, Tuple, Optional

In [2]:
# Base URL that every dump-directory and checksum URL in this notebook is built from.
WIKI_DL_URL = 'https://dumps.wikimedia.org'

logger = logging.getLogger(__name__)

# Single memory setting reused for the executor, the driver and the driver result size.
MAX_MEMORY = "60G"

# Build (or reuse) the Spark session; the chain is parenthesised instead of
# relying on backslash line continuations.
spark = (
    SparkSession.builder
    .appName("EDA")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .config('spark.driver.maxResultSize', MAX_MEMORY)
    .config('spark.ui.showConsoleProgress', 'false')
    .getOrCreate()
)
sc = spark.sparkContext

21/10/05 22:55:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
def get_wikipedia_dump_url(lang: str, date: str) -> str:
    """Return the URL of the dump directory of the ``lang`` wiki for ``date``.

    The result has the form ``<WIKI_DL_URL>/<lang>wiki/<date>``.
    """
    return f'{WIKI_DL_URL}/{lang}wiki/{date}'

def get_wikipedia_md5_checksum_url(lang: str, date: str) -> str:
    """Return the URL of the ``md5sums.txt`` file of the ``lang`` wiki dump for ``date``.

    The result has the form
    ``<WIKI_DL_URL>/<lang>wiki/<date>/<lang>wiki-<date>-md5sums.txt``.
    """
    checksum_file = f'{lang}wiki-{date}-md5sums.txt'
    return f'{WIKI_DL_URL}/{lang}wiki/{date}/{checksum_file}'

def get_wikipedia_multi_pattern(lang: str, date: str) -> str:
    """Return a regex matching hrefs of the numbered (multi-part) article archives.

    The pattern matches any href ending in
    ``<lang>wiki-<date>-pages-articles<N>.xml<anything>.bz2`` and captures that
    file name (without any leading path) in group 1.

    Args:
        lang: wiki language code, inserted literally into the pattern.
        date: dump date string, inserted literally into the pattern.

    Returns:
        The regular expression as a string, intended for ``re.match``.
    """
    # The dots in ".xml" and ".bz2" are escaped so they only match a literal
    # dot, and lang/date are escaped so they can never act as regex syntax.
    return r'^.*({}wiki-{}-pages-articles[0-9]+\.xml.*\.bz2$)'.format(
        re.escape(lang), re.escape(date)
    )

def get_wikipedia_single_pattern(lang: str, date: str) -> str:
    """Return a regex matching the href of the single (unnumbered) article archive.

    The pattern matches any href ending in
    ``<lang>wiki-<date>-pages-articles.xml<anything>.bz2`` and captures that
    file name (without any leading path) in group 1.

    Args:
        lang: wiki language code, inserted literally into the pattern.
        date: dump date string, inserted literally into the pattern.

    Returns:
        The regular expression as a string, intended for ``re.match``.
    """
    # The previous pattern carried a stray "+" after "articles" (accepting
    # "articlesss") and unescaped dots; both are tightened to literal matches.
    return r'^.*({}wiki-{}-pages-articles\.xml.*\.bz2$)'.format(
        re.escape(lang), re.escape(date)
    )

def get_wiki_archive_url(lang: str, date: str, href: str) -> str:
    """Turn an ``href`` taken from a dump listing page into a full download URL.

    Args:
        lang: wiki language code, used to build the dump directory URL.
        date: dump date string, used to build the dump directory URL.
        href: link target; either server-absolute (starting with ``/``) or
            relative to the dump directory.

    Returns:
        ``WIKI_DL_URL`` joined with ``href`` when ``href`` is server-absolute,
        otherwise the dump directory URL joined with ``href``.
    """
    # href is a URL path, not a filesystem path: test for the leading slash
    # directly instead of Path.is_absolute(), whose answer depends on the OS.
    if href.startswith('/'):
        # drop the leading slash(es) so the result has no "//" after the host
        return '{}/{}'.format(WIKI_DL_URL, href.lstrip('/'))
    wiki_dump_url = get_wikipedia_dump_url(lang, date)
    return '{}/{}'.format(wiki_dump_url, href)

In [4]:
def md5_checksum(path: PathLike) -> str:
    hash_md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

In [5]:
def _match_wiki_archive_hrefs(
    soup: "BeautifulSoup", checksums: pd.DataFrame, pattern: str
) -> List[Tuple[str, str]]:
    """Return ``(checksum, href)`` for every ``<a>`` in ``soup`` whose href matches ``pattern``."""
    compiled = re.compile(pattern)
    matches = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # anchors without an href attribute yield None, which re.match rejects
        if href is None or not compiled.match(href):
            continue
        # the checksum table is indexed by bare file name, not by full href
        matches.append((checksums.loc[Path(href).name]["checksum"], href))
    return matches


def collect_wiki_arxiv_hrefs(lang: str, date: str) -> List[Tuple[str, str]]:
    """Collect the article archive links of one wikipedia dump with their MD5 checksums.

    Downloads the dump listing page and the ``md5sums.txt`` file, then looks
    for the numbered (multi-part) article archives. If none are listed, falls
    back to the single article archive.

    Args:
        lang: wiki language code.
        date: dump date string as used in the dump URL.

    Returns:
        A list of ``(md5 checksum, href)`` tuples, one per matching archive
        link. The list is empty (and a warning is logged) if nothing matched.

    Raises:
        requests.exceptions.HTTPError: if the listing page or the checksum
            file is answered with an HTTP error status.
        KeyError: if a matching archive has no entry in the checksum file.
    """
    wiki_dump_url = get_wikipedia_dump_url(lang, date)
    wiki_dump_checksum_url = get_wikipedia_md5_checksum_url(lang, date)
    try:
        logger.info('collecting archive from {}'.format(wiki_dump_url))

        query_response = requests.get(wiki_dump_url, allow_redirects=True)
        # requests does not raise on 4xx/5xx responses by itself; without
        # raise_for_status() the HTTPError handler below could never run and
        # an error page would be parsed as if it were the listing
        query_response.raise_for_status()
        checksum_response = requests.get(wiki_dump_checksum_url, allow_redirects=True)
        checksum_response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        logger.error('HTTP error ({}) using lang = \'{}\' and date = \'{}\'. '
                     'could not retrieve any wikipedia data at {}'
                     .format(e.response.status_code, lang, date, wiki_dump_url))
        raise

    # md5sums.txt holds whitespace-separated "<checksum> <file name>" rows
    checksums = pd.read_csv(
        BytesIO(checksum_response.content),
        header=None,
        names=["checksum", "url"],
        index_col="url",
        delimiter=r"\s+"
    )
    soup = BeautifulSoup(query_response.content, 'html.parser')

    # prefer the multi-part archives
    wiki_archive_hrefs = _match_wiki_archive_hrefs(
        soup, checksums, get_wikipedia_multi_pattern(lang, date)
    )
    if not wiki_archive_hrefs:
        logger.info('no multi archives found. trying for single archive ...')
        # small wikis only publish one archive
        wiki_archive_hrefs = _match_wiki_archive_hrefs(
            soup, checksums, get_wikipedia_single_pattern(lang, date)
        )
    if not wiki_archive_hrefs:
        logger.warning('no wikipedia archive found')
    return wiki_archive_hrefs

In [10]:
def download_wikipedia(
    sc: pyspark.SparkContext,
    lang: str,
    date: Union[datetime.date, str],
    dest: Union[str, Path],
    force: bool = False,
    max_concurrency: int = 4,
) -> List[PathLike]:
    """Download all article archives of one wikipedia dump, in parallel via Spark.

    Args:
        sc: Spark context used to distribute the downloads.
        lang: wiki language code.
        date: dump date, either a ``datetime.date`` (formatted as ``YYYYMMDD``)
            or an already formatted string.
        dest: root directory; files are stored under ``<dest>/<lang>/``.
        force: when true, download again even if a file with a matching MD5
            checksum already exists at the destination.
        max_concurrency: number of slices the archive list is split into.

    Returns:
        One entry per archive: the existing destination path when the file was
        reused, otherwise whatever ``dl.download_file`` returned.
    """
    if isinstance(date, datetime.date):
        _date = "%04d%02d%02d" % (date.year, date.month, date.day)
    else:
        _date = date

    # accept plain strings as well: the "/" joins below need a Path
    dest_root = Path(dest)

    wiki_arxiv_hrefs = collect_wiki_arxiv_hrefs(lang, _date)
    if not wiki_arxiv_hrefs:
        # nothing to fetch, so do not launch a Spark job at all
        return []

    def download_handler(wiki_archive: Tuple[str, str]) -> PathLike:
        """Download one ``(checksum, href)`` archive unless a valid copy already exists."""
        wiki_archive_checksum, wiki_archive_href = wiki_archive
        wiki_archive_url = get_wiki_archive_url(lang, _date, wiki_archive_href)
        destination = dest_root / lang / Path(wiki_archive_href).name

        # reuse the file only if it exists and its checksum matches
        if not force and destination.exists():
            checksum = md5_checksum(destination)
            if checksum == wiki_archive_checksum:
                logger.info('using existing file {}'.format(destination))
                return destination

        logger.info('downloading {}'.format(wiki_archive_url))

        def validate_file(downloaded_file: PathLike) -> bool:
            """Check the downloaded file against the expected MD5 checksum."""
            checksum = md5_checksum(downloaded_file)
            return checksum == wiki_archive_checksum

        return dl.download_file(wiki_archive_url, destination=destination, validate_file_func=validate_file)

    downloaded = sc.parallelize(wiki_arxiv_hrefs, numSlices=max_concurrency) \
        .map(download_handler) \
        .collect()
    return downloaded

In [11]:
# Fetch the German ("de") wikipedia dump of 2021-10-01 below the local data
# directory; the call is the cell's last expression so its result is displayed.
dest = Path("./wikimedia_data/wikipedia")
download_wikipedia(
    sc,
    "de",
    datetime.date(2021, 10, 1),
    dest=dest,
)

[PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles1.xml-p1p297012.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles2.xml-p297013p1262093.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles3.xml-p1262094p2762093.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles3.xml-p2762094p3376257.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles4.xml-p3376258p4876257.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles4.xml-p4876258p6115464.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles5.xml-p6115465p7615464.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles5.xml-p7615465p9115464.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles5.xml-p9115465p9261244.bz2'),
 PosixPath('wikimedia_data/wikipedia/de/dewiki-20211001-pages-articles6.xml-p9261245p10761244.bz2'),
 PosixP

In [8]:
# Uncomment to shut down the Spark context once all downloads have finished.
# sc.stop()