## Imports

In [None]:
import subprocess

In [None]:
# Whether the download cells fetch fresh copies of the dumps.
# Set to False if you already have up-to-date dumps locally.
download_new_dumps = True

# Whether the Wikipedia XML dump extraction cells run.
# Set to False if the dump has already been extracted.
# NOTE(review): the earlier wording also suggested setting this to False in
# order to extract the XML dump page-by-page, but the single-article
# extraction cell is itself guarded by this flag -- confirm the intent.
extract_dumps = True

## Get and process Wikipedia XML dump

In [None]:
# File name of the dump on local disk; it is also the last path component
# of the remote URL, so the URL below is built from it.
LATEST_WP_DUMP_LOCAL = 'enwiki-latest-pages-articles.xml.bz2'

# Remote location of the latest English Wikipedia pages-articles dump.
LATEST_WP_DUMP_WEB = 'https://dumps.wikimedia.org/enwiki/latest/' + LATEST_WP_DUMP_LOCAL

In [1]:
# Download the most recent Wikipedia dump, replacing any previous local copy.
if download_new_dumps:
    print('Downloading latest Wikipedia dump...')
    # '-f' keeps rm silent (and successful) when no earlier dump exists,
    # e.g. on the very first run.
    subprocess.run(['rm', '-f', LATEST_WP_DUMP_LOCAL], check=True)
    # check=True raises CalledProcessError if wget exits with a non-zero
    # status, instead of silently continuing without a usable dump.
    subprocess.run(['wget', LATEST_WP_DUMP_WEB], check=True)

--2023-03-03 09:16:56--  https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:2:208:80:154:142, 208.80.154.142
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:2:208:80:154:142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20680789666 (19G) [application/octet-stream]
Saving to: 'enwiki-latest-pages-articles.xml.bz2'

est-pages-articles.   0%[                    ]  65.16M  3.84MB/s    eta 78m 24s^C


In [None]:
# Run WikiExtractor over the local Wikipedia XML dump.

if extract_dumps:
    print('Extracting Wikipedia dump...')

    # Start from an empty output directory. The command and its flag must be
    # separate list items: without shell=True the first item is taken as the
    # program name, so the single string 'rm -rf' is looked up as an
    # executable and raises FileNotFoundError.
    output_dir = 'wp_extracts'
    subprocess.run(['rm', '-rf', output_dir], check=True)
    subprocess.run(['mkdir', output_dir], check=True)

    # Remove any templates file left over from a previous run ('-f' tolerates
    # a missing file). Presumably WikiExtractor then regenerates it -- TODO confirm.
    templates_file = 'xml_templates.txt'
    subprocess.run(['rm', '-f', templates_file], check=True)

    args = [
        'python3', 'wikiextractor/WikiExtractor.py',
        '--output', output_dir,
        # '--json',
        '--templates', templates_file,
        LATEST_WP_DUMP_LOCAL
    ]
    completed_process = subprocess.run(args)

    # Surface a failed extraction as an exception so later cells do not run
    # against incomplete output.
    if completed_process.returncode != 0:
        raise Exception(f'WikiExtractor.py exited with code {completed_process.returncode}')
    else:
        print(f'WikiExtractor.py completed successfully. Saved results to {output_dir}')

In [None]:
# Extract a single article (page id 100) from the Wikipedia dump as a quick test.

if extract_dumps:
    print('Extracting test page from Wikipedia dump...')
    args = [
        'python3', 'wikiextractor/extractPage.py',
        '--id', '100',
        LATEST_WP_DUMP_LOCAL
    ]
    # Capture stdout as text; without this, completed_process.stdout is
    # always None. stderr is left uncaptured so error output still goes to
    # the usual place.
    completed_process = subprocess.run(args, stdout=subprocess.PIPE, text=True)

    if completed_process.returncode != 0:
        raise Exception(f'ExtractPage.py exited with code {completed_process.returncode}')
    else:
        print('ExtractPage.py completed successfully.')

    # Show what the script wrote. A bare expression inside an `if` body is
    # not displayed by the notebook, so print it explicitly.
    # Assumes extractPage.py writes the page to stdout -- TODO confirm.
    print(completed_process.stdout)

## Get and process Cirrus dump

In [3]:
# get path to most recent cirrus dump
LATEST_CIRRUS_DUMP_STDOUT = !curl https://dumps.wikimedia.org/other/cirrussearch/current/ | grep -o 'enwiki-[0-9]*-cirrussearch-general.json.gz' | sort -r | head -n 1
LATEST_CIRRUS_DUMP_WEB = LATEST_CIRRUS_DUMP_STDOUT[-1]
LATEST_CIRRUS_DUMP_LOCAL = 'enwiki-latest-cirrussearch-general.json.gz'

enwiki-20230227-cirrussearch-general.json.gz


In [5]:
# Download the most recent Cirrus dump and store it under a stable local name.
if download_new_dumps:
    print('Downloading latest Cirrus dump...')
    # '-f' keeps rm silent (and successful) when no earlier dump exists.
    subprocess.run(['rm', '-f', LATEST_CIRRUS_DUMP_LOCAL], check=True)
    # Fixed the misspelled module name ('suprocess'), which raised NameError.
    # check=True stops here on a failed download, so the rename below never
    # runs against a missing file.
    subprocess.run(
        ['wget', f'https://dumps.wikimedia.org/other/cirrussearch/current/{LATEST_CIRRUS_DUMP_WEB}'],
        check=True,
    )
    # wget saves under the dated server-side name; rename it to the fixed
    # local name used by the processing cell.
    subprocess.run(['mv', LATEST_CIRRUS_DUMP_WEB, LATEST_CIRRUS_DUMP_LOCAL], check=True)

--2023-03-03 09:24:56--  https://dumps.wikimedia.org/other/cirrussearch/current/enwiki-20230227-cirrussearch-general.json.gz
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:2:208:80:154:142, 208.80.154.142
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:2:208:80:154:142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53475861433 (50G) [application/octet-stream]
Saving to: 'enwiki-20230227-cirrussearch-general.json.gz'

.json.gz              8%[>                   ]   4.13G  3.94MB/s    eta 3h 30m ^C


In [None]:
# Run cirrus-extract.py over the local Cirrus dump.

# Start from an empty output directory. The command and its flag must be
# separate list items: without shell=True the first item is taken as the
# program name, so the single string 'rm -rf' is looked up as an executable
# and raises FileNotFoundError.
output_dir = 'cirrus_extracts'
subprocess.run(['rm', '-rf', output_dir], check=True)
subprocess.run(['mkdir', output_dir], check=True)

args = [
    'python3', 'wikiextractor/cirrus-extract.py',
    '--output', output_dir,
    LATEST_CIRRUS_DUMP_LOCAL
]
completed_process = subprocess.run(args)

# Surface a failed extraction as an exception rather than continuing silently.
if completed_process.returncode != 0:
    raise Exception(f'cirrus-extract.py exited with code {completed_process.returncode}')
else:
    print(f'cirrus-extract.py completed successfully. Saved results to {output_dir}')