## Imports

In [6]:
import subprocess

In [7]:
import sys
# Guard against unsupported interpreters: WikiExtractor breaks on Python 3.11+.
# The minor version is sys.version_info[1]; index 0 is the major version, so
# comparing index 0 against 11 would never reject Python 3.11 or later.
if not (sys.version_info[0] == 3 and sys.version_info[1] < 11):
    raise Exception('WikiExtractor.py currently breaks with Python 3.11. Please rerun with Python 3.10 or earlier.')

In [8]:
# Whether to (re)download the dumps. When True, the download cells below remove
# the local copy and fetch a fresh one with wget.
# Set to False if you have up-to-date dumps.
download_new_dumps = False

# Whether to run the WikiExtractor cells below.
# Set to False if you want to extract the XML dump page-by-page, or already have extracted the dumps.
# NOTE(review): the single-page extraction cell is also gated on this flag — confirm that is intended.
extract_dumps = True

## Get and process Wikipedia XML dump

In [9]:
# Paths to the latest English Wikipedia pages-articles dump.
# Remote URL of the bz2-compressed dump:
LATEST_WP_DUMP_WEB = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2'
# Local filename of the compressed dump (the name wget saves the URL above under):
LATEST_WP_DUMP_LOCAL = 'enwiki-latest-pages-articles.xml.bz2'
# Local filename of the uncompressed XML, used as input to WikiExtractor.
# NOTE(review): no cell in this notebook decompresses the .bz2 into this file —
# presumably done manually beforehand; confirm.
LATEST_WP_DUMP_LOCAL_UNBZ = 'enwiki-latest-pages-articles.xml'

In [10]:
# Download the most recent Wikipedia dump (only when download_new_dumps is set).
if download_new_dumps:
    print('Downloading latest Wikipedia dump...')
    # -f: do not error when there is no previous download to remove.
    subprocess.run(['rm', '-f', LATEST_WP_DUMP_LOCAL])
    # check=True raises CalledProcessError if wget fails, instead of silently
    # continuing with a missing or partial dump.
    subprocess.run(['wget', LATEST_WP_DUMP_WEB], check=True)

In [11]:
# Run WikiExtractor over the uncompressed Wikipedia dump (only when extract_dumps is set).

if extract_dumps:
    print('Extracting Wikipedia dump...')

    # Start from an empty output directory so stale extracts never mix with new ones.
    output_dir = 'wp_extracts'
    subprocess.run(['rm', '-rf', output_dir])
    subprocess.run(['mkdir', output_dir])

    # Remove any previous templates file; -f keeps rm quiet when the file
    # does not exist yet (e.g. on the first run).
    templates_file = 'xml_templates.txt'
    subprocess.run(['rm', '-f', templates_file])

    args = [
        'python3', '-m', 'wikiextractor.WikiExtractor',
        LATEST_WP_DUMP_LOCAL_UNBZ,
        '--output', output_dir,
        # '--json',
        '--templates', templates_file
    ]
    completed_process = subprocess.run(args)

    # Surface a non-zero exit status as an exception so later cells do not
    # silently run on incomplete extracts.
    if completed_process.returncode != 0:
        raise Exception(f'WikiExtractor.py exited with code {completed_process.returncode}')
    print(f'WikiExtractor.py completed successfully. Saved results to {output_dir}')

Extracting Wikipedia dump...


rm: xml_templates.txt: No such file or directory
INFO: Preprocessing 'enwiki-latest-pages-articles.xml' to collect template definitions: this may take some time.
INFO: Preprocessed 100000 pages
INFO: Preprocessed 200000 pages
INFO: Preprocessed 300000 pages
INFO: Preprocessed 400000 pages
INFO: Preprocessed 500000 pages
INFO: Preprocessed 600000 pages
INFO: Preprocessed 700000 pages
INFO: Preprocessed 800000 pages
INFO: Preprocessed 900000 pages
INFO: Preprocessed 1000000 pages
INFO: Preprocessed 1100000 pages
INFO: Preprocessed 1200000 pages
INFO: Preprocessed 1300000 pages
INFO: Preprocessed 1400000 pages
INFO: Preprocessed 1500000 pages
INFO: Preprocessed 1600000 pages
INFO: Preprocessed 1700000 pages
INFO: Preprocessed 1800000 pages
INFO: Preprocessed 1900000 pages
INFO: Preprocessed 2000000 pages
INFO: Preprocessed 2100000 pages
INFO: Preprocessed 2200000 pages
INFO: Preprocessed 2300000 pages
INFO: Preprocessed 2400000 pages
INFO: Preprocessed 2500000 pages
INFO: Preprocessed 260

KeyboardInterrupt: 

In [7]:
# Extract a single article (by page id) from the compressed Wikipedia dump as a smoke test.

if extract_dumps:
    print('Extracting test page from Wikipedia dump...')
    # `python -m` takes a dotted module name, not a file path, so the module is
    # addressed the same way as in the other wikiextractor cells.
    args = [
        'python3', '-m', 'wikiextractor.extractPage',
        '--id', '18630637',
        LATEST_WP_DUMP_LOCAL
    ]
    # Capture the output so the page XML is available on completed_process.stdout
    # (without capture_output it would be None).
    completed_process = subprocess.run(args, capture_output=True, encoding='utf-8')

    if completed_process.returncode != 0:
        # stderr was captured above; show it before failing so the cause is visible.
        print(completed_process.stderr)
        raise Exception(f'ExtractPage.py exited with code {completed_process.returncode}')
    print('ExtractPage.py completed successfully.')

    # Show only the beginning of the page; a full article can be very large.
    print(completed_process.stdout[:2000])

Extracting test page from Wikipedia dump...
  <page>
    <title>Translation</title>
    <ns>0</ns>
    <id>18630637</id>
    <revision>
      <parentid>1138406409</parentid>
      <timestamp>2023-02-09T14:35:44Z</timestamp>
      <contributor>
        <username>Girth Summit</username>
      </contributor>
      <minor />
      <comment>Reverted 1 edit by [[Special:Contributions/109.82.229.136|109.82.229.136]] ([[User talk:109.82.229.136|talk]]) to last revision by Nihil novi</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text bytes="157577" xml:space="preserve">{{Short description|Transfer of the meaning of something in one language into another}}
{{About|language translation|other uses}}
{{Redirect|Translator|other uses|Translator (disambiguation)}}
{{Distinguish|Transliteration}}
{{Pp-move-indef}}
{{Self reference|For article translations in Wikipedia, see [[Wikipedia:Translation]].}}
{{Use dmy dates|date=April 2019}}
[[File:Charles V ordonnant la t

## Get and process Cirrus dump

In [15]:
# get path to most recent cirrus dump
# note that this is sometimes missing, and you may have to manually check a previous cirrussearch file to get it
# for more info, see this link: https://phabricator.wikimedia.org/T330936
LATEST_CIRRUS_DUMP_STDOUT = !curl https://dumps.wikimedia.org/other/cirrussearch/current/ | grep -o 'enwiki-[0-9]*-cirrussearch-content.json.gz' | sort -r | head -n 1
LATEST_CIRRUS_DUMP_WEB = LATEST_CIRRUS_DUMP_STDOUT[-1]
LATEST_CIRRUS_DUMP_LOCAL = 'enwiki-latest-cirrussearch-content.json.gz'

In [16]:
# Download the most recent Cirrus dump and rename it to the stable local filename
# (only when download_new_dumps is set).
if download_new_dumps:
    print('Downloading latest Cirrus dump...')
    # -f: do not error when there is no previous download to remove.
    subprocess.run(['rm', '-f', LATEST_CIRRUS_DUMP_LOCAL])
    # check=True raises CalledProcessError on failure so a failed download
    # is not followed by a confusing `mv` error.
    subprocess.run(['wget', f'https://dumps.wikimedia.org/other/cirrussearch/current/{LATEST_CIRRUS_DUMP_WEB}'], check=True)
    subprocess.run(['mv', LATEST_CIRRUS_DUMP_WEB, LATEST_CIRRUS_DUMP_LOCAL], check=True)

In [23]:
# Run cirrus-extract over the local Cirrus dump.
# NOTE(review): unlike the Wikipedia extraction cell, this cell is not gated on
# extract_dumps, so it always wipes and regenerates the output directory — confirm intended.

output_dir = 'cirrus_extracts'
# Recreate the output directory from scratch.
subprocess.run(['rm', '-rf', output_dir])
subprocess.run(['mkdir', output_dir])

# Assemble the command: module invocation, then options, then the input file.
args = ['python3', '-m', 'wikiextractor.cirrus-extract']
args += ['--output', output_dir]
args.append(LATEST_CIRRUS_DUMP_LOCAL)
completed_process = subprocess.run(args)

# A non-zero exit status aborts the cell; otherwise report where results went.
if completed_process.returncode != 0:
    raise Exception(f'cirrus-extract.py exited with code {completed_process.returncode}')
print(f'cirrus-extract.py completed successfully. Saved results to {output_dir}')

cirrus-extract.py completed successfully. Saved results to cirrus_extracts


## Alternative approach with wp2txt

See https://github.com/yohasebe/wp2txt 
- Requires a functional non-system Ruby. On macOS, try these instructions:
  - https://www.moncefbelyamani.com/how-to-install-xcode-homebrew-git-rvm-ruby-on-mac/?utm_source=stackoverflow&utm_campaign=51126403
- Only install wp2txt once you have a functional non-system Ruby

In [None]:
# Ensure you have a non-system Ruby and RubyGems on your PATH.
# Neither command below should print /usr/bin/ruby or /usr/bin/gem.
!which ruby 
!which gem

In [None]:
# Install the wp2txt gem (run this only once a non-system Ruby is set up; see the notes above this cell).
!gem install wp2txt

In [None]:
# make needed directories
!mkdir xml 
!mkdir text
!mkdir category 
!mkdir summary 

In [None]:
# Paths to the latest English Wikipedia pages-articles dump
# (same values as in the WikiExtractor section, repeated so this section can be run on its own).
# Remote URL of the bz2-compressed dump:
LATEST_WP_DUMP_WEB = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2'
# Local filename of the compressed dump (the name wget saves the URL above under):
LATEST_WP_DUMP_LOCAL = 'enwiki-latest-pages-articles.xml.bz2'

In [None]:
# Download the most recent Wikipedia dump (only when download_new_dumps is set).
if download_new_dumps:
    print('Downloading latest Wikipedia dump...')
    # -f: do not error when there is no previous download to remove.
    subprocess.run(['rm', '-f', LATEST_WP_DUMP_LOCAL])
    # check=True raises CalledProcessError if wget fails, instead of silently
    # continuing with a missing or partial dump.
    subprocess.run(['wget', LATEST_WP_DUMP_WEB], check=True)

In [None]:
# decompress and split into intermediate XML files
# may require SUDO
!wp2txt --no-convert -i ./enwiki-20220801-pages-articles.xml.bz2 -o ./xml

In [None]:
# Extract all text, category info, and summary from the intermediate XML files,
# one wp2txt pass per output directory (./text, ./category via -g, ./summary via -s).
# May require sudo.
!wp2txt -i ./xml -o ./text
!wp2txt -g -i ./xml -o ./category
!wp2txt -s -i ./xml -o ./summary