# Download and clean German Wikipedia dump

### Install packages

In [1]:
pip install -r requirements.txt

### Import Packages

In [2]:
# imports

# regex
import re
import sys
import os
import bz2
import requests

# package to read wikipedia dump
import mwxml
# packages for cleaning the data
import html2text
import wikitextparser as wtp

# packages for multithreading
from threading import Thread


### Static Variables / Config

In [24]:
# Static configuration shared by the cells below.
# URL of the latest bz2-compressed pages-articles dump of the German Wikipedia
DUMP_URL = 'https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2'
# Local path where the compressed dump is stored after download
DUMP_FILE_ZIP = './dewiki-latest-pages-articles.xml.bz2'
# Local path of the decompressed XML dump
DUMP_FILE_ENTPACKT = './dewiki-latest-pages-articles.xml'
# Folder that receives one cleaned .txt file per matching article
EXZELLENT_FOLDER = './exzellent'


### Funktionen definieren

In [25]:
def clean_save_article(page, revision):
    """Reduce one article revision to plain text and store it as a file.

    The wikitext in ``revision.text`` is converted to plain text, passed
    through html2text, collapsed onto a single line and written to
    ``<EXZELLENT_FOLDER>/<page.id>.txt``. The first line of that file is the
    page title, the second line is the cleaned text.

    Args:
        page: Page object; ``page.id`` and ``page.title`` are read.
        revision: Revision object; ``revision.text`` holds the wikitext.

    Raises:
        FileExistsError: If the target file already exists (the file is
            opened in exclusive-creation mode ``"x"``).
    """
    text = revision.text

    # Cleaning steps taken from: https://github.com/daveshap/PlainTextWikipedia
    # Strip wiki markup
    text = wtp.parse(text).plain_text()
    # Remove HTML
    text = html2text.html2text(text)
    # Replace literal backslash-n sequences (real newlines are handled below)
    text = text.replace('\\n', ' ')
    # Collapse every run of whitespace (including newlines) to one space;
    # raw string avoids the invalid "\s" escape warning
    text = re.sub(r'\s+', ' ', text)
    # End of the adopted cleaning steps

    target_path = os.path.join(EXZELLENT_FOLDER, str(page.id) + '.txt')
    # Explicit UTF-8 so umlauts and other non-ASCII characters are written
    # correctly regardless of the platform's default encoding
    with open(target_path, "x", encoding="utf-8") as f:
        f.write(page.title + "\n" + text)

### XML Dump herunterladen und chunkweise abspeichern

Herunterladen des Wikipedia-Dumps mit allen deutschsprachigen Artikeln von Wikimedia.

In [None]:
# Helper for downloading a file
def download_file(url, file_path, chunk_size=1024 * 1024, timeout=60):
    """Stream the resource at ``url`` to ``file_path``.

    The body is written chunk by chunk, so the whole file is never held in
    memory.

    Args:
        url: Address of the resource to download.
        file_path: Destination path; an existing file is overwritten.
        chunk_size: Number of bytes requested per chunk.
        timeout: Seconds to wait for the connection and for each read before
            giving up (passed through to ``requests.get``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # The context manager releases the connection even if writing fails
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before touching the target file so an HTTP error page is
        # never saved as if it were the dump
        response.raise_for_status()
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                # Skip empty keep-alive chunks
                if chunk:
                    file.write(chunk)

# Download the Wikipedia article dump from DUMP_URL and store it as DUMP_FILE_ZIP
download_file(DUMP_URL, DUMP_FILE_ZIP)

### XML Dump entpacken

In [None]:
# Decompress the bz2 archive into a plain XML file, streaming it in
# 100 KiB pieces so the dump is never loaded into memory as a whole.
with open(DUMP_FILE_ENTPACKT, 'wb') as xml_out:
    with bz2.BZ2File(DUMP_FILE_ZIP, 'rb') as archive:
        while True:
            block = archive.read(100 * 1024)
            # An empty read marks the end of the compressed stream
            if block == b'':
                break
            xml_out.write(block)

In [None]:
# Scan the decompressed dump for articles carrying the "Exzellent" template
# and hand each hit to clean_save_article in its own thread.
# (original note on this cell: pythonhosted.org)
pattern = r"\{\{Exzellent\|"
# Compile once instead of resolving the pattern for every revision
excellent_re = re.compile(pattern)

# Explicit UTF-8 and a context manager: the dump is decoded the same way on
# every platform and the file handle is closed when the scan is finished
with open(DUMP_FILE_ENTPACKT, encoding='utf-8') as dump_file:
    dump = mwxml.Dump.from_file(dump_file)

    print("### Wikipedia Dump ###")
    print(dump.site_info.name, dump.site_info.dbname)

    print("### Exzelente Aritkel ###")
    # exist_ok avoids the check-then-create race of a separate isdir() test
    os.makedirs(EXZELLENT_FOLDER, exist_ok=True)
    excellent_count = 0
    workers = []

    # Nested page/revision loop adapted from an external example
    for page in dump:
        for revision in page:
            if revision.text is None:
                continue
            if excellent_re.search(revision.text) is None:
                continue
            # Found an excellent article
            excellent_count += 1
            worker = Thread(target=clean_save_article, args=(page, revision))
            worker.start()
            workers.append(worker)
            # '\r' rewrites the same console line as a live counter
            sys.stdout.write('\rGefundene exzellente Artikel: ' + str(excellent_count))
            sys.stdout.flush()

# Wait for all writers so every file exists before the final count is reported
for worker in workers:
    worker.join()

print('')
print('Anzahl der exzellenten Artikel: ', str(excellent_count))

### Wikipedia Dump ###
Wikipedia dewiki
### Exzelente Aritkel ###
Gefundene exzellente Artikel: 2793Anzahl der exzellenten Artikel:  2793
