# Dowload and clean german Wikipedia dump

Dieses Notebook ist das Hauptnotebook, welches den Wikipedia Dump herunterlädt, diesen aufsplittet in exzellente und nicht exzellente artikel und anschließend eine grundlegende Datenaufbereitung durchführt. Die Aufgaben 1 & 2 sind für eine bessere Übersicht in den folgenden seperaten Notebooks bearbeitet worden:

Aufgabe 1: [Klassifizierung der Artikel](classification.ipynb)

Aufgabe 2: [Keyword extraktion](keywords.ipynb)

### Install packages

In [7]:
! pip install -r requirements.txt

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.9 -m pip install --upgrade pip[0m


### Import Packages

In [1]:
# imports

# regex
import re
import sys
import os
import bz2
import requests
import shutil
import csv

# package to read wikipedia dump
import mwxml
# packages for cleaning the data
import html2text
import wikitextparser as wtp

import textstat

# packages for multithreading
from threading import Thread


### Static Variables / Config

In [2]:
# static var
DUMP_URL = 'https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2'
DUMP_FILE_ZIP = './dewiki-latest-pages-articles.xml.bz2'
DUMP_FILE_ENTPACKT = './dewiki-latest-pages-articles.xml'

EXZELLENT_FOLDER = './data/exzellent'
NOT_EXZELLENT_FOLDER = './data/not_exzellent'
SUBSET_FOLDER = './data/subset'

CSV_FILE = './articles_meta.csv'

### Download XML Dump herunterladen und Chunkweise abspeichern

Herunterladen des Wikipedia Dumps mit allen deutschsprachigen Artikeln von wikimedia.

In [None]:
# Funktion zum Herunterladen der Datei
def download_file(url, file_path):
    response = requests.get(url, stream=True)
    with open(file_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)

# Herunterladen des Wikipedia-Artikeldumps
download_file(DUMP_URL, DUMP_FILE_ZIP)

### XML Dump entpacken

In [11]:
# unzip the xml-dump and save it
with open(DUMP_FILE_ENTPACKT, 'wb') as new_file, bz2.BZ2File(DUMP_FILE_ZIP, 'rb') as file:
    for data in iter(lambda : file.read(100 * 1024), b''):
        new_file.write(data)

### Artikel aufbereiten und sortieren nach Label

Ein Beispiel-Artikel vor der Aufbereitung kann hier betrachtet werden: [explanation/160.xml](./explanation/160.xml) 

Der selbe Artikel nach der Aufbereitung finden Sie hier: [explanation/160.txt](./explanation/160.txt)

In [3]:
class CleanSaveArticleThread(Thread):
    def __init__(self, *args):
        Thread.__init__(self)
        self.page = args[0]
        self.revision = args[1]
        self.is_excellent = args[2]

        self.number_images = 0
        self.number_citations = 0
        self.number_headers = 0
        self.number_links = 0
        self.number_categories = 0

        self.saved = False

        textstat.set_lang("de")

    # override the run function
    def run(self):
        
        text = self.revision.text

        # filter if article is only redirect and has no text 
        PATTERN_REDIRECT = r"(#REDIRECT|#redirect|#WEITERLEITUNG)"
        
        if re.search(PATTERN_REDIRECT, self.revision.text):
            # with open(os.path.join('./data/trash', str(page.id) + '.txt'), "x") as f:
            #     f.write(page.title + "\n" + text)
            self.saved = False
            return


        # feature extraction for classification task
        # count images in article
        PATTERN_IMAGES = r"\[\[Datei:[^\]]+\.(?:jpg|png|svg)[^\]]+\]\]"
        self.number_images = len(re.findall(PATTERN_IMAGES, self.revision.text))

        # count citations in article
        PATTERN_CITATIONS = r"\/ref"
        self.number_citations = len(re.findall(PATTERN_CITATIONS, self.revision.text))

        # count headers
        PATTERN_HEADER = r"==+ (.*?) ==+"
        self.number_headers = len(re.findall(PATTERN_HEADER, self.revision.text))

        # count link to other wikipedia articles
        PATTERN_LINK = r"\[\[(?!(?:.*\bDatei:\b.*|.*Kategorie:))([^]]+)\]\]"
        self.number_links = len(re.findall(PATTERN_LINK, self.revision.text))

        # count categories of the article
        PATTERN_CATEGORIE = r"\[\[Kategorie:[^\]]+\]\]"
        self.number_categories = len(re.findall(PATTERN_CATEGORIE, self.revision.text))


        # text cleanup
        # entnommen aus: https://github.com/daveshap/PlainTextWikipedia
        try:
            # Plain Text
            text = wtp.parse(text).plain_text()  
            # Remove HTML
            text = html2text.html2text(text)
        
            # Replace newlines
            text = text.replace('\\n', ' ')
            # Replace excess whitespace
            text = re.sub('\s+', ' ', text)
        except:
            self.saved = False
            return
        # end entnommen aus

        # calculate metrics / features for classification task
        # count number of words
        self.number_words = textstat.lexicon_count(text, removepunct=True)

        # count number of scentens
        self.number_scentens = textstat.sentence_count(text)

        try:
            # calculate Wiener Sachtextformel
            self.wiener_sachtextformel = textstat.wiener_sachtextformel(text, variant=1)
        except:
            self.saved = False
            return



        # save articles as txt file in correct folder
        if self.is_excellent:
            # filter excellent label from article (just to be sure is not in article anymore - usually the html2text function filtes these tags)
            text = text.replace('\{\{Exzellent|', '\{\{')
            # set target folder based on label
            target_folder = EXZELLENT_FOLDER
        else: 
            # set target folder based on label
            target_folder = NOT_EXZELLENT_FOLDER
        
        # save in target folder and add Wikipedia title in first line of document
        with open(os.path.join(target_folder, str(self.page.id) + '.txt'), "x") as f:
            f.write(self.page.title + "\n" + text)
            f.close()
            
        self.saved = True

In [4]:
def write_meta_csv(thread: CleanSaveArticleThread):
    # check if article is saved
    if thread.saved:
        # write meta data to csv file
        with open(CSV_FILE, 'a') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([
                thread.page.id, 
                thread.is_excellent, 
                thread.number_images, 
                thread.number_citations, 
                thread.number_headers, 
                thread.number_links, 
                thread.number_categories,
                thread.number_words,
                thread.number_scentens,
                thread.wiener_sachtextformel
                ])
            csv_file.close()

In [10]:
# create folder if not exists
print ("removing existing folders and files")
if os.path.exists(EXZELLENT_FOLDER):
    shutil.rmtree(EXZELLENT_FOLDER)
os.makedirs(EXZELLENT_FOLDER)

if os.path.exists(NOT_EXZELLENT_FOLDER):
    shutil.rmtree(NOT_EXZELLENT_FOLDER)
os.makedirs(NOT_EXZELLENT_FOLDER)

# create csv file for meta data
header = [
    'article_id',
    'is_excellent',
    'number_images',
    'number_citations',
    'number_headers',
    'number_links',
    'number_categories',
    'number_words',
    'number_scentens', 
    'wiener_sachtextformel'
    ]

if os.path.exists(CSV_FILE):
    os.remove(CSV_FILE)

with open(CSV_FILE, 'w+') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)

pattern = r"\{\{Exzellent\|"
dump = mwxml.Dump.from_file(open(DUMP_FILE_ENTPACKT))

excellent_count = 0
not_excellent_count = 0
cleaned_saved = 0

thread_list = []

print("### Wikipedia Dump ###")
print(dump.site_info.name, dump.site_info.dbname)

print("### Exzelente Aritkel ###")
if not os.path.isdir(EXZELLENT_FOLDER):
    os.makedirs(EXZELLENT_FOLDER)
    
# for schleifen entnommen aus: 
for idx_page, page in enumerate(dump):
    for idx_revision, revision in enumerate(page):
        if revision.text is not None:

            x = re.search(pattern, revision.text)
            if x is not None:
                # finde the excellent articles
                excellent_count += 1
                is_excellent = True
            else:
                not_excellent_count += 1
                is_excellent= False

            # start basic cleaning of article in seperated Thread for better performance
            new_thread = CleanSaveArticleThread(page, revision, is_excellent)
            new_thread.start()
            thread_list.append(new_thread)

            new_thread.join()
            
            # update output
            sys.stdout.write('\r -reading- Erfasste Artikel: ' + str(excellent_count + not_excellent_count) + ' davon exzellent: ' + str(excellent_count) + ' gespeichert: ' + str(cleaned_saved))
            sys.stdout.flush()


            # save all information in csv file
            if(len(thread_list) >= 500):
                sys.stdout.write('\r Erfasste Artikel: ' + str(excellent_count + not_excellent_count) + ' davon exzellent: ' + str(excellent_count) + ' gespeichert: ' + str(cleaned_saved))
                sys.stdout.flush()

                for thread in thread_list:
                    # wait until thread ist done
                    thread.join()
                    
                    write_meta_csv(thread)

                    cleaned_saved += 1

                # remove all threads 
                thread_list = []

# save remaining article meta to csv
for thread in thread_list:
    # wait until thread ist done
    thread.join()
    
    write_meta_csv(thread)

    cleaned_saved += 1

print('\n Anzahl der exzellenten Artikel: ', str(excellent_count))

removing existing folders and files
### Wikipedia Dump ###
Wikipedia dewiki
### Exzelente Aritkel ###
 -reading- Erfasste Artikel: 41251 davon exzellent: 563 gespeichert: 41000

### Subset generieren

In [20]:
# number overall articles
SUBSET_SIZE = 3000

if not os.path.exists(SUBSET_FOLDER):
    os.mkdir(SUBSET_FOLDER)

    articles_exzellent = os.listdir(EXZELLENT_FOLDER)
    articles_not_exzellent = os.listdir(NOT_EXZELLENT_FOLDER)

    number_exzellent:int = len(articles_exzellent)
    number_not_exzellent:int = len(articles_not_exzellent)

    ratio:float = float((number_exzellent / number_not_exzellent))

    print("Gesamt Anzahl exzellenter Artikel: ", number_not_exzellent)
    print("Gesamt Anzahl nicht exzellenter Artikel: ", number_exzellent)
    print("Gesamtanzahl Artikel: ", (number_exzellent + number_not_exzellent))

    for idx, article in enumerate(articles_exzellent):
        shutil.copyfile(os.path.join(EXZELLENT_FOLDER,article), os.path.join(SUBSET_FOLDER,'exzellent',article))
        if idx >= int(ratio * SUBSET_SIZE):
            print("Anzahl exzellente Artikel Subset: ", idx + 1)
            break

    for idx, article in enumerate(articles_not_exzellent):
        shutil.copyfile(os.path.join(NOT_EXZELLENT_FOLDER,article), os.path.join(SUBSET_FOLDER,'not_exzellent',article))
        if idx >= int((1 - ratio) * SUBSET_SIZE):
            print("Anzahl nicht exzellente Artikel Subset: ", idx + 1)
            break

Gesamt Anzahl exzellenter Artikel:  2666629
Gesamt Anzahl nicht exzellenter Artikel:  2689
Gesamtanzahl Artikel:  2669318
Anzahl exzellente Artikel Subset:  4
Anzahl nicht exzellente Artikel Subset:  2997
