In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os

In [2]:
PATH_WIKI_XML = 'data/'
FILENAME_WIKI = 'enwiki-latest-pages-articles.xml'
FILENAME_ARTICLES = 'articles.csv'
FILENAME_REDIRECT = 'articles_redirect.csv'
FILENAME_TEMPLATE = 'articles_template.csv'
ENCODING = "utf-8"

In [3]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [4]:
def strip_tag_name(t):
    t = elem.tag
    idx = k = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

In [5]:
pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = os.path.join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathArticlesRedirect = os.path.join(PATH_WIKI_XML, FILENAME_REDIRECT)
pathTemplateRedirect = os.path.join(PATH_WIKI_XML, FILENAME_TEMPLATE)

In [6]:
totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0
title = None
start_time = time.time()

In [23]:
with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathArticlesRedirect, "w", ENCODING) as redirectFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'redirect', 'text'])
    redirectWriter.writerow(['id', 'title', 'redirect'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            if tname == 'page':
                title = ''
                id = -1
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True
        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision:
                id = int(elem.text)
            elif tname == 'redirect':
                redirect = elem.attrib['title']
            elif tname == 'ns':
                ns = int(elem.text)
            elif tname == 'text':
                text = elem.text

            elif tname == 'page':
                totalCount += 1

                if ns == 10:
                    templateCount += 1
                    templateWriter.writerow([id, title])
                elif len(redirect) > 0:
                    articleCount += 1
                    articlesWriter.writerow([id, title, redirect])
                else:
                    redirectCount += 1
                    redirectWriter.writerow([id, title, redirect])

In [24]:
 if totalCount > 1 and (totalCount % 100000) == 0:
        print("{:,}".format(totalCount))

elem.clear()

In [25]:
elapsed_time = time.time() - start_time

print("Total pages: {:,}".format(totalCount))
print("Template pages: {:,}".format(templateCount))
print("Article pages: {:,}".format(articleCount))
print("Redirect pages: {:,}".format(redirectCount))
print("Elapsed time: {}".format(hms_string(elapsed_time)))

Total pages: 101,953
Template pages: 4,696
Article pages: 41,756
Redirect pages: 55,501
Elapsed time: 0:07:40.98


In [26]:
import pandas as pd

In [27]:
articles_df = pd.read_csv('data/articles.csv')

In [28]:
articles_df.head(2)

Unnamed: 0,id,title,redirect,text
0,60663471,Fashion Studies Journal,Parsons School of Design,
1,60663483,Louis James Russell,Louis J. Russell,


In [29]:
templates_df = pd.read_csv('data/articles_template.csv')

In [30]:
templates_df.head()

Unnamed: 0,id,title
0,60663814,Template:2019 Conference Carolinas men's volle...
1,60663980,Template:Taxonomy/Calippus
2,60664005,Template:United States FIFA World Cup record
3,60664024,Template:Atlantic Sun Conference baseball navbox
4,60664050,Template:LDS Temple/Praia Cape Verde Temple


In [31]:
redirect_df = pd.read_csv('data/articles_redirect.csv')

In [32]:
redirect_df.head()

Unnamed: 0,id,title,redirect
0,60663466,File:Panic Switch - Silversun Pickups.ogg,
1,60663474,Wikipedia:Miscellany for deletion/Portal:Haske...,
2,60663475,Louis J. Russell,
3,60663490,File:Tulia by Tulia.jpg,
4,60663492,File:All in a Nutshell.jpg,
