In [1]:
import bz2
import subprocess
import xml.sax
import mwparserfromhell
import networkx as nx
import matplotlib.pyplot as plt
import gc
import json
import os
from functools import partial
from multiprocessing import Pool 
import tqdm 
from timeit import default_timer as timer

In [2]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))

In [3]:
def find_links(data_path, save = True):
    handler = WikiXmlHandler()
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    for i, line in enumerate(subprocess.Popen(['bzcat'], 
                             stdin = open(f'./partitioned-data/{data_path}'), 
                             stdout = subprocess.PIPE).stdout):
        try:
            parser.feed(line)
        except StopIteration:
            break
    
    if save:
        new_data_path = data_path[0:data_path.find('xml')-1] + '-' + data_path[data_path.find('xml') + 4:-4]
        m = {}
        for page in handler._pages:
            wikipage = mwparserfromhell.parse(page[1])
            wikilinks = [str(x.title) for x in wikipage.filter_wikilinks()]
            m[page[0]] = wikilinks
            
        with open(f'./json-data/{new_data_path}.json', 'w') as fout:
            fout.write(json.dumps(m))

        partition_dir = './json-data'
        print(f'{ len(os.listdir(partition_dir)) } files processed.', end = '\r')

    # Memory management
    del handler
    del parser
    gc.collect()
    return None

In [30]:
# partitions = os.listdir('./partitioned-data')
# partitions.remove('.DS_Store')
partitions = ['enwiki-latest-pages-articles16.xml-p11018049p11539266.bz2',
 'enwiki-latest-pages-articles18.xml-p15193074p16120542.bz2',
 'enwiki-latest-pages-articles3.xml-p88445p200509.bz2',
 'enwiki-latest-pages-articles23.xml-p28323661p29823660.bz2',
 'enwiki-latest-pages-articles19.xml-p16120543p17620542.bz2',
 'enwiki-latest-pages-articles21.xml-p22722157p23927983.bz2',
 'enwiki-latest-pages-articles27.xml-p57663462p59163461.bz2',
 'enwiki-latest-pages-articles27.xml-p47163462p48663461.bz2',
 'enwiki-latest-pages-articles22.xml-p25427984p26823660.bz2',
 'enwiki-latest-pages-articles27.xml-p50163462p51663461.bz2',
 'enwiki-latest-pages-articles27.xml-p53163462p54663461.bz2',
 'enwiki-latest-pages-articles27.xml-p56163462p57663461.bz2',
 'enwiki-latest-pages-articles26.xml-p39567203p41067202.bz2',
 'enwiki-latest-pages-articles21.xml-p21222157p22722156.bz2',
 'enwiki-latest-pages-articles7.xml-p892913p1268691.bz2',
 'enwiki-latest-pages-articles26.xml-p38067203p39567202.bz2',
 'enwiki-latest-pages-articles18.xml-p13693074p15193073.bz2',
 'enwiki-latest-pages-articles9.xml-p1791080p2336422.bz2',
 'enwiki-latest-pages-articles8.xml-p1268692p1791079.bz2',
 'enwiki-latest-pages-articles10.xml-p2336423p3046512.bz2',
 'enwiki-latest-pages-articles15.xml-p7744801p9244800.bz2',
 'enwiki-latest-pages-articles13.xml-p5040437p6197594.bz2',
 'enwiki-latest-pages-articles12.xml-p3926862p5040436.bz2',
 'enwiki-latest-pages-articles4.xml-p200510p352689.bz2']
npartitions = []
for data_path in partitions:
    if data_path[0:data_path.find('xml')-1] + '-' + data_path[data_path.find('xml') + 4:-4] + '.json' in os.listdir('./json-data'):
        partitions.remove(data_path)
    else:
        npartitions += [data_path]

partitions = npartitions
# print(len(partitions))
# partitions = partitions[0:8]

In [32]:
# Create a pool of workers to execute processes
pool = Pool(processes = 8)

start = timer()

# Map (service, tasks), applies function to each partition
results = pool.map(find_links, partitions)

pool.close()
pool.join()

end = timer()
print(f'{end - start} seconds elapsed.')

40 files processed.34 files processed.35 files processed.36 files processed.37 files processed.39 files processed.38 files processed.4436.835368284 seconds elapsed.


In [None]:
# start = timer()
# pool = Pool(processes = 8)
# results = []

# # Run partitions in parallel
# for x in tqdm.tqdm_notebook(pool.imap_unordered(find_links, partitions), total = len(partitions)):
#     results.append(x)
    
# pool.close()
# pool.join()

# end = timer()
# print(f'{end - start} seconds elapsed.')

In [34]:
set([data_path[0:data_path.find('xml')-1] + '-' + data_path[data_path.find('xml') + 4:-4] + '.json' for data_path in os.listdir('./partitioned-data/')]) - set(os.listdir('./json-data'))

{'.DS_Sto-_S.json',
 'enwiki-latest-pages-articles11-p3046513p3926861.json',
 'enwiki-latest-pages-articles14-p6197595p7697594.json',
 'enwiki-latest-pages-articles16-p9518049p11018048.json',
 'enwiki-latest-pages-articles17-p11539267p13039266.json',
 'enwiki-latest-pages-articles22-p23927984p25427983.json',
 'enwiki-latest-pages-articles23-p26823661p28323660.json',
 'enwiki-latest-pages-articles24-p30503451p32003450.json',
 'enwiki-latest-pages-articles24-p32003451p33503450.json',
 'enwiki-latest-pages-articles25-p33952816p35452815.json',
 'enwiki-latest-pages-articles25-p35452816p36952815.json',
 'enwiki-latest-pages-articles26-p41067203p42567202.json',
 'enwiki-latest-pages-articles27-p42663462p44163461.json',
 'enwiki-latest-pages-articles27-p44163462p45663461.json',
 'enwiki-latest-pages-articles27-p48663462p50163461.json',
 'enwiki-latest-pages-articles27-p51663462p53163461.json',
 'enwiki-latest-pages-articles27-p54663462p56163461.json',
 'enwiki-latest-pages-articles27-p5916346

In [35]:
find_links('enwiki-latest-pages-articles4.xml-p200510p352689.bz2')

40 files processed.