In [1]:
import bz2

data_path = "idwiki-20240201-pages-articles.xml.bz2"
data_path

'idwiki-20240201-pages-articles.xml.bz2'

# Parsing XML

Menggunakan library bz2
Sumber: https://github.com/WillKoehrsen/wikipedia-data-science/tree/master

In [2]:
lines = []

for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    lines.append(line)
    if i > 1e6:
        break

lines[:1000]

[b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="id">\n',
 b'  <siteinfo>\n',
 b'    <sitename>Wikipedia</sitename>\n',
 b'    <dbname>idwiki</dbname>\n',
 b'    <base>https://id.wikipedia.org/wiki/Halaman_Utama</base>\n',
 b'    <generator>MediaWiki 1.42.0-wmf.15</generator>\n',
 b'    <case>first-letter</case>\n',
 b'    <namespaces>\n',
 b'      <namespace key="-2" case="first-letter">Media</namespace>\n',
 b'      <namespace key="-1" case="first-letter">Istimewa</namespace>\n',
 b'      <namespace key="0" case="first-letter" />\n',
 b'      <namespace key="1" case="first-letter">Pembicaraan</namespace>\n',
 b'      <namespace key="2" case="first-letter">Pengguna</namespace>\n',
 b'      <namespace key="3" case="first-letter">Pembicaraan Pengguna</namespace>\n',
 b'      <namespace ke

In [3]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))

## Mendapatkan judul dari artikel

In [4]:
handler = WikiXmlHandler()

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    parser.feed(line)
    if len(handler._pages) > 2:
        break

print([x[0] for x in handler._pages])

['Asam deoksiribonukleat', 'Asam Deoksiribosanukleat', 'Anwar Sadat']


# Parsing Artikel

Menggunakan library mwparserfromhell, untuk mendapatkan kategori setiap halaman

In [5]:
import mwparserfromhell 

print(handler._pages[0][0])

# Create the wiki article
wiki = mwparserfromhell.parse(handler._pages[0][1])

Asam deoksiribonukleat


In [6]:
templates = wiki.filter_wikilinks()
for template in templates:
    if template.startswith("[[Kategori:"):
        print(template)

[[Kategori:Genetika molekular|Nukleat asam DNA]]
[[Kategori:Asam nukleat]]


Mendapatkan halaman kategori Sejarah Indonesia

In [7]:
from timeit import default_timer as timer
from tqdm import tqdm
import os

In [8]:
class SejarahIndonesiaHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page' and self._values['title'] == 'Kategori:Sejarah Indonesia':
            self._pages.append((self._values['title'], self._values['text']))

In [9]:
sihandler = SejarahIndonesiaHandler()

siparser = xml.sax.make_parser()
siparser.setContentHandler(sihandler)

In [11]:
start = timer()
for i, line in enumerate(tqdm(bz2.BZ2File(data_path, 'r'))):
    siparser.feed(line)
end = timer()

print(f'Waktu run: {round(end - start)} detik')

2487458it [00:19, 154903.63it/s]

In [None]:
# Create the wiki article
wiki = mwparserfromhell.parse(sihandler._pages[0][1])
wiki

'{{Commonscat|History of Indonesia}} \n {{cattree|Sejarah Indonesia}} \n {{Sejarahnegara|negara=Indonesia|benua=Asia Tenggara}} \n {{Indonesia|navbar=plain|prefix=:Kategori:Sejarah|title=Daftar sejarah di Indonesia menurut provinsi (kategori)|image=}}'

In [None]:
sihandler._pages[0][1]

'{{Commonscat|History of Indonesia}} \n {{cattree|Sejarah Indonesia}} \n {{Sejarahnegara|negara=Indonesia|benua=Asia Tenggara}} \n {{Indonesia|navbar=plain|prefix=:Kategori:Sejarah|title=Daftar sejarah di Indonesia menurut provinsi (kategori)|image=}}'

Permasalahan:
1. Tidak dapat mengambil subkategori dari halaman kategori Sejarah Indonesia.
2. Beberapa halaman tidak memiliki informasi super-kategori dari setiap kategori. ex: https://id.wikipedia.org/wiki/Pembantaian_Santa_Cruz tidak memiliki kategori Sejarah Indonesia, padahal Orde Baru merupakan subkategori dari Sejarah Indonesia

Solusi:
1. Menambahkan kategori secara manual

Mendapatkan halaman dengan kategori Orde Baru

In [None]:
def process_article(title, text, timestamp, kategoris = ['Order Baru', 'Tokoh Orde Baru']):
    wikicode = mwparserfromhell.parse(text)
    wikilinks = wikicode.filter_wikilinks()

    if any(f"[[Kategori:{kategori}]]" in wikilinks for kategori in kategoris):
        templates = wikicode.filter_templates(matches="^Infobox")
        properties = None

        if len(templates) >= 1:
            properties = {param.name.strip_code().strip(): param.value.strip_code().strip()
                          for param in templates[0].params
                          if param.value.strip_code().strip()}
        return (title, text, timestamp, properties)
    return None

class OrdeBaruHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []
        self._infobox_pages = []
        self._infoboxless_pages = []
        self._article_count = 0

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        elif name == 'page':
            self._article_count += 1
            page = process_article(**self._values)
            if page:
                # print(page[0], end=" ")
                if page[-1]:
                    # print("Tidak ada infobox")
                    self._infoboxless_pages.append(page)
                else:
                    # print("Ada infobox")
                    self._infobox_pages.append(page)
                self._pages.append(page)

In [None]:
obhandler = OrdeBaruHandler()

obparser = xml.sax.make_parser()
obparser.setContentHandler(obhandler)

In [None]:
start = timer()
for i, line in enumerate(tqdm(bz2.BZ2File(data_path, 'r'))):
    obparser.feed(line)
end = timer()

print(f'Waktu run: {round(end - start)} detik')

Hamengkubuwana IX Ada infobox
Soeharto Ada infobox
Bob Hasan Ada infobox
Siti Hardijanti Rukmana Ada infobox
Siti Hartinah Ada infobox
Try Sutrisno Ada infobox
Abdul Haris Nasution Ada infobox
Basuki Rahmat Ada infobox
R. Hartono Ada infobox
Akbar Tanjung Ada infobox
Soedharmono Ada infobox
Wiranto Ada infobox
Harmoko Ada infobox
Tommy Soeharto Ada infobox
Radius Prawiro Ada infobox
Ali Moertopo Ada infobox
Probosutedjo Ada infobox
Sudono Salim Ada infobox


In [None]:
print(f'Jumlah artikel: {obparser._article_count}')

print(f'\nArtikel tanpa infobox: {obparser._infoboxless_pages}')

In [None]:
for page in obparser._infobox_pages:
    print(page)