In [1]:
import bz2

data_path = "idwiki-20240201-pages-articles.xml.bz2"
data_path

'idwiki-20240201-pages-articles.xml.bz2'

# Parsing XML

Menggunakan library bz2
Sumber: https://github.com/WillKoehrsen/wikipedia-data-science/tree/master

In [2]:
lines = []

for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    lines.append(line)
    if i > 1e6:
        break

lines[:1000]

[b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="id">\n',
 b'  <siteinfo>\n',
 b'    <sitename>Wikipedia</sitename>\n',
 b'    <dbname>idwiki</dbname>\n',
 b'    <base>https://id.wikipedia.org/wiki/Halaman_Utama</base>\n',
 b'    <generator>MediaWiki 1.42.0-wmf.15</generator>\n',
 b'    <case>first-letter</case>\n',
 b'    <namespaces>\n',
 b'      <namespace key="-2" case="first-letter">Media</namespace>\n',
 b'      <namespace key="-1" case="first-letter">Istimewa</namespace>\n',
 b'      <namespace key="0" case="first-letter" />\n',
 b'      <namespace key="1" case="first-letter">Pembicaraan</namespace>\n',
 b'      <namespace key="2" case="first-letter">Pengguna</namespace>\n',
 b'      <namespace key="3" case="first-letter">Pembicaraan Pengguna</namespace>\n',
 b'      <namespace ke

In [3]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))

## Mendapatkan judul dari artikel

In [4]:
handler = WikiXmlHandler()

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    parser.feed(line)
    if len(handler._pages) > 2:
        break

print([x[0] for x in handler._pages])

['Asam deoksiribonukleat', 'Asam Deoksiribosanukleat', 'Anwar Sadat']


# Parsing Artikel

Menggunakan library mwparserfromhell, untuk mendapatkan kategori setiap halaman

In [5]:
import mwparserfromhell 

print(handler._pages[0][0])

# Create the wiki article
wiki = mwparserfromhell.parse(handler._pages[0][1])

Asam deoksiribonukleat


In [6]:
wikilinks = wiki.filter_wikilinks()
for wikilink in wikilinks:
    if wikilink.startswith("[[Kategori:"):
        print(wikilink)

[[Kategori:Genetika molekular|Nukleat asam DNA]]
[[Kategori:Asam nukleat]]


In [7]:
templates = wiki.filter_templates(matches="cite journal")
for template in templates:
    print(template)

{{cite journal |author=Mashaghi A, Katan A |title=A physicist's view of DNA |journal=De Physicus|volume=24e |issue=3 |pages=59–61 |year=2013 | arxiv= 1311.2545v1 |bibcode=2013arXiv1311.2545M }}
{{cite journal | author = Ghosh A, Bansal M | title = A glossary of DNA structures from A to Z | journal = Acta Crystallogr D | volume = 59 | issue = 4 | pages = 620–6 | year = 2003 | pmid = 12657780 | doi = 10.1107/S0907444903003251 }}
{{cite journal | author = Watson JD, Crick FH | title = A Structure for Deoxyribose Nucleic Acid | journal = Nature | volume = 171 | issue = 4356 | pages = 737–738 | year = 1953 | pmid = 13054692 | doi = 10.1038/171737a0 | url = http://www.nature.com/nature/dna50/watsoncrick.pdf | format = PDF | accessdate = 4 May 2009 | bibcode = 1953Natur.171..737W }}
{{cite journal | author = Yakovchuk P, Protozanova E, Frank-Kamenetskii MD | title = Base-stacking and base-pairing contributions into thermal stability of the DNA double helix | journal = Nucleic Acids Res. | vol

Mendapatkan halaman kategori Sejarah Indonesia

In [8]:
from timeit import default_timer as timer
from tqdm import tqdm
import os
import csv

In [9]:
class SejarahIndonesiaHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page' and self._values['title'] == 'Kategori:Sejarah Indonesia':
            self._pages.append((self._values['title'], self._values['text']))

In [10]:
sihandler = SejarahIndonesiaHandler()

siparser = xml.sax.make_parser()
siparser.setContentHandler(sihandler)

In [11]:
start = timer()
for i, line in enumerate(tqdm(bz2.BZ2File(data_path, 'r'))):
    siparser.feed(line)
end = timer()

print(f'Waktu run: {round(end - start)} detik')

2157it [00:00, 21466.94it/s]

85572820it [13:41, 104121.39it/s]

Waktu run: 822 detik





In [12]:
# Create the wiki article
wiki = mwparserfromhell.parse(sihandler._pages[0][1])
wiki

'{{Commonscat|History of Indonesia}} \n {{cattree|Sejarah Indonesia}} \n {{Sejarahnegara|negara=Indonesia|benua=Asia Tenggara}} \n {{Indonesia|navbar=plain|prefix=:Kategori:Sejarah|title=Daftar sejarah di Indonesia menurut provinsi (kategori)|image=}}'

In [13]:
sihandler._pages[0][1]

'{{Commonscat|History of Indonesia}} \n {{cattree|Sejarah Indonesia}} \n {{Sejarahnegara|negara=Indonesia|benua=Asia Tenggara}} \n {{Indonesia|navbar=plain|prefix=:Kategori:Sejarah|title=Daftar sejarah di Indonesia menurut provinsi (kategori)|image=}}'

Permasalahan:
1. Tidak dapat mengambil subkategori dari halaman kategori Sejarah Indonesia.
2. Beberapa halaman tidak memiliki informasi super-kategori dari setiap kategori. ex: https://id.wikipedia.org/wiki/Pembantaian_Santa_Cruz tidak memiliki kategori Sejarah Indonesia, padahal Orde Baru merupakan subkategori dari Sejarah Indonesia

Solusi:
1. Menambahkan kategori secara manual

Mendapatkan halaman dengan kategori Orde Baru

In [14]:
def process_article(title, text, timestamp, kategoris = ['Orde Baru', 'Tokoh Orde Baru']):
    wikicode = mwparserfromhell.parse(text)
    wikilinks = wikicode.filter_wikilinks()

    if any(f"[[Kategori:{kategori}]]" in wikilinks for kategori in kategoris):
        templates = wikicode.filter_templates(matches="Infobox|Kotak info")
        properties = None

        if len(templates) >= 1:
            properties = {param.name.strip_code().strip(): param.value.strip_code().strip()
                          for param in templates[0].params
                          if param.value.strip_code().strip()}
        return (title, text, timestamp, properties)
    return None

class OrdeBaruHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []
        self._infobox_pages = []
        self._infoboxless_pages = []
        self._article_count = 0

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        elif name == 'page':
            self._article_count += 1
            page = process_article(**self._values)
            if page:
                # print(page[0], end=" ")
                if not page[-1]:
                    # print("Tidak ada infobox")
                    self._infoboxless_pages.append(page)
                else:
                    # print("Ada infobox")
                    self._infobox_pages.append(page)
                self._pages.append(page)

In [15]:
obhandler = OrdeBaruHandler()

obparser = xml.sax.make_parser()
obparser.setContentHandler(obhandler)

In [16]:
start = timer()
for i, line in enumerate(tqdm(bz2.BZ2File(data_path, 'r'))):
    obparser.feed(line)
end = timer()

print(f'Waktu run: {round(end - start)} detik')

85572820it [2:18:48, 10275.01it/s] 

Waktu run: 8328 detik





In [17]:
print(f'Jumlah artikel: {obhandler._article_count}')

Jumlah artikel: 1660717


In [20]:
print(f'Artikel tanpa infobox:')
for page in obhandler._infoboxless_pages[:10]:
    print(page)
    
with open('infoboxless_pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)
    
    writer.writerow(["Title", "Text", "Timestamp", "Properties"])
    
    for row in obhandler._infoboxless_pages:
        writer.writerow(row)

Artikel tanpa infobox:
('Kelompencapir', "'''Kelompencapir''', yang merupakan singkatan dari '''Kelompok Pendengar, Pembaca, dan Pemirsa''', adalah kegiatan pertemuan untuk [[petani]] dan [[nelayan]] di [[Indonesia]] yang dicetuskan pada masa pemerintahan [[Suharto|Presiden Suharto]]. Kegiatan ini mengikutkan petani-petani berprestasi dari berbagai daerah. Mereka diadu kepintaran dan pengetahuannya seputar pertanian, antara lain soal cara bertanam yang baik dan pengetahuan tentang pupuk dengan model mirip [[cerdas cermat]]. \n \n Program ini ikut andil kala Indonesia mencapai swasembada pangan dan mendapatkan penghargaan dari [[FAO]] pada tahun [[1984]]. \n \n == Referensi == \n {{reflist}} \n \n == Pranala luar == \n * [http://kominfo.jatimprov.go.id/watchmn/130 Kelompok Informasi Masyarakat (KIM) - Kominfo Jatim] {{Webarchive|url=https://web.archive.org/web/20120808021828/http://kominfo.jatimprov.go.id/watchmn/130 |date=2012-08-08 }} \n \n [[Kategori:Komunikasi massa]] \n [[Kategori:

In [21]:
print("Artikel dengan infobox:")
for page in obhandler._infobox_pages[:10]:
    print(page)
    
with open('infobox_pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)
    
    writer.writerow(["Title", "Text", "Timestamp", "Properties"])
    
    for row in obhandler._infobox_pages:
        writer.writerow(row)

Artikel dengan infobox:
('Hamengkubuwana IX', '{{Infobox officeholder \n | honorific_prefix    = Ingkang Sinuwun Sri Sultan \n | name                = Hamengkubuwana IX < br > {{jav|ꦲꦩꦼꦁꦏꦸꦨꦸꦮꦤ꧇꧙꧇}} \n | image               = Hamengkubawono IX Official Portrait.jpg \n | caption             = Sri Sultan Hamengkubuwana IX \n | office              = Sultan Yogyakarta \n | order               = ke-9 \n | term_start          = 18 Maret 1940 \n | term_end            = 2 Oktober 1988 \n | term_label          = Mulai bertakhta \n | predecessor         = [[Hamengkubuwana VIII]] \n | successor           = [[Hamengkubuwana X]] \n | office2             = Wakil Presiden Indonesia \n | order2              = ke-2 \n | term_start2         = 23 Maret 1973 \n | term_end2           = 23 Maret 1978 \n | president2          = [[Soeharto]] \n | predecessor2        = [[Mohammad Hatta]] \n | successor2          = [[Adam Malik]] \n | office3             = Daftar Menteri Koordinator Bidang Perekonomian Indonesia