# Category-Based Indexing of idwiki Dump Pages

## Kelas dan fungsi yang diperlukan

Berikut adalah kelas dan fungsi yang digunakan untuk melakukan *indexing*. Kode ini diambil dengan beberapa modifikasi dari Tugas Pemrograman mata kuliah Perolehan Informasi (IR) dan *repository* [`wikipedia-data-science`](https://github.com/WillKoehrsen/wikipedia-data-science/blob/master/notebooks/Downloading%20and%20Parsing%20Wikipedia%20Articles.ipynb).

In [4]:
class IdMap:
    """Bidirectional mapping between strings and dense integer ids.

    Ids are handed out in insertion order starting at 0.  Indexing with a
    string returns its id (a new id is assigned on first sight); indexing
    with an int returns the string stored under that id.
    """

    def __init__(self):
        self.str_to_id = {}  # string -> id
        self.id_to_str = []  # id -> string (the list index is the id)

    def __len__(self):
        """Return the number of strings that have been assigned an id."""
        return len(self.id_to_str)

    def __contains__(self, key):
        """Return True if the string ``key`` already has an id.

        Without this method ``key in id_map`` falls back to scanning
        ``id_map[0], id_map[1], ...`` linearly.  Only strings can be members
        (that fallback compared ``key`` against the stored strings), and the
        test never assigns a new id.
        """
        return isinstance(key, str) and key in self.str_to_id

    def __get_id(self, s):
        """Return the id of ``s``, assigning the next free id if it is new."""
        if s not in self.str_to_id:
            self.str_to_id[s] = len(self.id_to_str)
            self.id_to_str.append(s)

        return self.str_to_id[s]

    def __get_str(self, i):
        """Return the string stored under id ``i`` (IndexError if unknown)."""
        return self.id_to_str[i]

    def __getitem__(self, key):
        """Look up by id (int key) or by string (str key).

        Returns:
            The string for an int key, the id for a str key, or ``None`` for
            any other key type.
        """
        if isinstance(key, int):
            return self.__get_str(key)
        elif isinstance(key, str):
            return self.__get_id(key)

        return None


def sort_union_list(list_A, list_B):
    """Merge two ascending lists into a single ascending list.

    When the current heads of both lists are equal, the value is emitted
    once and both lists advance; otherwise the smaller head is emitted.

    Args:
        list_A: First list, sorted ascending.
        list_B: Second list, sorted ascending.

    Returns:
        A new list holding the merged result.
    """
    merged = []
    i, j = 0, 0
    len_a, len_b = len(list_A), len(list_B)

    while i < len_a and j < len_b:
        head_a, head_b = list_A[i], list_B[j]
        if head_a == head_b:
            # Same value on both sides: keep one copy, advance both.
            merged.append(head_a)
            i += 1
            j += 1
        elif head_a < head_b:
            merged.append(head_a)
            i += 1
        else:
            merged.append(head_b)
            j += 1

    # At most one of the two tails is non-empty here.
    merged.extend(list_A[i:])
    merged.extend(list_B[j:])

    return merged

In [5]:
import array

class VBEPostings:
    """Variable-byte encoding (VBE) of gap-compressed postings lists.

    A postings list is first turned into gaps (first element, then the
    difference between consecutive elements).  Each gap is written as
    base-128 digits, most significant first, one digit per byte; the last
    byte of every number has its high bit (128) set as a terminator.
    """

    @staticmethod
    def encode(postings_list):
        """Encode an ascending list of non-negative ints into bytes.

        Args:
            postings_list: Ids in ascending order.

        Returns:
            The encoded bytes (empty bytes for an empty list).

        Raises:
            ValueError: If a gap is negative, i.e. the list is not ascending
                or starts with a negative id.
        """
        if not postings_list:
            return array.array('B', []).tobytes()

        gap_based = [postings_list[0]]
        gap_based.extend(
            current - previous
            for previous, current in zip(postings_list, postings_list[1:])
        )

        return VBEPostings.vb_encode(gap_based)

    @staticmethod
    def vb_encode(list_of_numbers):
        """Concatenate the VB encoding of every number in the list."""
        return b''.join(
            VBEPostings.vb_encode_number(number) for number in list_of_numbers
        )

    @staticmethod
    def vb_encode_number(number):
        """Return the VB encoding of a single non-negative integer.

        Raises:
            ValueError: If ``number`` is negative.  The base-128 loop below
                would otherwise emit a single wrong byte without any error.
        """
        if number < 0:
            raise ValueError(
                f"cannot VB-encode negative number {number}; "
                "postings lists must be sorted in ascending order"
            )

        digits = []
        while True:
            digits.append(number % 128)
            if number < 128:
                break
            number = number // 128
        digits.reverse()  # digits were collected least significant first
        digits[-1] += 128  # terminator bit on the final byte

        return bytes(digits)

    @staticmethod
    def decode(encoded_postings_list):
        """Decode bytes produced by :meth:`encode` back into a postings list."""
        if not encoded_postings_list:
            return []

        postings_list = []
        running_total = 0
        # Undo the gap encoding with a running sum.
        for gap in VBEPostings.vb_decode(encoded_postings_list):
            running_total += gap
            postings_list.append(running_total)

        return postings_list

    @staticmethod
    def vb_decode(encoded_bytestream):
        """Decode a VB bytestream into the list of numbers it contains.

        Trailing bytes that are not closed by a terminator byte (>= 128) do
        not produce a number.
        """
        numbers = []
        n = 0
        for byte in encoded_bytestream:
            if byte < 128:
                n = 128 * n + byte
            else:
                # High bit set: last digit of the current number.
                n = 128 * n + byte - 128
                numbers.append(n)
                n = 0
        return numbers

In [6]:
import pickle
import os

class InvertedIndex:
    """Base context manager for an on-disk inverted index.

    The index consists of two files inside ``path``:

    * ``<index_name>.index`` - the concatenated encoded postings lists;
    * ``<index_name>.dict``  - pickled metadata ``[postings_dict, cats]``.

    Args:
        index_name: Base name (without extension) of both files.
        encoding_method: Object providing ``encode`` / ``decode`` for
            postings lists.
        path: Directory that holds the index files.
    """

    def __init__(self, index_name, encoding_method, path=''):
        self.encoding_method = encoding_method
        self.path = path

        self.index_file_path = os.path.join(path, index_name + '.index')
        self.metadata_file_path = os.path.join(path, index_name + '.dict')

        self.postings_dict = {}
        self.cats = []  # keeps track of the order in which cats were added to the index

    def __enter__(self):
        """Open the index file and load the metadata from disk."""
        self.index_file = open(self.index_file_path, 'rb+')

        try:
            # NOTE: pickle can execute arbitrary code while loading; only
            # open metadata files that this code produced itself.
            with open(self.metadata_file_path, 'rb') as f:
                self.postings_dict, self.cats = pickle.load(f)
        except BaseException:
            # Do not leak the already-open index file if the metadata is
            # missing or unreadable.
            self.index_file.close()
            raise

        self.cat_iter = iter(self.cats)
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Close the index file and write the metadata back with pickle."""
        self.index_file.close()

        with open(self.metadata_file_path, 'wb') as f:
            pickle.dump([self.postings_dict, self.cats], f)


class InvertedIndexReader(InvertedIndex):
    """Read-side view of an inverted index.

    Iterating yields ``(cat, postings_list)`` pairs in the order the cats
    were written; ``get_postings_list`` gives random access by cat.
    """

    def __iter__(self):
        return self

    def reset(self):
        """Rewind the index file and restart the cat iterator."""
        self.index_file.seek(0)
        self.cat_iter = iter(self.cats)  # reset cat iterator

    def __next__(self):
        """Return the next ``(cat, postings_list)`` pair.

        When the cats are exhausted the reader is reset and StopIteration is
        raised.  Only exhaustion ends the iteration: lookup, I/O and decode
        errors propagate to the caller instead of being reported as a normal
        end of iteration.
        """
        try:
            cat = next(self.cat_iter)
        except StopIteration:
            self.reset()
            raise
        return cat, self.get_postings_list(cat)

    def get_postings_list(self, cat):
        """Read and decode the postings list stored for ``cat``.

        Raises:
            KeyError: If ``cat`` is not in the index metadata.
        """
        # Metadata tuple: (start offset, number of postings, length in bytes).
        start_position, _, length_in_bytes = self.postings_dict[cat]

        self.index_file.seek(start_position)
        encoded_postings = self.index_file.read(length_in_bytes)

        return self.encoding_method.decode(encoded_postings)


class InvertedIndexWriter(InvertedIndex):
    """Write-side view of an inverted index; postings lists are appended."""

    def __enter__(self):
        """Create (or truncate) the index file for binary read/write."""
        self.index_file = open(self.index_file_path, 'wb+')
        return self

    def append(self, cat, postings_list):
        """Encode ``postings_list`` and append it to the end of the index file.

        The metadata entry stored for ``cat`` is the tuple
        ``(start offset, number of postings, encoded length in bytes)``.
        """
        payload = self.encoding_method.encode(postings_list)
        out = self.index_file

        out.seek(0, 2)  # whence=2: position at the end of the file
        offset = out.tell()

        self.postings_dict[cat] = (offset, len(postings_list), len(payload))

        out.write(payload)
        self.cats.append(cat)

In [7]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects ``(title, text)`` for every page.

    The text of ``<title>``, ``<text>`` and ``<timestamp>`` elements is
    captured; each closing ``</page>`` appends the current title and text to
    ``self._pages``.
    """

    def __init__(self):
        super().__init__()
        self._buffer = None       # chunks of the element currently captured
        self._values = {}         # tag name -> most recently captured text
        self._current_tag = None  # tag being captured, or None
        self._pages = []          # collected (title, text) tuples

    def characters(self, content):
        """Collect character data while inside a captured element."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when one of the tracked elements opens."""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store the captured text and emit a page on ``</page>``."""
        if name == self._current_tag:
            # The parser may deliver one element's text in several chunks
            # split at arbitrary points (e.g. around entities), so the
            # chunks are concatenated without any separator.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so text of untracked elements is not buffered.
            self._current_tag = None

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))

## Memindahkan seluruh page ke CSV

Untuk memudahkan proses *indexing*, seluruh halaman ditulis ulang ke dalam file csv dengan kode sebagai berikut.

In [1]:
import bz2

# Path to the bz2-compressed idwiki pages-articles XML dump.
data_path = "idwiki-20240201-pages-articles.xml.bz2"
data_path  # bare expression so the notebook echoes the value

'idwiki-20240201-pages-articles.xml.bz2'

In [6]:
handler = WikiXmlHandler()

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# Stream the compressed dump line by line into the incremental parser; the
# context manager guarantees the file is closed even if parsing fails.
with bz2.BZ2File(data_path, 'r') as dump:
    for line in dump:
        parser.feed(line)

# Tell the parser that the input is complete so it can run its end-of-document
# checks and release its resources.
parser.close()

In [7]:
import csv

with open('pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)

    writer.writerow(["ID", "Title", "Text"])

    # The ID column is the position of the page in handler._pages.
    writer.writerows(
        [page_id, *page] for page_id, page in enumerate(handler._pages)
    )

## Mendapatkan kategori untuk setiap halaman

Untuk memperoleh informasi kategori yang dimiliki setiap halaman, dimanfaatkan `wikilinks` yang memiliki format `[[Kategori:...]]`. Dengan menggunakan *library* `mwparserfromhell`, informasi kategori dapat di-*parsing* dengan kode sebagai berikut.

In [16]:

import csv
import mwparserfromhell

In [17]:
def get_kategoris(row):
    """Extract the category names from one row of ``pages.csv``.

    Args:
        row: A CSV row whose third field (``row[2]``) is the page wikitext.

    Returns:
        A list of lower-cased, whitespace-stripped strings taken from the
        ``[[Kategori:...]]`` wikilinks, in order of appearance.
    """
    wiki = mwparserfromhell.parse(row[2])
    # Raw string: "\[" is not a valid escape sequence in a normal literal.
    wikilinks = wiki.filter_wikilinks(matches=r"\[\[Kategori:")

    kategoris = []
    for link in wikilinks:
        # "[[Kategori:Foo|bar]]" -> "foo|bar"
        inner = link.lower().split('kategori:')[1].split(']')[0]

        # NOTE(review): every "|"-separated part is kept as a category.  In
        # MediaWiki the text after "|" in a category link is normally a sort
        # key - confirm that indexing it is intended.
        for part in inner.split('|'):
            part = part.strip()
            if part:
                kategoris.append(part)

    return kategoris

In [18]:
with open('pages.csv', 'r', encoding='UTF-8') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    # Smoke test: show the categories of the first data row only.
    first_row = next(reader, None)
    if first_row is not None:
        print(get_kategoris(first_row))

['genetika molekular', 'nukleat asam dna', 'asam nukleat']


## Melakukan indexing

Untuk melakukan *indexing*, digunakan kelas BSBIIndex yang diambil dari Tugas Pemrograman mata kuliah Perolehan Informasi (IR) dengan beberapa modifikasi. 

In [2]:
import os
import pickle
from tqdm import tqdm


class BSBIIndex:
    """Builds and queries a category -> pages inverted index.

    Attributes:
        cat_id_map (IdMap): Maps category names to cat ids and back.
        page_id_map (IdMap): Maps the first CSV column of each page (kept as
            a string) to page ids and back.
        data_path (str): CSV file with one page per row; the wikitext is
            expected in the third column.
        output_path (str): Directory for the index and the id-map pickles.
        index_name (str): Base name of the index files.
        postings_encoding: Object providing ``encode`` / ``decode`` for
            postings lists (e.g. ``VBEPostings``).
    """

    def __init__(self, data_path, output_path, postings_encoding, index_name="main_index"):
        self.cat_id_map = IdMap()
        self.page_id_map = IdMap()
        self.data_path = data_path
        self.output_path = output_path
        self.index_name = index_name
        self.postings_encoding = postings_encoding

    def save(self):
        """Write ``categories.csv`` and pickle both id maps to ``output_path``."""
        # newline='' is what the csv module expects; without it every row is
        # followed by an extra blank line on Windows.
        with open('categories.csv', 'w', newline='', encoding='UTF-8') as f:
            writer = csv.writer(f)
            writer.writerows(enumerate(self.cat_id_map.id_to_str))

        with open(os.path.join(self.output_path, 'cats.dict'), 'wb') as f:
            pickle.dump(self.cat_id_map, f)
        with open(os.path.join(self.output_path, 'pages.dict'), 'wb') as f:
            pickle.dump(self.page_id_map, f)

    def load(self):
        """Load page_id_map and cat_id_map from the output directory."""
        # NOTE: pickle can execute arbitrary code while loading; only load
        # files produced by save().
        with open(os.path.join(self.output_path, 'cats.dict'), 'rb') as f:
            self.cat_id_map = pickle.load(f)
        with open(os.path.join(self.output_path, 'pages.dict'), 'rb') as f:
            self.page_id_map = pickle.load(f)

    def start_indexing(self):
        """Parse all pages, write the inverted index and save the id maps."""
        cd_pairs = self.parsing_pages()

        # The writer opens files inside output_path, so make sure it exists.
        if self.output_path:
            os.makedirs(self.output_path, exist_ok=True)

        with InvertedIndexWriter(self.index_name, self.postings_encoding, path=self.output_path) as main_index:
            self.write_to_index(cd_pairs, main_index)

        self.save()

    def parsing_pages(self):
        """Read ``data_path`` and return a list of ``(cat_id, page_id)`` pairs."""
        res = []
        with open(self.data_path, 'r', newline='', encoding='UTF-8') as file:
            reader = csv.reader(file)

            next(reader)  # skip the header row

            for row in tqdm(reader, desc="Parsing pages"):
                page_id = self.page_id_map[row[0]]
                for kategori in get_kategoris(row):
                    res.append((self.cat_id_map[kategori], page_id))

        return res

    def write_to_index(self, cd_pairs, index):
        """Group the pairs by cat id and append one postings list per cat.

        Cats are written in ascending cat-id order; each postings list holds
        the distinct page ids in ascending order.
        """
        cat_dict = {}
        for cat_id, page_id in tqdm(cd_pairs, desc="Writing to index"):
            cat_dict.setdefault(cat_id, set()).add(page_id)
        for cat_id in sorted(cat_dict):
            index.append(cat_id, sorted(cat_dict[cat_id]))

    def boolean_retrieve(self, kategoris):
        """Return the pages that belong to at least one of ``kategoris``.

        Category names are matched case-insensitively (they are lower-cased
        before lookup); unknown categories are ignored.

        Returns:
            The page keys stored in ``page_id_map`` for the union of the
            matching postings lists, in ascending page-id order.
        """
        self.load()
        res = []
        # Look up in the dict directly: O(1) and never assigns a new id.
        known_cats = self.cat_id_map.str_to_id

        with InvertedIndexReader(self.index_name, self.postings_encoding, path=self.output_path) as indices:
            for kategori in kategoris:
                cat_id = known_cats.get(kategori.lower())
                if cat_id is None:
                    continue

                res = sort_union_list(res, indices.get_postings_list(cat_id))

        return [self.page_id_map[page_id] for page_id in res]

In [12]:
import csv
import sys

# Raise the csv field size limit as far as the platform allows: start from
# sys.maxsize and shrink by a factor of ten until the value is accepted.
maxInt = sys.maxsize

while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        continue
    break

In [13]:
# Build the category index from pages.csv into the 'index' directory.
BSBI_instance = BSBIIndex(
    data_path='pages.csv',
    postings_encoding=VBEPostings,
    output_path='index',
)

BSBI_instance.start_indexing()

Parsing pages: 1660717it [2:02:56, 225.13it/s] 
Writing to index: 100%|██████████| 1677886/1677886 [00:03<00:00, 430748.19it/s]


In [19]:
with open('pages.csv', 'r', encoding='UTF-8') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    counter = 0

    # Print the categories of the first 11 data rows.
    for counter, row in enumerate(reader, start=1):
        print(get_kategoris(row))

        if counter > 10:
            break

['genetika molekular', 'nukleat asam dna', 'asam nukleat']
[]
['presiden mesir', 'perdana menteri mesir', 'pemenang hadiah nobel perdamaian', 'kepala negara yang dibunuh', 'person of the year', 'pemimpin perang dingin', 'tokoh militer mesir', 'kolonel']
['tokoh yang tidak memiliki informasi tahun kelahiran']
['arkeologi']
['antropologi']
['ilmu komputer']
['bahasa indonesia', 'bahasa di indonesia', 'bahasa di asia', 'bahasa di timor leste', 'bahasa di asia tenggara', 'bahasa berpola subjek–predikat–objek', 'rumpun bahasa austronesia', 'rumpun bahasa melayik']
['biologi']
['bali', 'provinsi di indonesia', 'pulau di indonesia', 'kepulauan sunda kecil', 'kepulauan sunda', 'pendirian tahun 1958 di indonesia', 'negara dan wilayah yang didirikan tahun 1958']
['rumpun bahasa austronesia', 'bahasa di indonesia', 'bahasa yang mempunyai aksara tersendiri']


## Me-*retrieve* halaman berdasarkan kategori

Setelah dilakukan *indexing*, kita dapat mencari halaman-halaman dengan kategori spesifik dengan memasukkannya pada *list* `kategoris` pada kode sebagai berikut.

In [10]:
BSBI_instance = BSBIIndex(
    data_path='pages.csv',
    postings_encoding=VBEPostings,
    output_path='index',
)

kategoris = ['orde baru']

# Page keys of every page that belongs to one of the requested categories.
results = list(BSBI_instance.boolean_retrieve(kategoris))

Selanjutnya, setiap halaman yang diperoleh akan di-*parsing* untuk mendapatkan *infobox*-nya jika ada.

In [21]:
def process_article(title, text):
    """Extract the parameters of the first infobox template of an article.

    Args:
        title: Article title; returned unchanged.
        text: Article wikitext.

    Returns:
        ``(title, properties)`` where ``properties`` maps each parameter name
        of the first template matching "Infobox|Kotak info" to its value
        (markup stripped, parameters with empty values skipped), or ``None``
        when no such template is found.
    """
    wikicode = mwparserfromhell.parse(text)
    infoboxes = wikicode.filter_templates(matches="Infobox|Kotak info")

    if not infoboxes:
        return (title, None)

    properties = {}
    for param in infoboxes[0].params:
        value = param.value.strip_code().strip()
        if value:
            properties[param.name.strip_code().strip()] = value

    return (title, properties)

In [41]:
infoboxless_orde_barus = []
infobox_orde_barus = []

# A set gives O(1) membership tests instead of scanning the list per row.
wanted_ids = set(results)
# default=-1 keeps an empty result (unknown category) from raising
# ValueError; the loop below then stops at the first row.
max_id = max(map(int, wanted_ids), default=-1)

with open('pages.csv', 'r', encoding='UTF-8', newline='') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    for row in reader:
        # Stop once the ID column has passed the largest wanted id; this
        # relies on the rows being stored in ascending ID order.
        if int(row[0]) > max_id:
            break

        if row[0] in wanted_ids:
            data = process_article(row[1], row[2])
            if data[1] is None:
                infoboxless_orde_barus.append(data)
            else:
                infobox_orde_barus.append(data)

Hasil *parsing* kemudian dapat ditulis ke file csv dengan memisahkan halaman yang memiliki *infobox* dan yang tidak.

In [42]:
with open('infoboxless_orde_baru_pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)

    writer.writerow(["Title"])

    # One row per page without an infobox: just its title.
    writer.writerows([page[0]] for page in infoboxless_orde_barus)

In [40]:
with open('infobox_orde_baru_pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)

    writer.writerow(["Title", "Properties"])

    # One row per page with an infobox: title and the properties dict.
    writer.writerows([page[0], page[1]] for page in infobox_orde_barus)

### Melakukan hal yang sama untuk kategori `Tokoh Orde Baru`

In [39]:
kategoris = ['tokoh orde baru']
results = list(BSBI_instance.boolean_retrieve(kategoris))

infoboxless_tokoh_orde_barus = []
infobox_tokoh_orde_barus = []

# A set gives O(1) membership tests instead of scanning the list per row.
wanted_ids = set(results)
# default=-1 keeps an empty result (unknown category) from raising
# ValueError; the loop below then stops at the first row.
max_id = max(map(int, wanted_ids), default=-1)

with open('pages.csv', 'r', encoding='UTF-8', newline='') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    for row in reader:
        # Stop once the ID column has passed the largest wanted id; this
        # relies on the rows being stored in ascending ID order.
        if int(row[0]) > max_id:
            break

        if row[0] in wanted_ids:
            data = process_article(row[1], row[2])
            if data[1] is None:
                infoboxless_tokoh_orde_barus.append(data)
            else:
                infobox_tokoh_orde_barus.append(data)

# Pages without an infobox: title only.
with open('infoboxless_tokoh_orde_baru_pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)

    writer.writerow(["Title"])

    for data in infoboxless_tokoh_orde_barus:
        writer.writerow([data[0]])

# Pages with an infobox: title and the properties dict.
with open('infobox_tokoh_orde_baru_pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)

    writer.writerow(["Title", "Properties"])

    for data in infobox_tokoh_orde_barus:
        writer.writerow([data[0], data[1]])

### Me-*retrieve* halaman berdasarkan kategori `Indonesia dalam tahun ...`

In [43]:
# One category per year from 1966 up to and including 1998.
kategoris = [f'indonesia dalam tahun {tahun}' for tahun in range(1966,1999)]
results = list(BSBI_instance.boolean_retrieve(kategoris))

data_orde_baru = []

# A set gives O(1) membership tests instead of scanning the list per row.
wanted_ids = set(results)
# default=-1 keeps an empty result (no known category) from raising
# ValueError; the loop below then stops at the first row.
max_id = max(map(int, wanted_ids), default=-1)

with open('pages.csv', 'r', encoding='UTF-8', newline='') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    for row in reader:
        # Stop once the ID column has passed the largest wanted id; this
        # relies on the rows being stored in ascending ID order.
        if int(row[0]) > max_id:
            break

        if row[0] in wanted_ids:
            data = process_article(row[1], row[2])
            data_orde_baru.append(data)

# Title and infobox properties (None when the page has no infobox).
with open('data_orde_baru_pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)

    writer.writerow(["Title", "Properties"])

    for data in data_orde_baru:
        writer.writerow([data[0], data[1]])