# Class dan function

In [1]:
class IdMap:
    """Bidirectional mapping between strings and dense integer ids.

    Ids are handed out in insertion order starting at 0. Indexing with a
    string returns its id (registering the string first when it is new);
    indexing with an integer returns the string stored under that id.
    """

    def __init__(self):
        self.str_to_id = {}  # string -> id
        self.id_to_str = []  # id -> string (the list index is the id)

    def __len__(self):
        """Return the number of strings registered so far."""
        return len(self.id_to_str)

    def __contains__(self, key):
        """Return True when ``key`` is an already registered string.

        Without this method ``key in id_map`` falls back to calling
        ``__getitem__`` with 0, 1, 2, ... which is a linear scan over every
        stored string. The dict lookup gives the same answer in O(1) and,
        unlike ``id_map[key]``, never registers ``key``.
        """
        try:
            return key in self.str_to_id
        except TypeError:
            # Unhashable keys can never have been registered.
            return False

    def __get_id(self, s):
        """Return the id of ``s``, assigning the next free id if it is new."""
        if s not in self.str_to_id:
            self.str_to_id[s] = len(self.id_to_str)
            self.id_to_str.append(s)

        return self.str_to_id[s]

    def __get_str(self, i):
        """Return the string stored under id ``i`` (IndexError if unknown)."""
        return self.id_to_str[i]

    def __getitem__(self, key):
        """Translate in either direction depending on the type of ``key``.

        * ``int`` -> the string registered under that id
        * ``str`` -> the id of that string (the string is added if missing)
        * anything else -> ``None``
        """
        if isinstance(key, int):
            return self.__get_str(key)
        if isinstance(key, str):
            return self.__get_id(key)

        return None


def sort_union_list(list_A, list_B):
    """Merge two ascending lists into a single ascending list.

    When the elements currently at the front of both inputs are equal, the
    value is emitted once and both inputs advance. Whatever is left in one
    input after the other runs out is copied over unchanged.
    """
    merged = []
    i, j = 0, 0
    len_a, len_b = len(list_A), len(list_B)

    while i < len_a and j < len_b:
        left, right = list_A[i], list_B[j]
        if left == right:
            # Shared value: keep one copy, step past it in both inputs.
            merged.append(left)
            i += 1
            j += 1
        elif left < right:
            merged.append(left)
            i += 1
        else:
            merged.append(right)
            j += 1

    # At most one of these tails is non-empty.
    merged.extend(list_A[i:])
    merged.extend(list_B[j:])
    return merged

In [2]:
import array

class VBEPostings:
    """Variable-byte (VB) compression for ascending postings lists.

    A postings list is first turned into gaps (the first value followed by
    the difference between consecutive values) and every gap is then
    written as a sequence of 7-bit groups, most significant group first.
    The last byte of each number has its high bit set as a terminator.
    """

    @staticmethod
    def encode(postings_list):
        """Encode an ascending list of non-negative ints into bytes.

        Raises:
            ValueError: if the list starts with a negative value or is not
                in ascending order (either produces a negative gap).
        """
        if not postings_list:
            return b''

        gaps = []
        previous = 0
        for posting in postings_list:
            # The first gap is the first posting itself (previous == 0).
            gaps.append(posting - previous)
            previous = posting

        return VBEPostings.vb_encode(gaps)

    @staticmethod
    def vb_encode(list_of_numbers):
        """Concatenate the VB encoding of every number in the iterable."""
        bytestream = bytearray()
        for number in list_of_numbers:
            bytestream += VBEPostings.vb_encode_number(number)

        return bytes(bytestream)

    @staticmethod
    def vb_encode_number(number):
        """Return the VB encoding of a single non-negative integer.

        Raises:
            ValueError: if ``number`` is negative. Such a value used to be
                written as a single wrong byte (-1 became 0xFF, which
                decodes to 127), silently corrupting the stream.
        """
        if number < 0:
            raise ValueError(
                f"variable-byte encoding requires non-negative integers, got {number!r}"
            )

        groups = []
        while True:
            # Collect 7-bit groups least significant first, reverse once at
            # the end instead of inserting at the front on every step.
            groups.append(number % 128)
            if number < 128:
                break
            number //= 128
        groups.reverse()
        groups[-1] += 128  # the high bit marks the final byte of the number

        return bytes(groups)

    @staticmethod
    def decode(encoded_postings_list):
        """Decode bytes produced by :meth:`encode` back into a postings list.

        An empty stream, or one that holds no complete number, yields ``[]``.
        """
        if not encoded_postings_list:
            return []

        postings_list = []
        running_total = 0
        for gap in VBEPostings.vb_decode(encoded_postings_list):
            # Undo the gap transformation with a running sum.
            running_total += gap
            postings_list.append(running_total)

        return postings_list

    @staticmethod
    def vb_decode(encoded_bytestream):
        """Decode a variable-byte encoded bytestream into a list of numbers.

        Bytes below 128 are continuation groups; a byte of 128 or more ends
        the current number. Trailing bytes without a terminator are ignored.
        """
        numbers = []
        n = 0
        for byte in encoded_bytestream:
            if byte < 128:
                n = 128 * n + byte
            else:
                n = 128 * n + byte - 128
                numbers.append(n)
                n = 0
        return numbers

In [3]:
import pickle
import os

class InvertedIndex:
    """Base class for an on-disk inverted index used as a context manager.

    Two files live under ``path``: ``<index_name>.index`` holds the encoded
    postings lists, ``<index_name>.dict`` holds the pickled metadata
    (``postings_dict`` and ``cats``).

    Args:
        index_name: file name stem shared by the index and metadata files.
        encoding_method: object exposing ``encode``/``decode`` for postings
            lists; it is only stored here and used by the subclasses.
        path: directory containing both files.
    """

    def __init__(self, index_name, encoding_method, path=''):
        self.encoding_method = encoding_method
        self.path = path

        self.index_file_path = os.path.join(path, index_name + '.index')
        self.metadata_file_path = os.path.join(path, index_name + '.dict')

        self.postings_dict = {}
        self.cats = []  # Keeps track of the order in which cats were added to the index

    def __enter__(self):
        """Open the index file and load the metadata saved alongside it."""
        self.index_file = open(self.index_file_path, 'rb+')

        try:
            # NOTE: pickle.load executes arbitrary code for crafted input;
            # only open metadata files this program wrote itself.
            with open(self.metadata_file_path, 'rb') as f:
                self.postings_dict, self.cats = pickle.load(f)
        except BaseException:
            # __exit__ is never called when __enter__ raises, so the index
            # file has to be closed here or the handle would leak.
            self.index_file.close()
            raise

        self.cat_iter = iter(self.cats)
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Close the index file and write the metadata back to disk."""
        self.index_file.close()

        # Persist the metadata (postings dict and cats) with pickle
        with open(self.metadata_file_path, 'wb') as f:
            pickle.dump([self.postings_dict, self.cats], f)


class InvertedIndexReader(InvertedIndex):
    """Read access to an inverted index: iteration and random lookup."""

    def __iter__(self):
        """The reader is its own iterator over ``(cat, postings_list)``."""
        return self

    def reset(self):
        """Rewind the index file and restart the cat iterator."""
        self.index_file.seek(0)
        self.cat_iter = iter(self.cats)

    def __next__(self):
        """Return the next ``(cat, postings_list)`` pair in insertion order.

        When the cats are exhausted the reader is reset, so it can be
        iterated again, and ``StopIteration`` is raised.
        """
        try:
            cat = next(self.cat_iter)
        except StopIteration:
            self.reset()
            raise

        # Deliberately outside the try block: a missing dictionary entry,
        # an I/O error or a decoding failure is a real problem and must
        # propagate instead of masquerading as the end of the iteration.
        return cat, self.get_postings_list(cat)

    def get_postings_list(self, cat):
        """Read and decode the postings list stored for ``cat``.

        Raises:
            KeyError: if ``cat`` has no entry in ``postings_dict``.
        """
        start_position, _, length_in_bytes = self.postings_dict[cat]

        self.index_file.seek(start_position)
        encoded_postings = self.index_file.read(length_in_bytes)

        return self.encoding_method.decode(encoded_postings)


class InvertedIndexWriter(InvertedIndex):
    """Write access to an inverted index; the index file starts out empty."""

    def __enter__(self):
        """Create (or truncate) the index file; no metadata is loaded."""
        self.index_file = open(self.index_file_path, 'wb+')
        return self

    def append(self, cat, postings_list):
        """Encode ``postings_list`` and store it at the end of the index file.

        The entry recorded in ``postings_dict`` for ``cat`` is
        ``(start offset, number of postings, encoded length in bytes)``.
        """
        payload = self.encoding_method.encode(postings_list)

        index_file = self.index_file
        index_file.seek(0, os.SEEK_END)
        offset = index_file.tell()

        self.postings_dict[cat] = (offset, len(postings_list), len(payload))

        index_file.write(payload)
        self.cats.append(cat)

# Memindahkan seluruh page ke CSV

In [4]:
import bz2

# bz2-compressed XML dump that is streamed through the SAX parser further down.
data_path = "idwiki-20240201-pages-articles.xml.bz2"
# Bare expression: echoes the value as the cell output.
data_path

'idwiki-20240201-pages-articles.xml.bz2'

In [5]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects ``(title, text)`` for every page.

    The text of ``<title>``, ``<text>`` and ``<timestamp>`` elements is
    captured; each closing ``</page>`` appends one ``(title, text)`` tuple
    to ``self._pages``.
    """

    def __init__(self):
        super().__init__()
        self._buffer = None        # chunks of the element being captured
        self._values = {}          # tag name -> text, for the current page
        self._current_tag = None   # tag being captured, None outside one
        self._pages = []           # collected (title, text) tuples

    def characters(self, content):
        """Collect character data while inside a captured element."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when one of the tracked elements opens."""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store the captured text; emit the page on ``</page>``."""
        if name == self._current_tag:
            # A parser may deliver one run of text in several chunks (for
            # example around entities and line breaks), so the chunks must
            # be concatenated as-is; joining with a space would inject
            # characters that are not in the document.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing, otherwise text of unrelated elements would
            # keep piling up in the buffer.
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            self._pages.append(
                (self._values.get('title', ''), self._values.get('text', ''))
            )
            # Start clean so a page lacking an element cannot inherit the
            # value captured for the previous page.
            self._values = {}

In [6]:
# Stream the compressed dump through a SAX parser; the handler accumulates
# one (title, text) tuple per page in handler._pages.
handler = WikiXmlHandler()

parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# The context manager closes the archive even when parsing fails part-way.
with bz2.BZ2File(data_path, 'r') as dump:
    for line in dump:
        parser.feed(line)

# Tell the parser the document is complete so it can run its end-of-document
# checks and release its resources.
parser.close()

In [7]:
import csv

# Dump every parsed page to pages.csv; the ID column is the page's position
# in handler._pages.
with open('pages.csv', 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "Title", "Text"])
    writer.writerows([i, *row] for i, row in enumerate(handler._pages))

# Mendapatkan kategori untuk setiap halaman

In [10]:
import mwparserfromhell

In [8]:
def get_kategoris(row):
    """Return the lower-cased category names found in a page's wikitext.

    Args:
        row: a row of ``pages.csv``; ``row[2]`` is parsed as wikitext.

    Returns:
        A list of stripped, lower-cased strings taken from every
        ``[[Kategori:...]]`` link, in document order.
    """
    wiki = mwparserfromhell.parse(row[2])
    # Raw string: "\[" is not a valid escape in a normal string literal and
    # triggers a warning on recent Python versions. The pattern itself is
    # unchanged.
    wikilinks = wiki.filter_wikilinks(matches=r"\[\[Kategori:")

    kategoris = []
    for link in wikilinks:
        # Keep what sits between "kategori:" and the first closing bracket.
        target = link.lower().split('kategori:')[1].split(']')[0]
        # NOTE(review): every "|"-separated part is returned as a category.
        # In MediaWiki the text after "|" in a category link is normally a
        # sort key rather than a category name - confirm this is intended.
        for part in target.split('|'):
            part = part.strip()
            if part != '':
                kategoris.append(part)

    return kategoris

In [11]:
# Smoke test: show the categories extracted from the first data row.
with open('pages.csv', 'r', encoding='UTF-8') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    row = next(reader, None)
    if row is not None:
        print(get_kategoris(row))

['genetika molekular', 'nukleat asam dna', 'asam nukleat']


# Melakukan indexing

In [6]:
import os
import pickle
import time
from tqdm import tqdm


class BSBIIndex:
    """Builds and queries an inverted index from category to page ids.

    Args:
        data_path: CSV file with one page per row (id, title, wikitext),
            preceded by a header row.
        output_path: directory receiving the index and the id maps.
        postings_encoding: object with ``encode``/``decode`` used to
            compress postings lists.
        index_name: file name stem of the main index.
    """

    def __init__(self, data_path, output_path, postings_encoding, index_name="main_index"):
        self.cat_id_map = IdMap()
        self.page_id_map = IdMap()
        self.data_path = data_path
        self.output_path = output_path
        self.index_name = index_name
        self.postings_encoding = postings_encoding

        # File names of all intermediate inverted indices
        self.incatediate_indices = []

        # Running-time records
        self.log = []

    def _ensure_output_dir(self):
        """Create the output directory if it does not exist yet."""
        if self.output_path:  # '' means the current directory
            os.makedirs(self.output_path, exist_ok=True)

    def save(self):
        """Write the category list as CSV and pickle both id maps."""
        # newline='' is what the csv module expects for files it writes;
        # without it some platforms emit an empty line after every row.
        with open('categories.csv', 'w', newline='', encoding='UTF-8') as f:
            writer = csv.writer(f)
            writer.writerows(enumerate(self.cat_id_map.id_to_str))

        self._ensure_output_dir()
        with open(os.path.join(self.output_path, 'cats.dict'), 'wb') as f:
            pickle.dump(self.cat_id_map, f)
        with open(os.path.join(self.output_path, 'pages.dict'), 'wb') as f:
            pickle.dump(self.page_id_map, f)

    def load(self):
        """Load page_id_map and cat_id_map from the output directory."""
        # NOTE: pickle.load must only be used on files this program wrote.
        with open(os.path.join(self.output_path, 'cats.dict'), 'rb') as f:
            self.cat_id_map = pickle.load(f)
        with open(os.path.join(self.output_path, 'pages.dict'), 'rb') as f:
            self.page_id_map = pickle.load(f)

    def start_indexing(self):
        """Parse every page, write the main index and save the id maps.

        The elapsed time in minutes is appended to ``self.log``.
        """
        start_time = time.time()

        cd_pairs = self.parsing_pages()

        # The writer opens its file inside output_path, so it must exist.
        self._ensure_output_dir()
        with InvertedIndexWriter(self.index_name, self.postings_encoding, path=self.output_path) as main_index:
            self.write_to_index(cd_pairs, main_index)

        end_time = time.time()
        self.log.append(f'Total waktu keseluruhan encoding dengan {self.postings_encoding} adalah {round((end_time - start_time) / 60, 8)}')
        self.save()

    def parsing_pages(self):
        """Return ``(cat_id, page_id)`` pairs for every category of every page.

        Both id maps are filled as a side effect.
        """
        res = []
        with open(self.data_path, 'r', newline='', encoding='UTF-8') as file:
            reader = csv.reader(file)

            next(reader)  # skip the header row

            for row in tqdm(reader, desc="Parsing pages"):
                page_id = self.page_id_map[row[0]]
                for kategori in get_kategoris(row):
                    res.append((self.cat_id_map[kategori], page_id))

        return res

    def write_to_index(self, cd_pairs, index):
        """Group the pairs by category and append each postings list.

        Categories are written in ascending id order; every postings list
        is de-duplicated and sorted ascending.
        """
        cat_dict = {}
        for cat_id, page_id in tqdm(cd_pairs, desc="Writing to index"):
            cat_dict.setdefault(cat_id, set()).add(page_id)
        for cat_id in sorted(cat_dict):
            index.append(cat_id, sorted(cat_dict[cat_id]))

    def boolean_retrieve(self, kategoris):
        """Return the pages that belong to at least one given category.

        Category names are lower-cased before lookup and unknown names are
        skipped. The result holds the original page id strings, ordered by
        internal page id.
        """
        self.load()
        res = []

        with InvertedIndexReader(self.index_name, self.postings_encoding, path=self.output_path) as indices:
            for kategori in kategoris:
                kategori = kategori.lower()
                # Look the name up in the dict directly: `in` on the IdMap
                # object itself may degrade to a scan over every category,
                # and indexing the IdMap would register the unknown name.
                if kategori not in self.cat_id_map.str_to_id:
                    continue

                cat_id = self.cat_id_map[kategori]
                postings = indices.get_postings_list(cat_id)
                res = sort_union_list(res, postings) if res else postings

        return [self.page_id_map[page_id] for page_id in res]

In [5]:
import csv
import sys

# Raise the csv field size limit as far as the platform accepts: start from
# sys.maxsize and shrink tenfold each time the value is rejected.
maxInt = sys.maxsize

while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
    else:
        break

In [13]:
# Build the index for pages.csv into the "index" directory.
BSBI_instance = BSBIIndex(
    data_path='pages.csv',
    output_path='index',
    postings_encoding=VBEPostings,
)

BSBI_instance.start_indexing()

Parsing pages: 1660717it [2:02:56, 225.13it/s] 
Writing to index: 100%|██████████| 1677886/1677886 [00:03<00:00, 430748.19it/s]


In [12]:
# Print the extracted categories of the first 11 data rows.
with open('pages.csv', 'r', encoding='UTF-8') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    counter = 0
    for counter, row in enumerate(reader, start=1):
        print(get_kategoris(row))

        if counter > 10:
            break

['genetika molekular', 'nukleat asam dna', 'asam nukleat']
[]
['presiden mesir', 'perdana menteri mesir', 'pemenang hadiah nobel perdamaian', 'kepala negara yang dibunuh', 'person of the year', 'pemimpin perang dingin', 'tokoh militer mesir', 'kolonel']
['tokoh yang tidak memiliki informasi tahun kelahiran']
['arkeologi']
['antropologi']
['ilmu komputer']
['bahasa indonesia', 'bahasa di indonesia', 'bahasa di asia', 'bahasa di timor leste', 'bahasa di asia tenggara', 'bahasa berpola subjek–predikat–objek', 'rumpun bahasa austronesia', 'rumpun bahasa melayik']
['biologi']
['bali', 'provinsi di indonesia', 'pulau di indonesia', 'kepulauan sunda kecil', 'kepulauan sunda', 'pendirian tahun 1958 di indonesia', 'negara dan wilayah yang didirikan tahun 1958']
['rumpun bahasa austronesia', 'bahasa di indonesia', 'bahasa yang mempunyai aksara tersendiri']


In [13]:
# Query the stored index for pages in either category and print their ids.
BSBI_instance = BSBIIndex(
    data_path='pages.csv',
    output_path='index',
    postings_encoding=VBEPostings,
)

kategoris = ['orde baru', 'tokoh orde baru']
results = list(BSBI_instance.boolean_retrieve(kategoris))

for page in results:
    print(page)

136
487
2303
2319
2337
2367
2411
2499
2856
2869
4793
5101
5234
6062
8013
9906
9971
9983
10519
10980
12293
12307
14094
17268
17895
18488
18609
20743
21808
21812
22235
23198
23199
23200
23208
24298
24772
25227
27016
28882
29230
29232
29362
29664
29678
29721
30414
31560
32453
33986
34922
39618
40048
40640
41228
42557
42603
42607
42764
44858
46288
46315
46319
46353
46355
46362
65872
72371
73256
75013
76915
79154
82770
97254
98147
99362
101584
101684
107409
107412
114450
118499
126354
128443
129180
129182
145073
145075
161622
163637
164257
165082
165353
166579
169982
174787
174967
175013
175466
175470
175531
175740
184902
194669
197658
197687
205008
261439
311888
331610
366608
366736
366744
373670
400861
407212
429238
510724
510732
513240
514806
514848
515064
516271
518489
525100
667138
704993
708818
715759
717646
721518
731068
732710
734454
746148
746169
746985
747134
747193
747324
748213
750090
751481
756379
758715
758729
760812
811030
813944
970180
1014514
1015944
1016691
1016907
1021647

In [17]:
# Print the title of every retrieved page by scanning pages.csv once.
with open('pages.csv', 'r', encoding='UTF-8', newline='') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    # Build the lookup set once: testing against the list would rescan all
    # results for every CSV row.
    result_ids = set(results)
    # default=-1 keeps an empty result set from raising; the scan then
    # stops at the first row.
    max_id = max(map(int, result_ids), default=-1)

    for row in reader:
        # Stop as soon as the row id exceeds the largest retrieved id.
        if int(row[0]) > max_id:
            break

        if row[0] in result_ids:
            print(row[1])

Hamengkubuwana IX
Soeharto
Bob Hasan
Siti Hardijanti Rukmana
Siti Hartinah
Komando Operasi Pemulihan Keamanan dan Ketertiban
Partai Golongan Karya
Try Sutrisno
Abdul Haris Nasution
Basuki Rahmat
R. Hartono
Akbar Tanjung
Soedharmono
Wiranto
Gerakan 30 September
Tragedi Trisakti
Orde Baru
Kerusuhan Mei 1998
Harmoko
Penumpasan Pengkhianatan G 30 S PKI
Kelompencapir
Tommy Soeharto
Radius Prawiro
Ali Moertopo
Probosutedjo
Arief Budiman
Sudono Salim
Ibnu Sutowo
Peristiwa 27 Juli
Malari
Pembantaian Santa Cruz
Ismail Saleh
Ali Said
Soegih Arto
Andi Muhammad Ghalib
Soesilo Soedarman
Rencana Pembangunan Lima Tahun
Mochtar Lubis
Transmigrasi
Dorodjatun Kuntjoro-Jakti
Maraden Panggabean
M. Jusuf
Leonardus Benyamin Moerdani
Feisal Tanjung
Syarwan Hamid
Arifin Siregar
Abdul Latief (pengusaha)
Amir Machmud
J. B. Sumarlin
Mafia Berkeley
Penentuan Pendapat Rakyat
Achmad Tahir
Kasus dugaan korupsi Soeharto
De-Soekarnoisasi
Titiek Soeharto
Cosmas Batubara
Alamsyah Ratu Perwiranegara
Bustanil Arifin
Trilo

In [18]:
len(results)

184

In [19]:
# Count the data rows (header excluded) of infobox_pages.csv.
total = 0

with open('infobox_pages.csv', 'r', encoding='UTF-8', newline='') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    total = sum(1 for _ in reader)

total

132

In [20]:
# Count the data rows (header excluded) of infoboxless_pages.csv.
total = 0

with open('infoboxless_pages.csv', 'r', encoding='UTF-8', newline='') as file:
    reader = csv.reader(file)

    next(reader)  # skip the header row

    total = sum(1 for _ in reader)

total

49