# Indeks dan Pencarian 

In [11]:
import os
import re
import pickle

class TreeNode:
    """Single node of the binary tree that backs the inverted index.

    Attributes set by the constructor:
        term:  the value stored at this node.
        files: an initially empty set; callers add file paths to it.
        left / right: child links, both ``None`` until a child is attached.
    """

    def __init__(self, term):
        self.term = term
        # A fresh set per node so postings are never shared between nodes.
        self.files = set()
        self.left = self.right = None

def insert(root, term, file_path):
    """Record that ``term`` occurs in ``file_path`` in the tree rooted at ``root``.

    The tree is ordered by ``term``: smaller terms go to the left child,
    larger terms to the right. If a node for ``term`` already exists, the
    path is added to its ``files`` set; otherwise a new leaf is attached.

    The walk is iterative rather than recursive so that a badly skewed tree
    (for example, terms arriving in sorted order) cannot exhaust Python's
    recursion limit.

    Args:
        root: Root node of the tree, or ``None`` for an empty tree.
        term: The term to insert.
        file_path: Path of the file in which the term was found.

    Returns:
        The root of the tree: ``root`` itself, or the newly created node
        when ``root`` was ``None``.
    """
    if root is None:
        node = TreeNode(term)
        node.files.add(file_path)
        return node

    current = root
    while True:
        if term < current.term:
            if current.left is None:
                leaf = TreeNode(term)
                leaf.files.add(file_path)
                current.left = leaf
                break
            current = current.left
        elif term > current.term:
            if current.right is None:
                leaf = TreeNode(term)
                leaf.files.add(file_path)
                current.right = leaf
                break
            current = current.right
        else:
            # Term already indexed: just extend its posting set.
            current.files.add(file_path)
            break

    return root

def build_inverted_index(root, folder_path):
    """Add every ``.txt`` file found directly in ``folder_path`` to the index.

    Each matching file is read as UTF-8 and lower-cased, then split into
    word tokens. Every distinct token is inserted into the tree together
    with the file's path.

    Args:
        root: Root of the existing tree, or ``None`` to start a new one.
        folder_path: Directory whose ``.txt`` entries are indexed
            (sub-directories are not descended into).

    Returns:
        The root of the tree after all insertions; ``root`` unchanged when
        the folder contributes no terms.
    """
    # Compile once instead of looking the pattern up for every file.
    word_pattern = re.compile(r'\b\w+\b')

    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read().lower()

        # Re-inserting the same (term, file) pair is a no-op on the posting
        # set, so walk the tree only once per distinct term. dict.fromkeys
        # keeps first-seen order, which leaves the tree shape identical to
        # inserting every occurrence.
        for term in dict.fromkeys(word_pattern.findall(content)):
            root = insert(root, term, file_path)

    return root

def save_inverted_index(root, output_file):
    """Serialize the index tree ``root`` to ``output_file`` using pickle.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(output_file, 'wb') as sink:
        pickle.dump(root, sink)

def build_and_save_combined_index(txts_folder, output_file):
    """Build one index over every ``<txts_folder>/<entry>/all`` directory and save it.

    Each entry of ``txts_folder`` is treated as a category; when its ``all``
    sub-directory exists, that directory is indexed into a single shared
    tree. The finished tree is pickled to ``output_file``.

    Returns:
        The root of the combined tree (``None`` if nothing was indexed).
    """
    candidate_folders = (
        os.path.join(txts_folder, entry, 'all')
        for entry in os.listdir(txts_folder)
    )

    index_root = None
    # Entries without an 'all' sub-directory are skipped.
    for folder in filter(os.path.isdir, candidate_folders):
        index_root = build_inverted_index(index_root, folder)

    save_inverted_index(index_root, output_file)
    return index_root

def search_term(root, term):
    """Look up ``term`` in the tree rooted at ``root``.

    Returns:
        The ``files`` set stored on the matching node (the node's own set,
        not a copy), or a new empty set when the term is not present.
    """
    current = root
    while current:
        if term < current.term:
            current = current.left
            continue
        if term > current.term:
            current = current.right
            continue
        # Neither smaller nor larger: this is the node for the term.
        return current.files
    return set()

# Folder holding the text files and the pickle file the index is written to:
txts_folder = 'txts'
output_file = 'inverted_index.pkl'

# Build and save the inverted index for all categories
root = build_and_save_combined_index(txts_folder, output_file)



In [12]:
# Loaded pickle data
import pickle

def display_results(root, term):
    """Print the documents that contain ``term`` and the category with most hits.

    For every matching file the file name and the first line of the file
    (used as its title) are printed. The category of a file is the name of
    the directory two levels above it, matching the
    ``<category>/all/<file>`` layout used when the index is built.

    Args:
        root: Root node of the index tree.
        term: Keyword to look up. It is lower-cased for the lookup because
            the index is built from lower-cased text; messages show the
            keyword as given.

    Returns:
        ``0`` when the term is not in the index, otherwise ``None``.
    """
    print(f'keyword: "{term}"')
    file_paths = search_term(root, term.lower())

    if not file_paths:
        print(f'Term "{term}" not found.')
        return 0

    print(f'Keyword {term} terdapat pada dokumen:')
    category_counts = {}
    # Sets iterate in arbitrary order; sorting keeps the listing and the
    # tie-breaking between equally common categories reproducible.
    for file_path in sorted(file_paths):
        category = os.path.basename(os.path.dirname(os.path.dirname(file_path)))
        category_counts[category] = category_counts.get(category, 0) + 1

        with open(file_path, 'r', encoding='utf-8') as file:
            title = file.readline().strip()

        # basename honours the platform's path separator, unlike split('/').
        print(f"{os.path.basename(file_path)} : {title}")

    most_common_category, hits = max(category_counts.items(), key=lambda item: item[1])
    print(f"keyword '{term}' paling banyak ditemukan sebanyak {hits} di kategori '{most_common_category}'")


def load_inverted_index(file_path):
    """Read a pickled inverted index from ``file_path`` and return it.

    NOTE: unpickling can execute arbitrary code, so only load index files
    that come from a trusted source.
    """
    with open(file_path, 'rb') as source:
        return pickle.load(source)

# Example usage:
input_file = 'inverted_index.pkl'  # Replace with the appropriate file name
loaded_index = load_inverted_index(input_file)


In [13]:
# Search terms: look up each keyword in the loaded index and print its matches
searched_term = ['puasa', 'makan', 'jokowi', 'warna', 'bola', 'sepak']

for i in searched_term:
    display_results(loaded_index, i)
    # Separator line between the results of consecutive keywords
    print('=========\n')
    

keyword: "puasa"
Keyword puasa terdapat pada dokumen:
45.txt : Wajibkah Bayar Zakat Fitrah Jika Seseorang Meninggal Dunia di Bulan Ramadhan?
11.txt : Juara BAC 2023, Anthony Ginting Dapat Ucapan Selamat dari Jokowi
42.txt : Anthony Ginting Akhiri Puasa Gelar 16 Tahun, Indonesia Bantai Singapura
66.txt : Makna 'Minal Aidin wal Faizin' Ternyata Kurang Tepat Diucapkan saat Lebaran
58.txt : Mengerikan, Sang Juara Dunia hingga 2 Tunggal Putra Jepang Jadi Korban Ginting
93.txt : Ganas, Ginting Pecundangi Sederet Tunggal Putra di Ranking Dunia
31.txt : Khutbah Jumat: Sempurnakan Puasa Ramadhan dengan Puasa Syawal
171.txt : Waktu, Niat dan Keutamaan Puasa Syawal 2023
224.txt : Bayar Utang Puasa Ramadhan atau Puasa Syawal, Mana yang Lebih Dulu?
6.txt : Anthony Sinisuka Ginting Ungkap Kunci Sukses Jadi Juara Asia
257.txt : Senam Buktikan Mampu Raih Prestasi SEA Games di Tengah Keterbatasan
516.txt : Fakta Mengerikan Tim Indonesia di SEA Games 2023: Banyak Sejarah Tercipta
86.txt : Khutbah Jumat: