In [7]:
import time
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from pymongo import MongoClient
import json

# Load pre-trained BERT model and tokenizer for Indonesian
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
model = BertModel.from_pretrained('indobenchmark/indobert-base-p1')

# Connect to MongoDB.
# The connection string carries the database password, so it is read from the
# MONGODB_URI environment variable when that is set. The literal is kept only
# as a fallback so that existing runs behave exactly as before.
# NOTE(review): drop the fallback (and rotate the password) once MONGODB_URI
# is configured wherever this notebook runs.
import os

client = MongoClient(os.environ.get(
    'MONGODB_URI',
    'mongodb://root:admin123%23@localhost:27017/?authMechanism=SCRAM-SHA-1&authSource=admin',
))
db = client['kpu']
collection_dataset_caleg_training = db['col_dataset_caleg_text']

# Retrieve every record of the collection into memory
data_collection = list(collection_dataset_caleg_training.find())

# Stringify the id fields in place so the records can be JSON-encoded below
for data in data_collection:
    data['_id'] = str(data['_id'])
    data['original_id'] = str(data['original_id'])

# Round-trip through JSON: `documents` is an independent copy of the records
# made only of JSON-native values (nothing is printed here)
documents = json.loads(json.dumps(data_collection))

def min_max_normalize(scores):
    """Rescale ``scores`` linearly so the smallest maps to 0 and the largest to 1.

    Args:
        scores: A sized sequence of numbers (a list or an array-like object
            supporting ``len()``, ``min()``, ``max()`` and iteration).

    Returns:
        A new list with one normalized value per input score, in the same
        order. An empty input yields an empty list. When every score is
        identical there is no spread to scale by, so every entry is 0.0.
    """
    # len() rather than truthiness: array-like inputs may not define bool().
    if len(scores) == 0:
        return []

    min_score = min(scores)
    score_range = max(scores) - min_score

    # All scores equal: dividing by the zero range would raise
    # ZeroDivisionError (or produce NaN for array scalars).
    if score_range == 0:
        return [0.0 for _ in scores]

    return [(score - min_score) / score_range for score in scores]


# Function to preprocess documents for BM25
def preprocess_documents(documents):
    """Turn each document into the token list used to build the BM25 corpus.

    For every entry the ``'_id'`` value (unchanged) and the lower-cased
    ``'text'`` value are joined with a single space and passed to the
    module-level ``tokenizer``.

    Args:
        documents: Iterable of mappings with string ``'_id'`` and ``'text'`` keys.

    Returns:
        A list holding one tokenizer result per document, in input order.
    """
    token_lists = []
    for entry in documents:
        # Only the text is lower-cased; the id is prepended as-is.
        combined = entry['_id'] + ' ' + entry['text'].lower()
        token_lists.append(tokenizer.tokenize(combined))
    return token_lists

# Preprocess documents for BM25 and time tokenization plus index construction.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
tokenized_docs = preprocess_documents(documents)
bm25 = BM25Okapi(tokenized_docs)
preprocess_time = time.perf_counter() - start_time

# Function to perform search using BM25
def bm25_search(query, top_n=5):
    """Rank the module-level ``documents`` against ``query`` using the global BM25 index.

    The query is lower-cased and tokenized with the module-level ``tokenizer``,
    scored by the module-level ``bm25`` index, and the scores are rescaled
    with ``min_max_normalize``.

    Args:
        query: Search string.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(_id, text, normalized_score)`` tuples, best score first.
    """
    query_tokens = tokenizer.tokenize(query.lower())
    scaled = min_max_normalize(bm25.get_scores(query_tokens))

    # Stable descending sort on (position, score) pairs: ties keep corpus order.
    ranked = sorted(enumerate(scaled), key=lambda pair: pair[1], reverse=True)

    hits = []
    for position, relevance in ranked[:top_n]:
        hit = documents[position]
        hits.append((hit['_id'], hit['text'], relevance))
    return hits

# Example query
query = "demokrat"

# Perform BM25 search, timed with the monotonic high-resolution clock
# (time.time() follows the wall clock and is a poor fit for short durations)
start_time = time.perf_counter()
bm25_search_results = bm25_search(query)
search_time = time.perf_counter() - start_time

# Display BM25 search results.
# bm25_search returns up to top_n entries whatever their score, so the else
# branch is only reached when the result list itself is empty.
if bm25_search_results:
    print("BM25 Search Results:")
    for i, (doc_id, doc_text, score) in enumerate(bm25_search_results, start=1):
        print(f"{i}. Document ID: {doc_id} - Text: {doc_text} - Score: {score:.4f}")
else:
    print("No relevant documents found.")

print("Preprocessing Time:", preprocess_time)
print("Search Time:", search_time)

BM25 Search Results:
1. Document ID: 65f4839455c3c7ebac0ca104 - Text: daftar pilihan calon legislatif dewan perwakilan daerah kabupaten kota karimun satu dengan nomor urut partai lima partai nasional demokrat nomor urut calon enam heti kartika jenis kelamin perempuan calon legislatif tahun 2019 - Score: 1.0000
2. Document ID: 65f4839155c3c7ebac0c95db - Text: daftar pilihan calon legislatif dewan perwakilan daerah kabupaten kota bengkulu utara satu dengan nomor urut partai empat belas partai demokrat nomor urut calon tiga mulyani mulyani jenis kelamin perempuan calon legislatif tahun 2019 - Score: 0.9913
3. Document ID: 65f4839155c3c7ebac0c97bb - Text: daftar pilihan calon legislatif dewan perwakilan daerah kabupaten kota kaur tiga dengan nomor urut partai lima partai nasional demokrat nomor urut calon tiga nita ria jenis kelamin perempuan calon legislatif tahun 2019 - Score: 0.9913
4. Document ID: 65f4839455c3c7ebac0ca06a - Text: daftar pilihan calon legislatif dewan perwakilan daerah 

In [17]:
import time
from pymongo import MongoClient
import json
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer

# Connect to MongoDB.
# The connection string carries the database password, so it is read from the
# MONGODB_URI environment variable when that is set. The literal is kept only
# as a fallback so that existing runs behave exactly as before.
# NOTE(review): drop the fallback (and rotate the password) once MONGODB_URI
# is configured wherever this notebook runs.
import os

client = MongoClient(os.environ.get(
    'MONGODB_URI',
    'mongodb://root:admin123%23@localhost:27017/?authMechanism=SCRAM-SHA-1&authSource=admin',
))
db = client['kpu']
collection_dataset_caleg_training = db['col_dataset_caleg_text']

# Retrieve every record of the collection into memory
data_collection = list(collection_dataset_caleg_training.find())

# Stringify the id fields in place so the records can be JSON-encoded below
for data in data_collection:
    data['_id'] = str(data['_id'])
    data['original_id'] = str(data['original_id'])

# Round-trip through JSON: `documents` is an independent copy of the records
# made only of JSON-native values (nothing is printed here)
documents = json.loads(json.dumps(data_collection))

# Load pre-trained BERT tokenizer for Indonesian
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

# Function to tokenize documents
def tokenize_documents(documents):
    """Tokenize every document with the module-level ``tokenizer``.

    Each entry contributes its ``'_id'`` value (unchanged), a single space and
    its lower-cased ``'text'`` value; that combined string is what gets tokenized.

    Args:
        documents: Iterable of mappings with string ``'_id'`` and ``'text'`` keys.

    Returns:
        A list with one tokenizer result per document, in input order.
    """
    def _to_tokens(record):
        # The id keeps its original casing; only the text is lower-cased.
        return tokenizer.tokenize(record['_id'] + ' ' + record['text'].lower())

    return list(map(_to_tokens, documents))

# Tokenize documents and time the step.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
tokenized_docs = tokenize_documents(documents)
tokenization_time = time.perf_counter() - start_time

In [18]:
# Function to create BM25 index
def create_bm25_index(tokenized_docs):
    """Build a ``BM25Okapi`` index over an already tokenized corpus.

    Args:
        tokenized_docs: The corpus, passed unchanged to the ``BM25Okapi``
            constructor.

    Returns:
        The newly constructed ``BM25Okapi`` instance.
    """
    index = BM25Okapi(tokenized_docs)
    return index

# Create BM25 index and time the build.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
bm25 = create_bm25_index(tokenized_docs)
indexing_time = time.perf_counter() - start_time

# Function to preprocess query
def preprocess_query(query):
    """Lower-case ``query`` and tokenize it with the module-level ``tokenizer``.

    Args:
        query: Search string.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the lower-cased query.
    """
    lowered = query.lower()
    return tokenizer.tokenize(lowered)

# Function to perform search using BM25
def bm25_search(query, bm25_index, documents, top_n=5):
    """Rank ``documents`` against ``query`` with a BM25 index.

    The query is tokenized with ``preprocess_query``, scored through
    ``bm25_index.get_scores`` and the scores are min-max scaled to [0, 1].

    Args:
        query: Search string.
        bm25_index: Object exposing ``get_scores(tokens)`` that returns one
            score per indexed document.
        documents: Sequence of mappings with ``'_id'`` and ``'text'`` keys,
            in the same order as the documents behind ``bm25_index``.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(_id, text, normalized_score)`` tuples, best score first.
        An empty score list yields an empty result. When every score is
        identical there is no spread to scale by, so all normalized scores
        are 0.0 and the first ``top_n`` documents are returned in corpus order.
    """
    tokenized_query = preprocess_query(query)
    doc_scores = bm25_index.get_scores(tokenized_query)

    # len() rather than truthiness: array-like score containers may not define bool().
    if len(doc_scores) == 0:
        return []

    # Normalize scores using min-max scaling
    min_score = min(doc_scores)
    score_range = max(doc_scores) - min_score
    if score_range == 0:
        # All scores equal (presumably what happens when no query token matches
        # any document): dividing by the zero range would raise
        # ZeroDivisionError or produce NaN scores.
        normalized_scores = [0.0 for _ in doc_scores]
    else:
        normalized_scores = [(score - min_score) / score_range for score in doc_scores]

    # Stable descending sort, so ties keep corpus order.
    top_doc_indices = sorted(range(len(normalized_scores)), key=lambda i: normalized_scores[i], reverse=True)[:top_n]
    return [(documents[i]['_id'], documents[i]['text'], normalized_scores[i]) for i in top_doc_indices]

# Example query
query = "yanto maryanti"

# Perform BM25 search, timed with the monotonic high-resolution clock
# (time.time() follows the wall clock and is a poor fit for short durations)
start_time = time.perf_counter()
bm25_search_results = bm25_search(query, bm25, documents)
search_time = time.perf_counter() - start_time

# Display BM25 search results.
# bm25_search returns up to top_n entries whatever their score, so the else
# branch is only reached when the result list itself is empty.
if bm25_search_results:
    print("BM25 Search Results:")
    for i, (doc_id, doc_text, score) in enumerate(bm25_search_results, start=1):
        print(f"{i}. Document ID: {doc_id} - Text: {doc_text} - Score: {score:.4f}")
else:
    print("No relevant documents found.")

print("Tokenization Time:", tokenization_time)
print("Indexing Time:", indexing_time)
print("Search Time:", search_time)

BM25 Search Results:
1. Document ID: 65f4839455c3c7ebac0ca385 - Text: daftar pilihan calon legislatif dewan perwakilan daerah kabupaten kota lingga tiga dengan nomor urut partai satu partai kebangkitan bangsa nomor urut calon enam maryanti maryanti jenis kelamin perempuan calon legislatif tahun 2019 - Score: 1.0000
2. Document ID: 65f4839755c3c7ebac0cad77 - Text: daftar pilihan calon legislatif dewan perwakilan daerah kabupaten kota muara enim dua dengan nomor urut partai lima partai nasional demokrat nomor urut calon tiga maryanti maryanti jenis kelamin perempuan calon legislatif tahun 2019 - Score: 0.9821
3. Document ID: 65f4839155c3c7ebac0c95fa - Text: daftar pilihan calon legislatif dewan perwakilan daerah kabupaten kota bengkulu utara dua dengan nomor urut partai empat golongan karya nomor urut calon satu yanto yanto jenis kelamin laki-laki calon legislatif tahun 2019 - Score: 0.9000
4. Document ID: 65f4839255c3c7ebac0c9bc1 - Text: daftar pilihan calon legislatif dewan perwakilan 