In [15]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.tokenize import word_tokenize
import json
import re
from pymongo import MongoClient
from bson import ObjectId

def tokenizer(text):
    tokens = word_tokenize(text)
    # Optionally, you can apply further preprocessing steps such as removing punctuation or lowercasing
    tokens = [token.lower() for token in tokens if re.match(r'\b\w+\b', token)]
    return tokens

# Load Corpus data
with open('corpus_data.pkl', 'rb') as f:
    corpus = pickle.load(f)
    
# Load the trained VSM model from the pickle file
with open('vsm_model.pkl', 'rb') as f:
    loaded_vsm_model = pickle.load(f)

# Load the preprocessed corpus vectors
with open('corpus_vectors.pkl', 'rb') as f:
    corpus_vectors = pickle.load(f)

# Example test query
query = 'purnama'

# Preprocess the query to lowercase
query_lower = query.lower()

# Vectorize the query using the loaded model
query_vector = loaded_vsm_model.transform([query_lower])

# Initialize an array to store similarity scores and corresponding original_ids
similarities = cosine_similarity(query_vector, corpus_vectors)

# Find indices of top k most similar documents
k = 100  # Adjust k as needed
top_indices = similarities.argsort()[0][-k:][::-1]

# Retrieve corresponding original_ids and text_data
similar_documents = [(i, corpus[i]['original_id'], corpus[i]['text']) for i in top_indices]

num_results = len(similar_documents)
displayed_results = similar_documents[:num_results]

relevant_docs = len([doc for doc in corpus if query.lower() in doc['text'].lower()])
# # retrieved_relevant_docs = len([r for r, _ in displayed_results if query.lower() in r['text'].lower()])
# # print(retrieved_relevant_docs)
precision = num_results / len(displayed_results) if displayed_results else 0
recall = num_results / relevant_docs if relevant_docs > 0 else 0
print(recall)

# print(displayed_results)

list_doc_id = []

for index, doc_id, text in similar_documents:
    object_data = {
        'document_id': doc_id,
        'score': similarities[0][index]
    }
    
    list_doc_id.append(object_data)

def get_data_from_mongodb_with_similarity(documents):
    # Connect to MongoDB
    client = MongoClient('mongodb://root:admin123%23@localhost:27017/?authMechanism=SCRAM-SHA-1&authSource=admin')
    db = client['kpu']
    collection_dataset_caleg = db['col_dataset_caleg']
    
    # List to store retrieved documents with similarity scores
    documents_with_similarity = []

    # Iterate over each document object
    for doc_obj in documents:
        # Extract document_id and score from the document object
        document_id = ObjectId(doc_obj['document_id'])
        score = doc_obj['score']
        
        # Query MongoDB for document with matching ID
        query = {"_id": document_id}
        result = collection_dataset_caleg.find_one(query)

        
        # If document is found, merge with similarity score
        if result:
            result['score'] = score
            documents_with_similarity.append(result)

    # Close MongoDB connection
    client.close()

    return documents_with_similarity


documents_data = get_data_from_mongodb_with_similarity(list_doc_id)

# Print retrieved documents with similarity scores
# for doc in documents_data:
#     print(doc)

2.2222222222222223
