In [2]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
import json
import re
from pymongo import MongoClient
import pandas as pd

# Download the NLTK 'punkt' tokenizer data used by word_tokenize() in
# tokenizer() below. This runs on every execution of the cell.
# NOTE(review): newer NLTK releases may need the 'punkt_tab' resource as
# well — confirm against the installed NLTK version.
nltk.download('punkt')

# Relevance judgements, loaded once at module level. average_precision()
# reads the 'rel' column of this frame by row position.
testing_result = pd.read_csv('testing_result.csv')

# Character-level tokenizer: every character of the input becomes one token
def custom_tokenizer(text):
    """Return the characters of *text* as a list, in their original order."""
    return list(text)

def tokenizer(text):
    """Split *text* into lowercased word tokens.

    The text is tokenized with NLTK's ``word_tokenize``. A token is kept only
    when the word pattern matches at the start of the token, which drops
    tokens that begin with punctuation. Kept tokens are lowercased.
    """
    # Bind the compiled pattern's match method once instead of per token.
    matches_word = re.compile(r'\b\w+\b').match
    return [tok.lower() for tok in word_tokenize(text) if matches_word(tok)]

# Function for calculating average precision for a query
def average_precision(qid, qvector, corpus_vectors, top_k=10):
    """Return the average precision of the top-ranked documents for one query.

    Documents are ranked by cosine similarity between ``qvector`` and every
    row of ``corpus_vectors``. The ``top_k`` best-scoring documents are marked
    relevant or not from the ``'rel'`` column of the module-level
    ``testing_result`` frame. Precision@k is then averaged over the ranks at
    which a relevant document was retrieved (the standard definition of
    average precision, restricted to the evaluated documents).

    Args:
        qid: Query identifier. Not used by the computation; kept so existing
            callers keep working.
        qvector: Query vector; it is reshaped to a single row before scoring.
        corpus_vectors: Matrix with one row per document.
        top_k: Number of top-ranked documents to evaluate. Defaults to 10.

    Returns:
        The average precision as a float, or ``0.0`` when none of the
        evaluated documents is relevant.
    """
    # NOTE(review): relevance is looked up by row position in
    # ``testing_result`` and is not filtered by ``qid``. This assumes row i of
    # ``testing_result`` describes row i of ``corpus_vectors`` for every
    # query — confirm against how the CSV was produced.

    # Ranking documents for the query, highest similarity first
    similarity_scores = cosine_similarity(qvector.reshape(1, -1), corpus_vectors).flatten()
    top_indices = np.argsort(similarity_scores)[::-1][:top_k]

    # Binary relevance of each evaluated document. Reading the column once
    # avoids materialising a full DataFrame row per document.
    rel_column = testing_result['rel']
    ranking = np.array(
        [1 if rel_column.iloc[idx] else 0 for idx in top_indices],
        dtype=int,
    )

    # If no relevant document in list then return 0
    if not ranking.any():
        return 0.0

    # precision@k for k = 1..len(ranking); using the actual length keeps the
    # denominators correct when fewer than ``top_k`` documents exist.
    ranks = np.arange(1, ranking.size + 1)
    precision_at_rank = np.cumsum(ranking) / ranks

    # Average only over ranks holding a relevant document; averaging over
    # every rank would not be average precision.
    return float(precision_at_rank[ranking == 1].mean())

# Connect to MongoDB
# NOTE(review): the connection URI embeds a username and password directly in
# source — consider loading it from configuration or an environment variable.
client = MongoClient('mongodb://root:admin123%23@localhost:27017/?authMechanism=SCRAM-SHA-1&authSource=admin')
db = client['kpu']
collection_dataset_caleg_training = db['col_dataset_caleg_text']

# Retrieve data from MongoDB collection
# NOTE(review): limit(1) fetches a single document only — confirm this is
# intentional and not a leftover from debugging.
data_collection = list(collection_dataset_caleg_training.find().limit(1))

# Convert the id fields to strings (presumably ObjectId values) so that
# json.dumps below can serialise the documents
for data in data_collection:
    data['_id'] = str(data['_id'])
    data['original_id'] = str(data['original_id'])

# Round-trip the documents through JSON to obtain plain dict/list/str values
corpus = json.loads(json.dumps(data_collection))

# Save the corpus to disk as a pickle file
with open('corpus_data.pkl', 'wb') as f:
    pickle.dump(corpus, f)

# Extract text data from objects
# NOTE(review): `texts` is not referenced again in the code visible here.
texts = [item['text'] for item in corpus]

# Load TF-IDF vectorizer and corpus vectors
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load these files when they come from a trusted source.
with open('vsm_model.pkl', 'rb') as f:
    loaded_vsm_model = pickle.load(f)

with open('corpus_vectors.pkl', 'rb') as f:
    corpus_vectors = pickle.load(f)

# Test queries; the code below reads the 'qid' and 'vector' columns.
testing_queries = pd.read_csv('testing_queries.csv')

# Calculating average precision for all queries in the test set
# Each row's 'vector' value is passed to the loaded model's transform() as a
# one-element list, and the result is scored against corpus_vectors.
# NOTE(review): if the loaded model is a TfidfVectorizer, transform() expects
# raw text — confirm that the 'vector' column holds query text.
testing_queries['AP'] = testing_queries.apply(lambda x: average_precision(x['qid'], loaded_vsm_model.transform([x['vector']]), corpus_vectors), axis=1)

# Finding Mean Average Precision
print('Mean Average Precision=>', testing_queries['AP'].mean())


[nltk_data] Downloading package punkt to /Users/reka-
[nltk_data]     alamsyah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyboardInterrupt: 