# RRDV usando GENSIM

In [None]:
# Dependency: gensim must be installed in the kernel's environment before running this notebook (e.g. `%pip install gensim`).

In [1]:
import nbimporter
from punto2_bsii import ingest_naf_documents, preprocess_text
from gensim.utils import simple_preprocess
from gensim import corpora, models, similarities
import pandas as pd
import numpy as np

### Constantes

**Acá se deben cambiar las rutas a las carpetas `docs-raw-texts` y `queries-raw-texts`.**

In [3]:
# Directory with the raw document files; it is passed to ingest_naf_documents to build the document collection.
DOCS_RAW_DIRECTORY_PATH = "./docs-raw-texts/"
# Directory with the raw query files; it is passed to ingest_naf_documents to build the query set.
QUERIES_DIRECTORY_PATH = "./queries-raw-texts/"

Se inicia cargando y preprocesando los documentos y las consultas (queries) con las funciones definidas en `punto2_bsii.ipynb`. Luego, los identificadores y los tokens se convierten de columnas del DataFrame a listas, para que puedan ser leídos por GENSIM.

In [4]:
def _tokenize_english(raw_text):
    """Run preprocess_text on one raw text with language='english'."""
    return preprocess_text(raw_text, language='english')


def _ingest_with_tokens(directory_path):
    """Call ingest_naf_documents on ``directory_path`` and add a 'tokens' column.

    The 'tokens' column holds the result of preprocessing each row's
    'raw_text' value. Returns the resulting DataFrame.
    """
    frame = ingest_naf_documents(directory_path)
    frame['tokens'] = frame['raw_text'].apply(_tokenize_english)
    return frame


# Documents: ids and token lists are kept as parallel lists (same row order).
df_documents = _ingest_with_tokens(DOCS_RAW_DIRECTORY_PATH)
doc_ids = df_documents['ID'].to_list()
docs_tokens = df_documents['tokens'].to_list()

# Queries: same pipeline, so queries and documents share one token space.
df_queries = _ingest_with_tokens(QUERIES_DIRECTORY_PATH)
query_ids = df_queries['ID'].to_list()
queries_tokens = df_queries['tokens'].to_list()

In [8]:
# Show the first 5 token lists of the documents and of the queries.
for header, token_lists in (
    ("Primeras 5 listas de tokens de documentos:", docs_tokens),
    ("\nPrimeras 5 listas de tokens de queries:", queries_tokens),
):
    print(header)
    print(token_lists[:5])

Primeras 5 listas de tokens de documentos:
[['william', 'makepeac', 'thackeray', 'deft', 'skewer', 'human', 'foibl', 'william', 'makepeac', 'thackeray', 'deft', 'skewer', 'human', 'foibl', 'william', 'makepeac', 'thackeray', 'on', 'juli', 'english', 'novelist', 'william', 'makepeac', 'thackeray', 'born', 'he', 'famou', 'satir', 'work', 'particularli', 'vaniti', 'fair', 'panoram', 'portrait', 'english', 'societi', 'dure', 'victorian', 'era', 'thackeray', 'rank', 'second', 'charl', 'dicken', 'much', 'less', 'read', 'known', 'almost', 'exclus', 'vaniti', 'fair', 'becom', 'standard', 'fixtur', 'univers', 'cours', 'repeatedli', 'adapt', 'movi', 'televis', 'thackeray', 'child', 'born', 'calcutta', 'india', 'father', 'richmond', 'thackeray', 'secretari', 'board', 'revenu', 'british', 'east', 'india', 'compani', 'william', 'father', 'die', 'caus', 'mother', 'ann', 'becher', 'send', 'england', 'educ', 'school', 'southampton', 'chiswick', 'charterhous', 'school', 'in', 'went', 'triniti', 'colleg

Se convierten los tokens de los documentos a diccionario y corpus de Bag of Words de GENSIM.

In [9]:
# Vocabulary built from the document tokens (gensim maps each distinct token to an integer id).
dictionary = corpora.Dictionary(documents=docs_tokens)
# Bag-of-words corpus: one doc2bow vector per document, in the same order as doc_ids.
bow_corpus = list(map(dictionary.doc2bow, docs_tokens))

In [10]:
# Fit the TF-IDF model on the bag-of-words corpus, with vector normalization enabled.
tfidf = models.TfidfModel(corpus=bow_corpus, normalize=True)
# Indexing the model with the corpus yields the TF-IDF-weighted document vectors.
tfidf_corpus = tfidf[bow_corpus]

In [11]:
# Similarity index over the TF-IDF document vectors; the number of features
# is the vocabulary size of the dictionary.
index = similarities.SparseMatrixSimilarity(
    corpus=tfidf_corpus,
    num_features=len(dictionary),
)

In [14]:
# Rank the documents for every query and collect one output line per query,
# formatted as "<query_id>\t<doc_id>:<score>,<doc_id>:<score>,...".
results = []
for qid, q_tokens in zip(query_ids, queries_tokens):
    # Weight the query with the TF-IDF model fitted on the documents, then
    # look it up in the index to get one similarity value per document.
    sims = index[tfidf[dictionary.doc2bow(q_tokens)]]
    # Keep only documents with a positive similarity, highest score first.
    ranked = sorted(
        ((doc_id, sims[pos]) for pos, doc_id in enumerate(doc_ids) if sims[pos] > 0),
        key=lambda pair: pair[1],
        reverse=True,
    )
    sim_str = ",".join(f"{doc}:{score:.4f}" for doc, score in ranked)
    results.append(f"{qid}\t{sim_str}")

# Persist the rankings, one query per line.
with open("GENSIM-consultas_resultados.tsv", "w") as f:
    f.writelines(f"{line}\n" for line in results)

In [16]:
from punto1_metricas import precision_at_k, recall_at_k, ndcg_at_k, mean_average_precision

# Load the relevance judgments: two tab-separated columns, no header row.
# dtype=str keeps the query ids as text, so they always match the ids read
# back from the results file (type inference could turn numeric-looking ids
# into integers, and then no query would ever be matched).
relevance_df = pd.read_csv(
    "relevance-judgments.tsv",
    sep="\t",
    names=["query_id", "docs_relevance"],
    dtype=str,
)

# Build {query_id: {doc_id: grade}} from "doc:grade,doc:grade,..." strings.
relevance_dict = {}
for qid, docs_field in zip(relevance_df["query_id"], relevance_df["docs_relevance"]):
    doc_rel = {}
    # str() turns a missing cell (NaN) into "nan", which has no ":" and is skipped.
    for entry in str(docs_field).split(","):
        if ":" in entry:
            # Split on the last ":" so a doc id containing ":" does not break parsing.
            doc_id, rel = entry.rsplit(":", 1)
            doc_rel[doc_id.strip()] = int(rel)
    relevance_dict[qid] = doc_rel

# Read the rankings produced by GENSIM (one "<query_id>\t<doc>:<score>,..." line per query).
with open("GENSIM-consultas_resultados.tsv") as f:
    lines = f.readlines()

p_at_m_list = []
r_at_m_list = []
ndcg_at_m_list = []
ap_vectors = []

for line in lines:
    record = line.rstrip("\r\n")
    if not record.strip():
        continue  # Ignore blank lines.
    # partition() tolerates a query with no retrieved documents ("<id>\t"),
    # where strip() + split("\t") would yield a single field and fail to unpack.
    query_id, _, docs_str = record.partition("\t")
    query_id = query_id.strip()
    ranked_doc_ids = [d.rsplit(":", 1)[0] for d in docs_str.split(",") if ":" in d]

    judged = relevance_dict.get(query_id, {})
    # M = number of relevant documents for the query. Only grades > 0 count,
    # matching the definition used for the binary relevance vector below.
    M = sum(1 for grade in judged.values() if grade > 0)
    if M == 0:
        continue  # No relevant documents for this query.

    if not ranked_doc_ids:
        # Nothing was retrieved for a query that does have relevant documents,
        # so every metric is 0. The metric helpers live in another notebook and
        # are not visible here, so they are not called with an empty ranking;
        # a single non-relevant placeholder keeps the query in the MAP average
        # (assumes the helper scores an all-zero vector as 0 — TODO confirm).
        p_at_m_list.append(0.0)
        r_at_m_list.append(0.0)
        ndcg_at_m_list.append(0.0)
        ap_vectors.append([0])
        continue

    # Binary relevance vector for P@M, R@M and MAP.
    rel_vector_bin = [1 if judged.get(doc, 0) > 0 else 0 for doc in ranked_doc_ids]
    # Graded relevance vector for NDCG.
    rel_vector_full = [judged.get(doc, 0) for doc in ranked_doc_ids]

    p_at_m_list.append(precision_at_k(rel_vector_bin, M))
    r_at_m_list.append(recall_at_k(rel_vector_bin, M, M))
    ndcg_at_m_list.append(ndcg_at_k(rel_vector_full, M))
    ap_vectors.append(rel_vector_bin)

# Fail with a clear message instead of a ZeroDivisionError when no query could
# be evaluated (e.g. the ids of both files do not match).
if not p_at_m_list:
    raise ValueError(
        "No se evaluó ninguna consulta: verifique que los IDs de "
        "'relevance-judgments.tsv' y 'GENSIM-consultas_resultados.tsv' coincidan."
    )

# Overall MAP.
map_score = mean_average_precision(ap_vectors)

print(f"Promedio P@M: {sum(p_at_m_list)/len(p_at_m_list):.4f}")
print(f"Promedio R@M: {sum(r_at_m_list)/len(r_at_m_list):.4f}")
print(f"Promedio NDCG@M: {sum(ndcg_at_m_list)/len(ndcg_at_m_list):.4f}")
print(f"MAP: {map_score:.4f}")

Promedio P@M: 0.6273
Promedio R@M: 0.6273
Promedio NDCG@M: 0.7061
MAP: 0.7014
