# Recuperación ranqueada y vectorización de documentos (RRDV)

In [78]:
# Libraries needed to import code from other notebooks. To install from this cell, prefix the command with `!`.
# pip install nbformat

In [79]:
import os

import nbimporter  # keep before the notebook import below (presumably enables importing .ipynb files — confirm)
import numpy as np
import pandas as pd

from punto2_bsii import parse_naf_document, ingest_naf_documents, preprocess_text, build_inverted_index

### Constantes

**Acá se deben cambiar los paths a las carpetas docs-raw-texts y queries-raw-texts**

In [80]:
# Input folders, given relative to the notebook's working directory.
DOCS_RAW_DIRECTORY_PATH = "./docs-raw-texts/"
QUERIES_DIRECTORY_PATH = "./queries-raw-texts/"

**[10p] Cree una función que, a partir del índice invertido, cree la representación vectorial ponderada tf.idf
de un documento o consulta. Describa en detalle su estrategia, ¿es eficiente? ¿por qué si, por qué no?**

Inicialmente se cargan y preprocesan los documentos, luego se crea el índice invertido usando la función *build_inverted_index* del notebook `punto2_bsii.ipynb`.

In [81]:

# Load the corpus from the path set in the constants cell (it is not redefined here,
# so a path edited there is actually honored), then tokenize every document.
df_documents = ingest_naf_documents(DOCS_RAW_DIRECTORY_PATH)
df_documents['tokens'] = df_documents['raw_text'].apply(lambda x: preprocess_text(x, language='english'))

# Build the inverted index from the tokenized documents.
inverted_index = build_inverted_index(df_documents, id_col='ID', tokens_col='tokens')

  "metadata": {},


In [82]:
# Show the first rows of the inverted index
inverted_index.head()

Unnamed: 0,term,postings,doc_freq,term_freqs
0,a,"[d102, d035, d116, d071, d250, d114, d156, d18...",129,"{'d003': 2, 'd004': 1, 'd006': 1, 'd011': 1, '..."
1,aachen,"[d252, d139, d161]",3,"{'d139': 1, 'd161': 2, 'd252': 1}"
2,aazv,[d156],1,{'d156': 1}
3,ab,[d224],1,{'d224': 1}
4,abadon,[d062],1,{'d062': 1}


In [83]:
def compute_tfidf_matrix_terms_docs(df_documents: pd.DataFrame, inverted_index: pd.DataFrame) -> pd.DataFrame:
    """
    Build the tf-idf matrix with rows=terms and columns=documents using:
        w_{t,d} = log10(1 + tf_{t,d}) * log10(N / df_t)
    where tf_{t,d} is the frequency of term t in document d, df_t is the number of
    documents containing t, and N is the total number of documents.

    The weights are written into a pre-allocated NumPy array addressed by integer
    position and wrapped in a DataFrame once at the end. Only the non-zero entries
    listed in the postings are visited, so the fill cost is proportional to the
    number of (term, document) pairs; the dense result still needs
    n_terms * n_docs floats of memory.

    Args:
        df_documents: DataFrame with one row per document; its 'ID' column defines
            the matrix columns and its length defines N.
        inverted_index: DataFrame with columns 'term', 'doc_freq' and 'term_freqs'
            (a dict {doc_id: freq} per term).
    Returns:
        DataFrame: tf-idf matrix (rows=terms in index order, columns=doc ids).
        Terms with doc_freq <= 0 get an all-zero row. Document ids that appear in
        'term_freqs' but not in df_documents['ID'] are ignored.
    """
    n_docs = len(df_documents)
    terms = inverted_index['term'].tolist()
    doc_ids = df_documents['ID'].tolist()
    # doc_id -> column position, so every posting is placed with one dict lookup
    doc_position = {doc_id: col for col, doc_id in enumerate(doc_ids)}
    weights = np.zeros((len(terms), len(doc_ids)), dtype=float)

    postings = zip(inverted_index['doc_freq'].tolist(), inverted_index['term_freqs'].tolist())
    for row_pos, (doc_freq, term_freqs) in enumerate(postings):
        if doc_freq <= 0:
            continue  # idf is defined as 0 here, so the whole row stays at zero
        idf = np.log10(n_docs / doc_freq)
        for doc_id, tf in term_freqs.items():
            col_pos = doc_position.get(doc_id)
            if col_pos is None:
                # Unknown document: skip it rather than growing the matrix with a stray column.
                continue
            weights[row_pos, col_pos] = np.log10(1 + tf) * idf

    return pd.DataFrame(weights, index=terms, columns=doc_ids)



Se aplica la función *compute_tfidf_matrix_terms_docs* usando el índice invertido computado previamente y el DataFrame de documentos (`df_documents`).

In [84]:
# Build the term-by-document tf-idf matrix and preview its first rows.
tfidf_matrix_terms_docs = compute_tfidf_matrix_terms_docs(df_documents, inverted_index)
tfidf_matrix_terms_docs.head()

Unnamed: 0,d102,d035,d321,d094,d014,d143,d116,d088,d308,d319,...,d318,d329,d153,d128,d172,d098,d043,d208,d261,d236
a,0.123193,0.123193,0.0,0.0,0.0,0.0,0.195256,0.0,0.0,0.0,...,0.0,0.0,0.123193,0.0,0.123193,0.123193,0.123193,0.123193,0.0,0.0
aachen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aazv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abadon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**[10p] Cree una función que reciba dos vectores de documentos y calcule la similitud del coseno**

Se construye una implementación para comparar dos vectores, también se realiza una implementación para comparar un vector de consultas con una matriz tf_idf.

In [85]:
def cosine_similarity(vec1, vec2):
    """
    Cosine of the angle between two vectors.

    Args:
        vec1: array-like (list, numpy array or pandas Series).
        vec2: array-like with the same length as vec1.
    Returns:
        float: dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when either vector has
        zero norm. For non-negative vectors (such as tf-idf weights) the value
        lies in [0, 1]; in general it lies in [-1, 1].
    """
    a = np.array(vec1)
    b = np.array(vec2)
    numerator = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # A zero vector has no direction: report 0 instead of dividing by zero.
    if not (norm_a != 0 and norm_b != 0):
        return 0.0
    return numerator / (norm_a * norm_b)

In [86]:
def cosine_similarity_matrix(query_vec, doc_matrix):
    """
    Cosine similarity between one query vector and every column of a document matrix.

    Args:
        query_vec: numpy array (shape: n_terms,)
        doc_matrix: numpy array (shape: n_terms, n_docs), one document per column
    Returns:
        numpy array (shape: n_docs,) with one similarity per document. Entries are
        0.0 wherever the document column, or the query itself, has zero norm.
    """
    numerators = np.dot(query_vec, doc_matrix)
    column_norms = np.linalg.norm(doc_matrix, axis=0)
    q_norm = np.linalg.norm(query_vec)
    # Positions where both vectors have a direction and the ratio is meaningful.
    defined = (column_norms > 0) & (q_norm > 0)
    # The division is evaluated everywhere; silence the 0/0 warnings for the
    # positions that are overwritten with 0.0 right after.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = numerators / (column_norms * q_norm)
    return np.where(defined, ratios, 0.0)

**[5p] Para cada una de las 35 consultas en el conjunto de datos, recupere los documentos clasificados -
ordenados por el puntaje de similitud del coseno- (incluya solo los documentos con un puntaje superior a 0
para una consulta determinada).**

#### 1. Extraer las queries

La ingesta de los documentos NAF con queries se hace reutilizando la función *ingest_naf_documents*. Se organizan las consultas por ID, de forma ascendente.

In [87]:
def ingest_queries(directory):
    """Ingest the files under `directory` with ingest_naf_documents and keep only the 'ID' and 'raw_text' columns."""
    wanted_columns = ['ID', 'raw_text']
    return ingest_naf_documents(directory)[wanted_columns]


# Load the queries and order them by ascending ID.
df_queries = ingest_queries(QUERIES_DIRECTORY_PATH).sort_values('ID')

#### 2. Calcular similitud coseno

Para calcular la similitud coseno, se tienen en cuenta las siguientes optimizaciones:

- Cálculo simultáneo de similitudes: La similitud coseno entre la consulta y todos los documentos se calcula en una sola operación usando la función *cosine_similarity_matrix*, evitando bucles innecesarios y aprovechando la eficiencia de las operaciones matriciales.
- Se utilizan diccionarios en Python para acceder rápidamente a la información de relevancia de cada documento. El método get() de los diccionarios permite consultar si un documento es relevante para una consulta específica de forma eficiente y segura, permitiendo acceder en tiempo constante `(O(1))` a la relevancia de cada documento dado su ID. Esto evita recorridos innecesarios sobre la totalidad de la matriz.

In [88]:
# NOTE(review): same call, with the same arguments, as the earlier cell that first built this matrix.
tfidf_matrix_terms_docs = compute_tfidf_matrix_terms_docs(df_documents, inverted_index)

In [89]:
# Map each vocabulary term to its row position in the tf-idf matrix.
term_to_idx = {term: i for i, term in enumerate(tfidf_matrix_terms_docs.index)}

In [90]:
# Display the inverted index (one row per term).
inverted_index

Unnamed: 0,term,postings,doc_freq,term_freqs
0,a,"[d102, d035, d116, d071, d250, d114, d156, d18...",129,"{'d003': 2, 'd004': 1, 'd006': 1, 'd011': 1, '..."
1,aachen,"[d252, d139, d161]",3,"{'d139': 1, 'd161': 2, 'd252': 1}"
2,aazv,[d156],1,{'d156': 1}
3,ab,[d224],1,{'d224': 1}
4,abadon,[d062],1,{'d062': 1}
...,...,...,...,...
12767,zurich,"[d014, d143, d113, d112, d030, d047, d059, d21...",11,"{'d014': 2, 'd030': 2, 'd047': 2, 'd059': 1, '..."
12768,zuse,"[d211, d202]",2,"{'d202': 1, 'd211': 20}"
12769,zwicki,[d253],1,{'d253': 17}
12770,zworykin,"[d071, d068]",2,"{'d068': 1, 'd071': 4}"


In [91]:
# tfidf_matrix_terms_docs was already built in an earlier cell from the same
# df_documents / inverted_index, so it is reused here instead of being recomputed.
term_to_idx = {term: i for i, term in enumerate(tfidf_matrix_terms_docs.index)}
doc_ids = tfidf_matrix_terms_docs.columns.tolist()
tfidf_matrix_np = tfidf_matrix_terms_docs.values  # (n_terms, n_docs)
N_docs = len(df_documents)
# term -> document frequency, built once so each query term costs one dict lookup
# instead of a boolean scan over the whole inverted index.
term_to_doc_freq = dict(zip(inverted_index['term'], inverted_index['doc_freq']))

results = []
for _, row in df_queries.iterrows():
    query_id = row['ID']
    query_text = row['raw_text']
    query_tokens = preprocess_text(query_text, language='english')
    tf_counts = pd.Series(query_tokens).value_counts()
    query_vec = np.zeros(len(term_to_idx))
    # Only terms present in both the query and the collection vocabulary get a weight.
    for term, tf in tf_counts.items():
        idx = term_to_idx.get(term)
        doc_freq = term_to_doc_freq.get(term, 0)
        if idx is None or doc_freq <= 0:
            continue
        # Same weighting as the documents: log10(1 + tf) * log10(N / df).
        query_vec[idx] = np.log10(1 + tf) * np.log10(N_docs / doc_freq)
    # Vectorized: similarity against every document in a single call.
    sims = cosine_similarity_matrix(query_vec, tfidf_matrix_np)
    # Keep only documents with a strictly positive score, best first.
    sim_scores = [(doc, score) for doc, score in zip(doc_ids, sims) if score > 0]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_str = ",".join(f"{doc}:{score:.4f}" for doc, score in sim_scores)
    results.append(f"{query_id}\t{sim_str}")

# Create the output folder if needed so the write does not fail when it is missing.
os.makedirs("resultados", exist_ok=True)
with open("resultados/RRDV-consultas_resultados.tsv", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in results)

**[10p] Evaluación de resultados. Calcule P@M, R@M, NDCG@M por consulta. M es el número de
documentos relevantes encontrados en el archivo de juicios de relevancia por consulta. Luego calcule MAP
como una métrica general.**

NOTA I: Para P@M y R@M suponga una escala de relevancia binaria. Los documentos que no se
encuentran en el archivo "relevance-judgments" NO son relevantes para una consulta determinada.

NOTA II: Para NDCG@M utilice la escala de relevancia no binaria que se encuentra en el archivo
"relevance-judgments".

#### Extracción de documento con relevance judgements e importar funciones para calcular las métricas

In [92]:
from punto1_metricas import precision_at_k, recall_at_k, ndcg_at_k, mean_average_precision

In [93]:
relevance_df = pd.read_csv("relevance-judgments.tsv", sep="\t", names=["query_id", "docs_relevance"])

# Parse the judgments into {query_id: {doc_id: grade}}.
# Each row holds a comma-separated list of "doc_id:grade" entries; entries without ":" are ignored.
relevance_dict = {}
for qid, raw_entries in zip(relevance_df["query_id"], relevance_df["docs_relevance"]):
    pairs = (entry.split(":") for entry in str(raw_entries).split(",") if ":" in entry)
    relevance_dict[qid] = {doc_id: int(rel) for doc_id, rel in pairs}

In [94]:
# Sanity check: print the parsed judgments for three sample queries.
print("Ver formato de relevance_dict:")
for idx in ('q01', 'q02', 'q03'):
    print(idx, relevance_dict[idx])

Ver formato de relevance_dict:
q01 {'d186': 4, 'd254': 5, 'd016': 5}
q02 {'d136': 2, 'd139': 2, 'd143': 4, 'd283': 4, 'd228': 4, 'd164': 4, 'd318': 2, 'd291': 4, 'd293': 4, 'd147': 2, 'd149': 2}
q03 {'d152': 3, 'd291': 4, 'd283': 4, 'd147': 3, 'd318': 2, 'd105': 2}


Se leen los resultados generados.

In [None]:
# Read back the ranking file written by the retrieval step (one line per query).
with open("resultados/RRDV-consultas_resultados.tsv") as f:
    lines = f.readlines()

Para cada consulta, se extraen los documentos recuperados y sus scores.
Luego, se construyen dos vectores de relevancia:
- Un vector binario para calcular P@M y R@M, donde 1 indica un documento relevante.
- Un vector con los grados de relevancia para calcular NDCG@M.

Con estos vectores, se calculan las métricas P@M, R@M y NDCG@M. Al final, se calcula MAP general.

In [None]:
p_at_m_list = []
r_at_m_list = []
ndcg_at_m_list = []
ap_vectors = []
for line in lines:
    # partition() instead of strip().split("\t"): a query that retrieved nothing is
    # stored as "<query_id>\t", and strip() would remove that tab and break the
    # two-value unpacking.
    query_id, _, docs_str = line.rstrip("\r\n").partition("\t")
    if not query_id:
        continue  # blank line
    judgments = relevance_dict.get(query_id, {})
    M = len(judgments)  # number of judged-relevant documents for this query
    if M == 0:
        continue  # no relevant documents for this query
    # Ranked doc ids, in file order (already sorted by descending score).
    ranked_doc_ids = [entry.split(":", 1)[0] for entry in docs_str.split(",") if ":" in entry]
    # Graded relevance for NDCG; documents absent from the judgments count as 0.
    rel_vector_full = [judgments.get(doc, 0) for doc in ranked_doc_ids]
    # Binary relevance for P@M, R@M and AP.
    rel_vector_bin = [1 if grade > 0 else 0 for grade in rel_vector_full]
    # NOTE(review): for an empty ranking the vectors are empty; the result then
    # depends on how the punto1_metricas helpers handle empty input — confirm.
    p_at_m = precision_at_k(rel_vector_bin, M)
    r_at_m = recall_at_k(rel_vector_bin, M, M)
    ndcg_at_m = ndcg_at_k(rel_vector_full, M)
    p_at_m_list.append(p_at_m)
    r_at_m_list.append(r_at_m)
    ndcg_at_m_list.append(ndcg_at_m)
    ap_vectors.append(rel_vector_bin)

map_score = mean_average_precision(ap_vectors)

In [96]:
# Averages over the evaluated queries (those with at least one judged document), plus MAP.
print(f"Promedio P@M: {sum(p_at_m_list)/len(p_at_m_list):.4f}")
print(f"Promedio R@M: {sum(r_at_m_list)/len(r_at_m_list):.4f}")
print(f"Promedio NDCG@M: {sum(ndcg_at_m_list)/len(ndcg_at_m_list):.4f}")
print(f"MAP: {map_score:.4f}")

Promedio P@M: 0.6353
Promedio R@M: 0.6353
Promedio NDCG@M: 0.7326
MAP: 0.7352
