# Tarea 1

## Recuperación ranqueada y vectorización de documentos (RRDV)

Se vuelve a crear el indice invertido junto a sus frecuencias

In [12]:
import numpy as np
import pandas as pd
import zipfile
import os
import xml.etree.ElementTree as ET
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

In [13]:
# Se extraen los datos de los archivos comprimidos

# Specify the paths to the compressed files and the target directory
compressed_files = ['docs-raw-texts.zip', 'queries-raw-texts.zip']

num_documents = 0
# Extract files from each compressed file
for compressed_file in compressed_files:
    with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
        folder_name = os.path.splitext(compressed_file)[0]  # Remove the ".zip" extension
        target_folder = os.path.join(folder_name)
        
        if not os.path.exists(target_folder):
            # Create the folder within the target directory
            os.mkdir(target_folder)

        
            # Extract all files to the target folder
            zip_ref.extractall(target_folder)

print("Extracción completada")

Extracción completada


### Paths a directorios

In [14]:
# Directorios que contienen los archivos necesarios, cambiar acá si es necesario
xml_files_directory = 'docs-raw-texts'

relevance_judgments_directory = "relevance-judgments.tsv"

queries_directory = "queries-raw-texts"

### Función de preprocesamiento de texto

In [15]:
# Download the NLTK stopwords resource
nltk.download('stopwords')
# NLTK setup
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Función de preprocesamiento, se usará para todos los inputs al modelo (queries y documentos)
def preprocess_text(text):
    text.strip().lower() # normalización del texto, todo en minúscula y se quitan espacios innecesarios.
    tokens = tokenizer.tokenize(text) #tokenización por espacio
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words] # eliminación de palabras vacias y stemming
    return tokens # retorna lista con el texto tokenizado

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def extract_raw_text(xml_path: str) -> str:
    """Extracts raw text from a .naf file.

    Args:
        xml_path (str): Path to the .naf file.

    Returns:
        str: The raw text from the .naf file.
    """
    # Parse the XML file
    tree = ET.parse(xml_path)
    root = tree.getroot()

    # Extract content from the XML
    raw_text = root.find('raw').text

    # Check if the title is present in the raw text
    title = root.find(".//nafHeader/fileDesc").get("title")
    if title and title not in raw_text:
        raw_text = title + ", " + raw_text

    return raw_text

In [17]:
# Dictionary to store the inverted index (term -> list of documents)
inverted_index = {}

# Dictionary to store term frequencies per document (term -> {document: frequency})
term_freq_per_document = {}

# Iterate over XML files in the directory
for filename in os.listdir(xml_files_directory):
    if filename.endswith('.naf'):
        xml_path = os.path.join(xml_files_directory, filename)
        content = extract_raw_text(xml_path)
        # Preprocess the content
        preprocessed_tokens = preprocess_text(content)
        
        # Create the inverted index and update term frequencies per document
        for term in preprocessed_tokens:
            if term in inverted_index:
                if filename not in inverted_index[term]:
                    inverted_index[term].append(filename)
            else:
                inverted_index[term] = [filename]
            
            if term in term_freq_per_document:
                if filename in term_freq_per_document[term]:
                    term_freq_per_document[term][filename] += 1
                else:
                    term_freq_per_document[term][filename] = 1
            else:
                term_freq_per_document[term] = {filename: 1}

print("Inverted index created.")

Inverted index created.


### TF-IDF

$$
TF\text{-}IDF_{t,d} = \log(1 + \text{TF}_{t,d}) \times \log \left( \frac{N}{\text{DF}_t} \right)
$$

In [18]:
def preprocess_input(query_string):
    query_terms = preprocess_text(query_string)
    term_frequency = {}

    for term in query_terms:
        if term in term_frequency:
            term_frequency[term] += 1
        else:
            term_frequency[term] = 1
    
    return term_frequency

# Function to calculate TF-IDF vector for a query
def calculate_tfidf_vector(input, inverted_index, term_freq_per_document_=None, xml_files_directory=xml_files_directory):
    total_documents = len(os.listdir(xml_files_directory))
    query = preprocess_input(input)
    term_indices = {}  # Map term indices to integer indices in the tfidf_vector
    
    for term_index, term in enumerate(inverted_index):
        term_indices[term] = term_index
    
    tfidf_vector = np.zeros(len(inverted_index))  # Initialize a vector of zeros
    
    for term, term_index in inverted_index.items():
        tf = query.get(term, 0)  # Term frequency in the query
        df = len(term_index)
        
        tfidf = (np.log10(1 + tf)) * np.log10(total_documents / df)
        index = term_indices[term]
        tfidf_vector[index] = tfidf
    
    return tfidf_vector

# Example usage
query_string = "William Beaumont is Confused by human physiology"
tfidf_vector = calculate_tfidf_vector(query_string, inverted_index)
print("TF-IDF Vector for query:", query_string)
print(tfidf_vector)
print("elements: {}".format(len(tfidf_vector)))
print("non-zero elements: {}".format(sum(tfidf_vector>0)))

TF-IDF Vector for query: William Beaumont is Confused by human physiology
[0.24710288 0.75854381 0.22326695 ... 0.         0.         0.        ]
elements: 13631
non-zero elements: 5


In [19]:
def calculate_cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    
    if norm_vector1 == 0 or norm_vector2 == 0:
        return 0.0
    
    similarity = dot_product / (norm_vector1 * norm_vector2)
    return similarity

In [20]:
# Function to retrieve and rank documents based on cosine similarity scores
def retrieve_and_rank_documents(query_string, inverted_index, term_freq_per_document, xml_files_directory=xml_files_directory):
    similarity_scores = {}  # Dictionary to store cosine similarity scores
    query_vector = calculate_tfidf_vector(query_string, inverted_index, term_freq_per_document)
    
    for document in os.listdir(xml_files_directory):
        if document.endswith('.naf'):
            document_path = os.path.join(xml_files_directory, document)
            document_text = extract_raw_text(document_path)
            document_vector = calculate_tfidf_vector(document_text, inverted_index, term_freq_per_document, xml_files_directory=xml_files_directory)
            similarity = calculate_cosine_similarity(query_vector, document_vector)
            
            if similarity > 0:
                similarity_scores[document[:-4]] = similarity  # Remove the ".naf" extension
    
    ranked_documents = sorted(similarity_scores.keys(), key=lambda doc: similarity_scores[doc], reverse=True)
    return ranked_documents, similarity_scores

output_filename = "RRDV-consultas_resultados.tsv"

if os.path.exists(output_filename):
    print("Results in {} already exist".format(output_filename))
else:
    # Write results to file
    results_file = open(output_filename, "w")

    # Iterate over queries
    for query_file in os.listdir(queries_directory):
        if query_file.endswith('.naf'):
            query_path = os.path.join(queries_directory, query_file)
            query_text = extract_raw_text(query_path)
            
            ranked_documents, similarity_scores = retrieve_and_rank_documents(query_text, inverted_index, term_freq_per_document)
            
            # Write results to the file
            result_line = query_file[8:-4] + "\t" + ",".join([f"{doc[8:]}:{similarity_scores[doc]:.6f}" for doc in ranked_documents])
            results_file.write(result_line + "\n")
            print("finished query {}".format(query_file))

    results_file.close()
    print("Results written to "+ output_filename)


Results in RRDV-consultas_resultados.tsv already exist


In [21]:
from metrics import precision_at_k, recall_at_k, ndcg_at_k, mean_average_precision

# filtra los resultados de las listas para solo incluir los nombres de los documentos
def filter_result_list(input_list):
    return[full_result[:full_result.find(":")] for full_result in input_list]

# Load the relevance judgments from the file
relevance_judgments_file = open(relevance_judgments_directory, "r")
relevance_judgments = {}

for line in relevance_judgments_file:
    query, judgments = line.strip().split('\t')
    relevance_judgments[query] = judgments.split(',')

relevance_judgments_file.close()

# Load the query results from the file
query_results_file = open("RRDV-consultas_resultados.tsv", "r")
query_results = {}

for line in query_results_file:
    query, results = line.strip().split('\t')
    query_results[query] = results.split(',')

query_results_file.close()

# Calculate metrics for each query
metrics_by_query = {}

for query in query_results:
    query_judgments = relevance_judgments.get(query, [])
    query_results_list = query_results[query]
    query_results_list_filtered = filter_result_list(query_results_list)
    query_judgments_filtered = filter_result_list(query_judgments)
    
    if query in relevance_judgments:
        # Convert relevance judgments to binary values
        relevance_vector = [1 if doc in query_judgments_filtered else 0 for doc in query_results_list_filtered]
        
        # Calculate Precision@M and Recall@M
        precision = precision_at_k(relevance_vector, len(query_judgments))
        recall = recall_at_k(relevance_vector, len(query_judgments), len(query_judgments))
        
        # Calculate NDCG@M using non-binary relevance values
        ndcg = ndcg_at_k([int(judgment[judgment.find(":")+1:]) for judgment in query_judgments], len(query_judgments))
        
        metrics_by_query[query] = {"precision": precision, "recall": recall, "ndcg": ndcg}
    else:
        metrics_by_query[query] = {"precision": 0.0, "recall": 0.0, "ndcg": 0.0}

# Calculate MAP
binary_relevance_vectors = []
for query in metrics_by_query:
    if query in relevance_judgments:
        query_judgments = relevance_judgments[query]
        query_judgments_filtered = filter_result_list(query_judgments)
        
        query_results_list = query_results[query]
        query_results_list_filtered = filter_result_list(query_results_list)

        # Convert relevance judgments to binary values
        relevance_vector = [1 if doc in query_judgments_filtered else 0 for doc in query_results_list_filtered]
        binary_relevance_vectors.append(relevance_vector)

map_value = mean_average_precision(binary_relevance_vectors)

print("Metrics by Query:")
for query, metrics in metrics_by_query.items():
    print(f"Query: {query}")
    print(f"P@M: {metrics['precision']:.4f}, R@M: {metrics['recall']:.4f}, NDCG@M: {metrics['ndcg']:.4f}")
    print()

print(f"MAP: {map_value:.4f}")

Metrics by Query:
Query: q01
P@M: 0.3333, R@M: 0.3333, NDCG@M: 0.9705

Query: q02
P@M: 0.5455, R@M: 0.5455, NDCG@M: 0.8576

Query: q03
P@M: 1.0000, R@M: 1.0000, NDCG@M: 0.9717

Query: q04
P@M: 0.7143, R@M: 0.7143, NDCG@M: 0.9756

Query: q06
P@M: 0.8333, R@M: 0.8333, NDCG@M: 0.8140

Query: q07
P@M: 0.2500, R@M: 0.2500, NDCG@M: 0.9853

Query: q08
P@M: 0.7500, R@M: 0.7500, NDCG@M: 0.8914

Query: q09
P@M: 0.8333, R@M: 0.8333, NDCG@M: 1.0000

Query: q10
P@M: 0.5000, R@M: 0.5000, NDCG@M: 0.8358

Query: q12
P@M: 0.7500, R@M: 0.7500, NDCG@M: 0.9891

Query: q13
P@M: 0.8000, R@M: 0.8000, NDCG@M: 0.8077

Query: q14
P@M: 0.5833, R@M: 0.5833, NDCG@M: 0.8661

Query: q16
P@M: 0.5000, R@M: 0.5000, NDCG@M: 1.0000

Query: q17
P@M: 0.7500, R@M: 0.7500, NDCG@M: 0.9281

Query: q18
P@M: 0.8571, R@M: 0.8571, NDCG@M: 0.9205

Query: q19
P@M: 0.5000, R@M: 0.5000, NDCG@M: 1.0000

Query: q22
P@M: 0.4286, R@M: 0.4286, NDCG@M: 1.0000

Query: q23
P@M: 0.2500, R@M: 0.2500, NDCG@M: 0.8071

Query: q24
P@M: 0.2000, R@M: