# Tarea 1

## Recuperación ranqueada y vectorización de documentos (RRDV) usando GENSIM

In [2]:
import os
import zipfile
import xml.etree.ElementTree as ET
import numpy as np
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models, similarities
from smart_open import smart_open

In [3]:
# Specify the paths to the compressed files and the target directory
compressed_files = ['docs-raw-texts.zip', 'queries-raw-texts.zip']

In [4]:
# Extract files from each compressed file
for compressed_file in compressed_files:
    with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
        folder_name = os.path.splitext(compressed_file)[0]  # Remove the ".zip" extension
        target_folder = os.path.join(folder_name)
        
        if not os.path.exists(target_folder):
            # Create the folder within the target directory
            os.mkdir(target_folder)
        
            # Extract all files to the target folder
            zip_ref.extractall(target_folder)

print("Extracción completada")

Extracción completada


In [5]:
# Directorios
xml_files_directory = 'docs-raw-texts'

relevance_judgments_directory = "relevance-judgments.tsv"

queries_directory = "queries-raw-texts"

gensim_ext = "gensim"

In [6]:
def extract_raw_text(xml_path: str) -> str:
    """Extracts raw text from a .naf file.

    Args:
        xml_path (str): Path to the .naf file.

    Returns:
        str: The raw text from the .naf file.
    """
    # Parse the XML file
    tree = ET.parse(xml_path)
    root = tree.getroot()

    # Extract content from the XML
    raw_text = root.find('raw').text

    # Check if the title is present in the raw text
    title = root.find(".//nafHeader/fileDesc").get("title")
    if title and title not in raw_text:
        raw_text = title + ", " + raw_text

    return raw_text

In [7]:
p = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')
# Función de preprocesamiento, se usará para todos los inputs al modelo (queries y documentos)
def preprocess_text(text: str):
    """Preprocesa un texto para eliminar palabras vacías, aplicar stemming y convertir a minúsculas.

    Args:
        text (str): El texto a preprocesar.

    Returns:
        List: Una lista con las palabras del texto preprocesado.
    """
    text = text.strip().lower()  # Normalización del texto, todo en minúscula y se quitan espacios innecesarios.
    doc_sw = remove_stopwords(text)
    doc_stem = p.stem_sentence(doc_sw)
    # Usar Gensim para aplicar PorterStemmer y eliminar stopwords
    return tokenizer.tokenize(doc_stem) # Retorna lista con el texto preprocesado

In [9]:
text_corpus_en = []

# Path to the new file
corpus_file_path = os.path.join(gensim_ext, "mycorpusGensim.txt")

with open(corpus_file_path, 'w', encoding='utf-8') as corpus_file:
    # Iterate over XML files in the directory
    for filename in os.listdir(xml_files_directory):
        if filename.endswith('.naf'):
            xml_path = os.path.join(xml_files_directory, filename)
            content = extract_raw_text(xml_path)
            content_without_newlines = content.replace('\n', '')
            encoded_content = content_without_newlines.encode('utf-8', errors='ignore').decode('utf-8')
            corpus_file.write(encoded_content + '\n')
            # Preprocess the content
            preprocessed_tokens = preprocess_text(content)
            text_corpus_en.append(preprocessed_tokens)

print("Corpus created.")

Corpus created.


In [10]:
docDict = []
for doc in text_corpus_en:
    docDict.append(doc)

In [11]:
## Dictionary == Vocab

dictionary = corpora.Dictionary(docDict)
dictionary.save(os.path.join(gensim_ext, "midict.dict"))
print(dictionary)

print(dictionary.token2id)

Dictionary<17040 unique tokens: ['1', '1785', '1812', '1819', '1820']...>
{'1': 0, '1785': 1, '1812': 2, '1819': 3, '1820': 4, '1822': 5, '1833': 6, '19': 7, '2': 8, '21': 9, '6th': 10, '83': 11, 'accid': 12, 'acid': 13, 'activ': 14, 'affect': 15, 'ag': 16, 'alexi': 17, 'also': 18, 'american': 19, 'armi': 20, 'basi': 21, 'beaumont': 22, 'best': 23, 'better': 24, 'bit': 25, 'book': 26, 'born': 27, 'break': 28, 'broken': 29, 'canadian': 30, 'caus': 31, 'chemic': 32, 'chemical': 33, 'children': 34, 'close': 35, 'compani': 36, 'company': 37, 'complet': 38, 'connecticut': 39, 'consid': 40, 'di': 41, 'did': 42, 'differ': 43, 'digest': 44, 'digestion': 45, 'discov': 46, 'earli': 47, 'emotions': 48, 'examin': 49, 'exist': 50, 'expect': 51, 'experi': 52, 'famou': 53, 'father': 54, 'find': 55, 'fistula': 56, 'follow': 57, 'food': 58, 'fort': 59, 'french': 60, 'fur': 61, 'fuse': 62, 'gain': 63, 'gastric': 64, 'gave': 65, 'healed': 66, 'hole': 67, 'human': 68, 'hydrochlor': 69, 'imag': 70, 'import

In [12]:
##  Market Matrix format
# Step 1: Build the corpus from big file
class MyCorpus(object):
    def __iter__(self):
        for line in smart_open(os.path.join(gensim_ext, "mycorpusGensim.txt"),"rb"):
            yield dictionary.doc2bow(preprocess_text(line))

corpus_memory_friendly = MyCorpus()
corpora.MmCorpus.serialize(os.path.join(gensim_ext, "corpus.mm"), corpus_memory_friendly)
corpus = corpora.MmCorpus(os.path.join(gensim_ext, "corpus.mm"))

In [13]:
# Step 2: Build Tfidf model from corpus
dictionary = corpora.Dictionary.load(os.path.join(gensim_ext, 'midict.dict'))
corpus = corpora.MmCorpus(os.path.join(gensim_ext, "corpus.mm"))
tfidf = models.TfidfModel(corpus)

# You can transform any doc using your model
query = "William is counfused by Jahren"
query_doc_bow = dictionary.doc2bow(preprocess_text(query)) # Important: Same corpus preprocess
print(query_doc_bow)
print(tfidf[query_doc_bow]) # Model transformation

[(156, 1), (16888, 1)]
[(156, 0.31272766595519425), (16888, 0.9498428327603555)]


In [14]:
# Step 3: Similarity Matrix
# In this step the index is created
index = similarities.MatrixSimilarity(tfidf[corpus])
index.save(os.path.join(gensim_ext, 'similmatrix.index'))

In [15]:
query = "famous German poetry"
query_doc_bow = dictionary.doc2bow(preprocess_text(query))
sims = index[tfidf[query_doc_bow]]
print(list(enumerate(sims)))

# Sort the vector based on the second value in each tuple
vector= list(enumerate(sims))
sorted_vector = sorted(vector, key=lambda x: x[1], reverse=True)

# Print the sorted vector
for index, value in sorted_vector:
    print(f"Index: {index+1}, Value: {value}")

[(0, 0.0018643981), (1, 0.039983794), (2, 0.0009732105), (3, 0.0015990891), (4, 0.0026977835), (5, 0.0), (6, 0.007200899), (7, 0.0), (8, 0.0), (9, 0.002630067), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.026599672), (14, 0.0), (15, 0.0), (16, 0.0058510797), (17, 0.0064943675), (18, 0.0037904577), (19, 0.0070354133), (20, 0.0041049733), (21, 0.004386156), (22, 0.03154721), (23, 0.0011640565), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.004426163), (29, 0.0), (30, 0.0070466986), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0016865033), (37, 0.0), (38, 0.019163031), (39, 0.0022733212), (40, 0.0), (41, 0.0039188503), (42, 0.0023175117), (43, 0.0054729762), (44, 0.0018363607), (45, 0.0070909336), (46, 0.0062780655), (47, 0.0), (48, 0.0), (49, 0.002184173), (50, 0.015210219), (51, 0.024667013), (52, 0.0), (53, 0.0038941426), (54, 0.0027354206), (55, 0.0), (56, 0.0), (57, 0.008874387), (58, 0.0), (59, 0.0), (60, 0.0017537457), (61, 0.0034467692), (62, 0.0015368236), (63, 

In [16]:
# Function to retrieve and rank documents based on cosine similarity scores
def retrieve_and_rank_documents(query):
    index = similarities.MatrixSimilarity(tfidf[corpus])
    query_doc_bow = dictionary.doc2bow(preprocess_text(query))
    sims = index[tfidf[query_doc_bow]]

    # Create a list of tuples containing indices and cosine similarity scores
    vector = list(enumerate(sims))

    # Filter out elements with a second value of 0
    filtered_vector = [(index, score) for index, score in vector if score != 0]

    # Sort the filtered vector based on the second value in each tuple
    sorted_vector = sorted(filtered_vector, key=lambda x: x[1], reverse=True)

    return sorted_vector

retrieve_and_rank_documents("william")

[(272, 0.14452232),
 (101, 0.09588183),
 (0, 0.09410844),
 (309, 0.09198913),
 (68, 0.08810307),
 (329, 0.08574023),
 (319, 0.08528692),
 (135, 0.07976794),
 (27, 0.07877033),
 (14, 0.07159552),
 (55, 0.06619226),
 (87, 0.06284147),
 (77, 0.05607016),
 (34, 0.047378752),
 (137, 0.042765167),
 (54, 0.041422404),
 (265, 0.036632277),
 (94, 0.031149194),
 (188, 0.028508233),
 (288, 0.028160041),
 (240, 0.025858793),
 (178, 0.024602002),
 (298, 0.022618277),
 (229, 0.02172273),
 (290, 0.02075591),
 (105, 0.02073173),
 (90, 0.020709941),
 (97, 0.020508457),
 (299, 0.01943676),
 (271, 0.01918933),
 (308, 0.017881729),
 (273, 0.017512288),
 (211, 0.017258294),
 (190, 0.016207853),
 (128, 0.016124962),
 (322, 0.016063849),
 (253, 0.015653104),
 (179, 0.015650008),
 (110, 0.014702834),
 (146, 0.014657909),
 (196, 0.01410413),
 (256, 0.013719551),
 (230, 0.013614199),
 (91, 0.013137904),
 (192, 0.012312181),
 (108, 0.012135229),
 (293, 0.0103846025),
 (174, 0.010329217),
 (189, 0.009587484)]

In [18]:
output_filename = "GESIM-consultas_resultados.tsv"

# if os.path.exists(output_filename):
#     print("Results in {} already exist".format(output_filename))
# else:
# Write results to file
results_file = open(output_filename, "w")

# Iterate over queries
for query_file in os.listdir(queries_directory):
    if query_file.endswith('.naf'):
        query_path = os.path.join(queries_directory, query_file)
        query_text = extract_raw_text(query_path)

        vec = retrieve_and_rank_documents(query_text)
        # Separate the vector into two lists
        ranked_documents, similarity_scores = zip(*vec)
        similarity_scores = list(similarity_scores)

        # Convert the indices to match your requirement (doc+1)
        ranked_documents_corrected = [doc + 1 for doc in ranked_documents]
        # Write results to the file
        result_line = query_file[8:-4] + "\t" + ",".join([f"d{doc:03}:{similarity_scores[idx]:.6f}" for idx, doc in enumerate(ranked_documents_corrected)])
        results_file.write(result_line + "\n")
        print("finished query {}".format(query_file))

results_file.close()
print("Results written to "+ output_filename)

finished query wes2015.q01.naf
finished query wes2015.q02.naf
finished query wes2015.q03.naf
finished query wes2015.q04.naf
finished query wes2015.q06.naf
finished query wes2015.q07.naf
finished query wes2015.q08.naf
finished query wes2015.q09.naf
finished query wes2015.q10.naf
finished query wes2015.q12.naf
finished query wes2015.q13.naf
finished query wes2015.q14.naf
finished query wes2015.q16.naf
finished query wes2015.q17.naf
finished query wes2015.q18.naf
finished query wes2015.q19.naf
finished query wes2015.q22.naf
finished query wes2015.q23.naf
finished query wes2015.q24.naf
finished query wes2015.q25.naf
finished query wes2015.q26.naf
finished query wes2015.q27.naf
finished query wes2015.q28.naf
finished query wes2015.q29.naf
finished query wes2015.q32.naf
finished query wes2015.q34.naf
finished query wes2015.q36.naf
finished query wes2015.q37.naf
finished query wes2015.q38.naf
finished query wes2015.q40.naf
finished query wes2015.q41.naf
finished query wes2015.q42.naf
finished

In [19]:
from metrics import precision_at_k, recall_at_k, ndcg_at_k, mean_average_precision

# filtra los resultados de las listas para solo incluir los nombres de los documentos
def filter_result_list(input_list):
    return[full_result[:full_result.find(":")] for full_result in input_list]

# Load the relevance judgments from the file
relevance_judgments_file = open(relevance_judgments_directory, "r")
relevance_judgments = {}

for line in relevance_judgments_file:
    query, judgments = line.strip().split('\t')
    relevance_judgments[query] = judgments.split(',')

relevance_judgments_file.close()

# Load the query results from the file
query_results_file = open(output_filename, "r")
query_results = {}

for line in query_results_file:
    query, results = line.strip().split('\t')
    query_results[query] = results.split(',')

query_results_file.close()

# Calculate metrics for each query
metrics_by_query = {}

for query in query_results:
    query_judgments = relevance_judgments.get(query, [])
    query_results_list = query_results[query]
    query_results_list_filtered = filter_result_list(query_results_list)
    query_judgments_filtered = filter_result_list(query_judgments)
    
    if query in relevance_judgments:
        # Convert relevance judgments to binary values
        relevance_vector = [1 if doc in query_judgments_filtered else 0 for doc in query_results_list_filtered]
        
        # Calculate Precision@M and Recall@M
        precision = precision_at_k(relevance_vector, len(query_judgments))
        recall = recall_at_k(relevance_vector, len(query_judgments), len(query_judgments))
        
        # Calculate NDCG@M using non-binary relevance values
        ndcg = ndcg_at_k([int(judgment[judgment.find(":")+1:]) for judgment in query_judgments], len(query_judgments))
        
        metrics_by_query[query] = {"precision": precision, "recall": recall, "ndcg": ndcg}
    else:
        metrics_by_query[query] = {"precision": 0.0, "recall": 0.0, "ndcg": 0.0}

# Calculate MAP
binary_relevance_vectors = []
for query in metrics_by_query:
    if query in relevance_judgments:
        query_judgments = relevance_judgments[query]
        query_judgments_filtered = filter_result_list(query_judgments)
        
        query_results_list = query_results[query]
        query_results_list_filtered = filter_result_list(query_results_list)

        # Convert relevance judgments to binary values
        relevance_vector = [1 if doc in query_judgments_filtered else 0 for doc in query_results_list_filtered]
        binary_relevance_vectors.append(relevance_vector)

map_value = mean_average_precision(binary_relevance_vectors)

print("Metrics by Query:")
for query, metrics in metrics_by_query.items():
    print(f"Query: {query}")
    print(f"P@M: {metrics['precision']:.4f}, R@M: {metrics['recall']:.4f}, NDCG@M: {metrics['ndcg']:.4f}")
    print()

print(f"MAP: {map_value:.4f}")

Metrics by Query:
Query: q01
P@M: 0.6667, R@M: 0.6667, NDCG@M: 0.9705

Query: q02
P@M: 0.3636, R@M: 0.3636, NDCG@M: 0.8576

Query: q03
P@M: 0.5000, R@M: 0.5000, NDCG@M: 0.9717

Query: q04
P@M: 0.7143, R@M: 0.7143, NDCG@M: 0.9756

Query: q06
P@M: 0.8333, R@M: 0.8333, NDCG@M: 0.8140

Query: q07
P@M: 0.5000, R@M: 0.5000, NDCG@M: 0.9853

Query: q08
P@M: 0.6667, R@M: 0.6667, NDCG@M: 0.8914

Query: q09
P@M: 0.8333, R@M: 0.8333, NDCG@M: 1.0000

Query: q10
P@M: 0.3750, R@M: 0.3750, NDCG@M: 0.8358

Query: q12
P@M: 1.0000, R@M: 1.0000, NDCG@M: 0.9891

Query: q13
P@M: 0.4000, R@M: 0.4000, NDCG@M: 0.8077

Query: q14
P@M: 0.6667, R@M: 0.6667, NDCG@M: 0.8661

Query: q16
P@M: 0.5000, R@M: 0.5000, NDCG@M: 1.0000

Query: q17
P@M: 0.5000, R@M: 0.5000, NDCG@M: 0.9281

Query: q18
P@M: 0.7143, R@M: 0.7143, NDCG@M: 0.9205

Query: q19
P@M: 0.5000, R@M: 0.5000, NDCG@M: 1.0000

Query: q22
P@M: 0.5714, R@M: 0.5714, NDCG@M: 1.0000

Query: q23
P@M: 0.2500, R@M: 0.2500, NDCG@M: 0.8071

Query: q24
P@M: 0.0000, R@M:

## ACA vamos