In [5]:
import configparser
import nltk
import xml.etree.ElementTree as ET
import unicodedata
from unidecode import unidecode
import string
import re
import csv
from tqdm import tqdm

In [38]:
# Load the run configuration. read_file() is used instead of read() because
# read() silently skips files it cannot open, which would only surface later
# as a confusing NoOptionError instead of a clear "file not found" error.
config = configparser.ConfigParser()
with open('PC.cfg') as config_file:
    config.read_file(config_file)

# Path of the XML file parsed below and of the two CSV files written below.
query_input_file = config.get('DEFAULT', 'LEIA')
query_output_file = config.get('DEFAULT', 'CONSULTAS')
esperados_output_file = config.get('DEFAULT', 'ESPERADOS')

def parse_queries(file_path, first_field, second_field):
    """Build a mapping between two child fields of every ``QUERY`` element.

    Args:
        file_path: XML source handed to ``ET.parse``.
        first_field: Tag of the child element whose stripped text is the key.
        second_field: Tag of the child element whose stripped text is the value.

    Returns:
        dict of ``{stripped first_field text: stripped second_field text}``,
        in document order; a repeated key keeps the last value seen.
    """
    def _stripped_text(node, tag):
        # Text of the first matching child, without surrounding whitespace.
        return node.find(tag).text.strip()

    query_nodes = ET.parse(file_path).getroot().findall('QUERY')
    return dict(
        (_stripped_text(node, first_field), _stripped_text(node, second_field))
        for node in query_nodes
    )

def parse_esperados(file_path):
    """Count, per query, how many non-zero-score entries each document has.

    Args:
        file_path: XML source handed to ``ET.parse``. Each ``QUERY`` element
            is read for its ``QueryNumber`` child and for the ``Item``
            children of its ``Records`` child; every ``Item`` carries a
            ``score`` attribute and the document number as text.

    Returns:
        dict mapping each (stripped) query number to a dict of
        ``{document key: vote count}``. The document key is the document
        number zero-padded to five digits followed by one space. A document
        that only appears with score 0 is kept with a count of 0. A query
        without a ``Records`` element maps to an empty dict.

    Raises:
        ValueError, TypeError: if a score or document number is not an integer.
    """
    root = ET.parse(file_path).getroot()
    query_dict = {}

    for query in root.findall('QUERY'):
        # Stripped so the keys line up with the ones parse_queries produces.
        query_number = query.find('QueryNumber').text.strip()
        records = query.find('Records')
        items = records.findall('Item') if records is not None else []

        doc_list = {}
        for item in items:
            # NOTE(review): the score is read as one integer, so any non-zero
            # value counts as a single vote -- confirm that is the intended
            # reading of the attribute.
            vote = 1 if int(item.get('score')) != 0 else 0
            padded_docnumber = f"{int(item.text):05d} "
            # Look up the padded key (the one actually stored): testing the
            # raw document number never matched, so repeated items used to
            # reset the count instead of adding to it.
            doc_list[padded_docnumber] = doc_list.get(padded_docnumber, 0) + vote
        query_dict[query_number] = doc_list

    return query_dict


In [39]:
# The same input file is parsed twice: once for the query texts and once for
# the per-query document vote counts.
queries = parse_queries(
    file_path=query_input_file,
    first_field='QueryNumber',
    second_field='QueryText',
)
esperados = parse_esperados(file_path=query_input_file)

In [8]:
def format_text(text):
    """Return ``text`` upper-cased, accent-free, punctuation-free and single-spaced.

    Steps, in order: upper-case; NFD-decompose and drop combining marks
    (category ``Mn``); delete every character that is neither a word character
    nor whitespace (so underscores survive); collapse each whitespace run into
    one space; trim both ends.
    """
    decomposed = unicodedata.normalize('NFD', text.upper())
    # Dropping the combining marks leaves the bare base letters behind.
    without_marks = ''.join(
        filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed)
    )
    without_punctuation = re.sub(r'[^\w\s]', '', without_marks)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

In [9]:
# Normalise every query text; the query numbers are kept unchanged as keys.
formatted_queries = dict(
    (query_number, format_text(query_text))
    for query_number, query_text in queries.items()
)

In [10]:
def write_queries_to_csv(queries, output_file, delimiter=','):
    """Write the queries to a CSV file with a ``QueryNumber,QueryText`` header.

    Args:
        queries: Mapping of query number to query text; one row is written per
            entry, in the mapping's iteration order.
        output_file: Path of the file to create (an existing file is
            overwritten).
        delimiter: Field separator. Defaults to ``','`` so existing callers
            get the same layout as before.
    """
    # Explicit UTF-8 keeps the output identical across platforms; newline=''
    # lets the csv module control the line endings itself.
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=delimiter)
        writer.writerow(["QueryNumber", "QueryText"])
        writer.writerows(queries.items())

# Persist the normalised queries to the configured output path.
write_queries_to_csv(queries=formatted_queries, output_file=query_output_file)
            

In [41]:
# One ';'-separated row per (query, document) pair, written without a header:
# query number; padded document number; vote count.
with open(esperados_output_file, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=';')
    csv_writer.writerows(
        [query_number, docnumber, docvote]
        for query_number, records in esperados.items()
        for docnumber, docvote in records.items()
    )