In [31]:
import configparser
import re
from xml import dom
from xml.dom import minidom

import numpy as np
import pandas as pd
import unidecode

In [112]:
# Parse the pipeline settings from conf.ini; the trailing ';' keeps the
# notebook from echoing the list that read() returns.
config = configparser.ConfigParser()
config.read(filenames='conf.ini');

In [198]:
def get_xml_field(xml_dom, field_name):
    """Collect the leading text of every ``field_name`` element in a DOM tree.

    Parameters
    ----------
    xml_dom : DOM node exposing ``getElementsByTagName`` (document or element)
    field_name : str
        Tag name to look up.

    Returns
    -------
    list of str
        One entry per matching element, in document order. An element with
        no children (``<Tag></Tag>`` or ``<Tag/>``) contributes ``''`` so the
        result stays aligned with lists built from sibling tags.
    """
    values = []
    for element in xml_dom.getElementsByTagName(field_name):
        first = element.firstChild
        # firstChild is None for an empty element; reading .data would raise.
        values.append(first.data if first is not None else '')
    return values

def text_handler(text):
    """Normalise free text into upper-case ASCII words separated by one space.

    Steps, in order:
    1. transliterate accented / non-ASCII characters with ``unidecode``;
    2. replace every run of characters other than letters, digits and spaces
       with a single space;
    3. collapse repeated spaces and drop leading/trailing ones;
    4. upper-case the result.

    Transliteration has to happen first: the character filter only keeps
    ASCII, so running it before ``unidecode`` would turn accented letters
    into spaces instead of converting them.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The normalised text; ``''`` when nothing alphanumeric remains
        (including for empty input).
    """
    # convert accented characters before anything non-ASCII is filtered out
    res = unidecode.unidecode(text)

    # only keep alphanumeric and spaces
    res = re.sub(r'[^A-Za-z0-9 ]+', ' ', res)

    # remove extra spaces
    res = re.sub(' +', ' ', res)

    # strip() is safe on empty strings, unlike indexing res[0] / res[-1]
    return res.strip(' ').upper()

class ProcessadorConsultas():
    """Query processor: parses the query XML and writes two CSV files.

    Options are read from the ``pc`` section of ``config``:

    * ``leia``      - path of the XML file to parse;
    * ``consultas`` - path of the CSV written by :meth:`gera_consultas`;
    * ``esperados`` - path of the CSV written by :meth:`gera_esperados`.
    """

    def __init__(self, config):
        """Store the configured paths; nothing is read until needed."""
        self.leia = config.get('pc', 'leia')
        self.consultas = config.get('pc', 'consultas')
        self.esperados = config.get('pc', 'esperados')
        # Parsed document; None means the input has not been loaded yet.
        self.dom = None

    def le_entrada(self):
        """Parse ``self.leia`` and cache the fields used by the generators.

        Populates ``query_number``, ``query_text`` and ``results`` (one string
        per matching element), ``records`` (the ``Records`` elements), and
        ``doc_number`` / ``score`` (one list per ``Records`` element holding,
        for each ``Item``, its text and its ``score`` attribute).
        """
        parsed = minidom.parse(self.leia)

        self.query_number = get_xml_field(parsed, 'QueryNumber')
        self.query_text = get_xml_field(parsed, 'QueryText')
        self.results = get_xml_field(parsed, 'Results')

        self.records = parsed.getElementsByTagName('Records')

        self.doc_number = []
        self.score = []

        for record in self.records:
            items = record.getElementsByTagName('Item')
            self.doc_number.append([item.firstChild.data for item in items])
            self.score.append([item.getAttribute('score') for item in items])

        # Publish the document only after every field was extracted, so a
        # failed read is never mistaken for a loaded one.
        self.dom = parsed

    def gera_consultas(self):
        """Write the queries CSV and return it as a DataFrame.

        Columns: ``QueryNumber`` (int) and ``QueryText`` (normalised with
        ``text_handler``), sorted by ``QueryNumber`` and written with ``;``
        as separator and no index column.
        """
        if self.dom is None:
            self.le_entrada()

        consultas = pd.DataFrame({
            'QueryNumber': self.query_number,
            'QueryText': self.query_text
        })

        consultas['QueryNumber'] = consultas['QueryNumber'].astype(int)
        consultas['QueryText'] = consultas['QueryText'].map(text_handler)

        consultas = consultas.sort_values('QueryNumber')

        consultas.to_csv(self.consultas, sep = ";", index = False)
        return consultas

    def gera_esperados(self):
        """Write the expected-results CSV and return it as a DataFrame.

        One row per (query, item) pair with columns ``QueryNumber`` (int),
        ``DocNumber`` (int) and ``DocVotes`` (sum of the digits of the item's
        ``score`` attribute), sorted by query then document and written with
        ``;`` as separator and no index column.
        """
        if self.dom is None:
            self.le_entrada()

        esperados = pd.DataFrame({
            'QueryNumber': self.query_number,
            'DocNumber': self.doc_number,
            'DocVotes': self.score
        }).explode(['DocNumber', 'DocVotes'])

        esperados['QueryNumber'] = esperados['QueryNumber'].astype(int)
        esperados['DocNumber'] = esperados['DocNumber'].astype(int)
        # each character of the score string is a digit; add them up
        esperados['DocVotes'] = esperados['DocVotes'].map(
            lambda digits: sum(int(d) for d in digits)
        )

        esperados = esperados.sort_values(['QueryNumber', 'DocNumber'])
        esperados.to_csv(self.esperados, sep = ";", index = False)
        return esperados

    def run(self):
        """Read the input, then write both CSV files."""
        self.le_entrada()
        self.gera_consultas()
        self.gera_esperados()
class GeradorListaInvertida():
    """Inverted-index generator: maps each word to the records containing it.

    Options are read from the ``gli`` section of ``config``:

    * ``leia``    - comma-separated list of XML files to parse;
    * ``escreva`` - path of the CSV written by :meth:`escreve`.
    """

    # Tags tried, in order, to find the text of a record.
    _TEXT_TAGS = ('ABSTRACT', 'EXTRACT', 'TITLE')

    def __init__(self, config):
        """Store the configured paths; nothing is read until needed."""
        # Trim whitespace around each entry only: removing every space would
        # corrupt paths that legitimately contain one. Blank entries (e.g.
        # from a trailing comma) are dropped.
        raw_paths = config.get('gli', 'leia').split(',')
        self.leia = [path.strip() for path in raw_paths if path.strip()]
        self.escreva = config.get('gli', 'escreva')
        # None means the input files have not been loaded yet.
        self.record_num = None

    def _record_text(self, record):
        """Return the text of the first tag in ``_TEXT_TAGS`` present in ``record``.

        Returns ``''`` when none of the tags is present or when the chosen
        element has no content.
        """
        for tag in self._TEXT_TAGS:
            found = record.getElementsByTagName(tag)
            if len(found) > 0:
                node = found[0].firstChild
                # an empty element has no child node to read .data from
                return node.data if node is not None else ''
        return ''

    def le(self):
        """Parse every configured file and cache record numbers and texts.

        Populates ``record_num`` (text of each record's first ``RECORDNUM``),
        ``abstract`` (raw text chosen by :meth:`_record_text`) and
        ``handled_abstract`` (list of normalised words per record; empty for
        records without text).
        """
        record_nums = []
        abstracts = []

        for file in self.leia:
            file_dom = minidom.parse(file)
            for record in file_dom.getElementsByTagName('RECORD'):
                record_nums.append(
                    record.getElementsByTagName('RECORDNUM')[0].firstChild.data
                )
                abstracts.append(self._record_text(record))

        # Assign only after all files parsed, so a failure does not leave a
        # half-filled state that escreve() would treat as loaded.
        self.record_num = record_nums
        self.abstract = abstracts
        # Blank texts are skipped rather than normalised; split() without an
        # argument never yields empty-string tokens.
        self.handled_abstract = [
            text_handler(text).split() if text.strip() else []
            for text in abstracts
        ]

    def escreve(self):
        """Write the inverted index CSV and return it as a DataFrame.

        Columns: ``Abstract`` (the word) and ``RecordNum`` (sorted list of
        record numbers, one entry per occurrence of the word, so a record
        appears as many times as it contains the word). Written with ``;``
        as separator and no index column.
        """
        if self.record_num is None:
            self.le()

        escreva = pd.DataFrame({
            'RecordNum': self.record_num,
            'Abstract': self.handled_abstract
        }).explode('Abstract')

        escreva['RecordNum'] = escreva['RecordNum'].astype(int)
        escreva = escreva.groupby('Abstract')['RecordNum'].apply(lambda x: sorted(list(x)))
        escreva = pd.DataFrame(escreva).reset_index()
        escreva.to_csv(self.escreva, sep = ';', index = False)
        return escreva

In [178]:
# Read the 'leia' option of the [pc] section into a notebook-level variable.
# NOTE(review): `leia` is not referenced by any later cell visible here; it looks
# like a leftover from interactive exploration - confirm before removing.
leia = config.get('pc', 'leia')

In [179]:
# Query processor: run() reads the input and calls both CSV generators.
pc = ProcessadorConsultas(config)
pc.run()

# Inverted index: escreve() loads the input files on first use and returns the
# resulting DataFrame; as the cell's last expression it is what gets displayed.
gli = GeradorListaInvertida(config)
gli.escreve()

Unnamed: 0,Abstract,RecordNum
0,0,"[47, 47, 47, 61, 61, 62, 62, 62, 62, 70, 70, 7..."
1,00,"[1151, 1151, 1151, 1151, 1151, 1151, 1200]"
2,000,"[9, 39, 69, 139, 175, 214, 214, 307, 318, 400,..."
3,00005,[62]
4,0001,"[275, 306]"
...,...,...
9873,ZONE,"[47, 47, 141, 155, 367]"
9874,ZONES,"[47, 194, 361, 361]"
9875,ZYMOGEN,"[40, 314, 1135]"
9876,ZYMOGRAMS,[150]
