In [1]:
import configparser
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import xml.etree.ElementTree as ET
import unicodedata
import re
import csv
from tqdm import tqdm

# Download every NLTK resource this notebook relies on, not just the tokenizer
# model: 'punkt' backs word_tokenize, 'stopwords' backs
# nltk.corpus.stopwords.words('english') and 'wordnet' backs WordNetLemmatizer.
# Without the last two a fresh environment fails with LookupError further down.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rayss\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Lendo arquivo de configuração

In [2]:
# Read GLI.cfg: LEIA is a comma-separated list of input XML paths,
# ESCREVA is the path where the inverted-index CSV is written.
config = configparser.ConfigParser()
config.read('GLI.cfg')

xml_paths = list(map(str.strip, config.get('DEFAULT', 'LEIA').split(',')))
li_path = config.get('DEFAULT', 'ESCREVA')

Abrindo os arquivos XML listados no arquivo de configuração

In [3]:
# Open every input file named in the configuration. If any open fails, close
# the handles opened so far, report it and re-raise, so that later cells never
# run with a missing or partially filled `fs`.
fs = []
try:
    for path in xml_paths:
        fs.append(open(path, "r"))
except OSError:
    for opened_file in fs:
        opened_file.close()
    print("Something went wrong when opening one or more files.")
    raise


Obtendo o elemento raiz de cada arquivo e, em seguida, os dados dos campos RECORDNUM e ABSTRACT/EXTRACT

In [4]:
# Parse every opened XML file. The handles are closed as soon as parsing ends
# (even on a parse error) because the trees already hold all the data.
try:
    doc_trees = [ET.parse(f) for f in fs]
finally:
    for f in fs:
        f.close()
doc_roots = [doc_tree.getroot() for doc_tree in doc_trees]

def get_doc_data(doc_roots, placeholder="None"):
    """Collect the text of every RECORD found under the given XML roots.

    Args:
        doc_roots: iterable of XML root elements whose direct children include
            RECORD elements.
        placeholder: text stored for a record that has neither an ABSTRACT nor
            an EXTRACT element, or whose chosen element has no text. Defaults
            to the literal string "None". Note that this default is ordinary
            text for any later tokenization step; pass "" to store nothing.

    Returns:
        dict mapping each record's RECORDNUM text (kept exactly as it appears
        in the XML) to its ABSTRACT text, or to its EXTRACT text when the
        record has no ABSTRACT element. A RECORDNUM seen more than once keeps
        the last text read.

    Raises:
        AttributeError: if a RECORD has no RECORDNUM child.
    """
    doc_data = {}
    for doc_root in doc_roots:
        for rec_element in doc_root.findall('RECORD'):
            rec_num = rec_element.find('RECORDNUM').text
            text_element = rec_element.find('ABSTRACT')
            if text_element is None:
                text_element = rec_element.find('EXTRACT')
            content = text_element.text if text_element is not None else None
            # An empty element (e.g. <ABSTRACT/>) has text None; store the
            # placeholder so downstream string processing never receives None.
            doc_data[rec_num] = placeholder if content is None else content
    return doc_data

Colocando todos os dados de todos os arquivos num dicionário, onde a chave é RECORDNUM e o valor é o conteúdo ABSTRACT/EXTRACT

In [5]:
# Merge the records of all input files into one {RECORDNUM: text} mapping
# and keep the number of distinct records.
all_files_data = get_doc_data(doc_roots)
no_of_docs = len(all_files_data)

### Pré-processando os conteúdos dos arquivos no dicionário

In [6]:
# Tokens shorter than this many characters are discarded by tokenize().
WORD_MIN_LENGTH = 2
# Lower-cased English stop words, held in a frozenset so the per-token
# membership test in tokenize() is a hash lookup instead of a list scan.
STOP_WORDS_ENG = frozenset(stop_word.lower() for stop_word in nltk.corpus.stopwords.words('english'))

def strip_accents(text):
    """Return `text` reduced to unaccented ASCII letters and spaces.

    The text is NFKD-normalized, combining marks are dropped, and every
    remaining character outside a-z / A-Z is replaced by a single space.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = []
    for char in decomposed:
        if unicodedata.combining(char):
            continue
        base_chars.append(char)
    return re.sub('[^a-zA-Z]', ' ', u"".join(base_chars))

def tokenize(text):
    """Turn raw text into a list of lower-case tokens.

    Accents and non-letter characters are removed via strip_accents, the
    result is split with NLTK's word_tokenize, and tokens that are English
    stop words or shorter than WORD_MIN_LENGTH are dropped.
    """
    cleaned = strip_accents(text).replace('\n', ' ')
    kept = []
    for token in word_tokenize(cleaned):
        lowered = token.lower()
        if len(lowered) >= WORD_MIN_LENGTH and lowered not in STOP_WORDS_ENG:
            kept.append(lowered)
    return kept

def lemmatize(text):
    """Lemmatize each token in `text` with NLTK's WordNetLemmatizer.

    Args:
        text: iterable of word tokens.

    Returns:
        list with one lemma (str) per input token, in the same order. Each
        token is lemmatized with the lemmatizer's default part of speech.
    """
    # One lemmatizer for the whole call instead of a new one per token.
    lemmatizer = WordNetLemmatizer()
    return [str(lemmatizer.lemmatize(word)) for word in text]

def preprocess(data):
    """Tokenize, lemmatize and upper-case every document.

    `data` maps a record number to its raw text; the result maps the same
    record numbers to the list of processed tokens.
    """
    processed = {}
    for rec_num, raw_text in data.items():
        lemmas = lemmatize(tokenize(raw_text))
        processed[rec_num] = list(map(str.upper, lemmas))
    return processed

def get_unique_words(preprocessed_data):
    """Return the distinct upper-cased words found across all documents.

    Args:
        preprocessed_data: dict mapping a record number to that document's
            list of tokens.

    Returns:
        list of upper-cased words without duplicates, ordered by first
        occurrence while walking the documents in dict order.
    """
    # Upper-case BEFORE de-duplicating so tokens that differ only by case
    # collapse into one entry; a dict is used as an insertion-ordered set.
    unique_words = {}
    for words in preprocessed_data.values():
        for word in words:
            unique_words.setdefault(word.upper(), None)
    return list(unique_words)

Textos pré-processados e lista de palavras únicas entre todos os arquivos

In [7]:
# Run the full preprocessing over every record, then collect the distinct
# terms that occur anywhere in the corpus.
all_files_data_preprocessed = preprocess(all_files_data)
unique_words_list = get_unique_words(all_files_data_preprocessed)


Calculando o número de termos em cada documento

In [8]:
n_words = {}
for rec_num, words in all_files_data_preprocessed.items():
    n_words[rec_num] = len(words)

n_words
%store n_words

Stored 'n_words' (dict)


Salvando a base de dados pré-processada para posterior uso

In [9]:
# Persist the preprocessed corpus: one CSV row per record, holding the record
# number and the Python-list representation of its tokens.
with open('./base_preprocessada.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerows([key, value] for key, value in all_files_data_preprocessed.items())

### GLI

In [10]:
def gli(words, preprocessed_data):
    """Build the inverted index (term -> postings) for the corpus.

    Args:
        words: sequence of terms to index.
        preprocessed_data: dict mapping a record number to that document's
            list of tokens.

    Returns:
        dict mapping each term that occurs in at least one document to a list
        of record numbers, where a record number is repeated once for every
        occurrence of the term in that document. Documents are visited in the
        iteration order of `preprocessed_data`; terms that never occur are
        absent from the result.
    """
    inverted_index = {}
    for rec_num, text in preprocessed_data.items():
        # Count the document's tokens once so each term lookup is a dict
        # access rather than a fresh scan (and re-count) of the token list.
        occurrences = {}
        for token in text:
            occurrences[token] = occurrences.get(token, 0) + 1
        for word in words:
            count = occurrences.get(word)
            if count:
                inverted_index.setdefault(word, []).extend([rec_num] * count)
    return inverted_index

In [11]:
li = gli(unique_words_list, all_files_data_preprocessed)
li
%store li

Stored 'li' (dict)


Escrevendo arquivo .csv para a Lista Invertida

In [12]:
# Write the inverted index to the configured path: one ';'-separated row per
# term, holding the term and the Python-list representation of its postings.
with open(li_path, 'w', newline='') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerows([key, value] for key, value in li.items())