In [127]:
import configparser
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import xml.etree.ElementTree as ET
import unicodedata
import re
from tqdm import tqdm
import numpy as np

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rayss\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Reading config file

In [128]:
config = configparser.ConfigParser()
config.read('GLI.cfg')

xml_paths = [path.strip() for path in config.get('DEFAULT', 'LEIA').split(',')]
li_path = config.get('DEFAULT', 'ESCREVA')

Opening files in paths recovered from config file

In [129]:
try:
    fs = [open(path, "r") for path in xml_paths]
except:
  print("Something went wrong when opening one or more files.")


In [130]:
doc_trees = [ET.parse(f) for f in fs]
doc_roots = [doc_tree.getroot() for doc_tree in doc_trees]

def get_doc_data(doc_roots):
    doc_data = {}
    for doc_root in doc_roots:
        for rec_element in doc_root.findall('RECORD'):
            rec_num = rec_element.find('RECORDNUM').text
            abstract_element = rec_element.find('ABSTRACT')
            if abstract_element is not None:
                abstract_content = abstract_element.text
            else:
                abstract_element = rec_element.find('EXTRACT')
                abstract_content = abstract_element.text if abstract_element is not None else "None"
            doc_data[rec_num] = abstract_content
    return doc_data


Dictionary in which keys are doc's Record Numbers and values are doc's Abstract or Extract content

In [131]:
all_files_data = get_doc_data(doc_roots)

### Pre-processing texts

In [132]:
WORD_MIN_LENGTH = 2
STOP_WORDS_ENG = [stop_word.lower() for stop_word in nltk.corpus.stopwords.words('english')]

def strip_accents(text):
    nfkd = unicodedata.normalize('NFKD', text)
    stripped_text = u"".join([c for c in nfkd if not unicodedata.combining(c)])
    return re.sub('[^a-zA-Z]', ' ', stripped_text)

def tokenize(text):
    text = strip_accents(text)
    text = re.sub(re.compile('\n'),' ',text)
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    words = [word for word in words if word not in STOP_WORDS_ENG and len(word) >= WORD_MIN_LENGTH]
    return words

def lemmatize(text):
    lemmatized_text = []
    for (word) in text:
        lemmatizing = str(WordNetLemmatizer().lemmatize(word))
        lemmatized_text.append(lemmatizing)
    return lemmatized_text

def get_unique_words(preprocessed_data):
    all_words = []
    for rec_num, words in preprocessed_data.items():
        for word in words:
            all_words.append(word)
    
    unique_words = nltk.FreqDist(all_words).keys()
    unique_words = [word.upper() for word in unique_words]
    return unique_words

In [133]:
def preprocess(data):
    preprocessed_data = {}
    for rec_num, text in data.items():
        preprocessed_data[rec_num] = tokenize(text)
        preprocessed_data[rec_num] = lemmatize(preprocessed_data[rec_num])
        preprocessed_data[rec_num] = [word.upper() for word in preprocessed_data[rec_num]]
    return preprocessed_data



In [134]:
all_files_data_preprocessed = preprocess(all_files_data)
unique_words_list = get_unique_words(all_files_data_preprocessed)

###### Lista de palavras encontradas nos textos

### Building Inverted Index

In [135]:
def gli(words, preprocessed_data):
    inverted_index = {}
    for rec_num, text in preprocessed_data.items():
        for word in words:
            if word in text:
                if word not in inverted_index:
                    inverted_index[word] = []
                inverted_index[word].extend([rec_num] * text.count(word))
    return inverted_index


In [143]:
teste = gli(unique_words_list, all_files_data_preprocessed)
teste

{'SIGNIFICANCE': ['00001 ',
  '00074 ',
  '00078 ',
  '00121 ',
  '00147 ',
  '00157 ',
  '00179',
  '00185',
  '00195',
  '00205',
  '00223',
  '00258',
  '00319',
  '00355',
  '00402 ',
  '00411 ',
  '00412 ',
  '00485 ',
  '00526 ',
  '00555 ',
  '00593 ',
  '00631 ',
  '00809 ',
  '00858 ',
  '00923 ',
  '01151 '],
 'PSEUDOMONAS': ['00001 ',
  '00001 ',
  '00001 ',
  '00007 ',
  '00008 ',
  '00018 ',
  '00018 ',
  '00061 ',
  '00061 ',
  '00062 ',
  '00062 ',
  '00062 ',
  '00062 ',
  '00062 ',
  '00079 ',
  '00080 ',
  '00081 ',
  '00081 ',
  '00081 ',
  '00081 ',
  '00082 ',
  '00082 ',
  '00082 ',
  '00123 ',
  '00123 ',
  '00123 ',
  '00135 ',
  '00148 ',
  '00152 ',
  '00159 ',
  '00160 ',
  '00161 ',
  '00176',
  '00177',
  '00177',
  '00178',
  '00188',
  '00188',
  '00188',
  '00188',
  '00200',
  '00200',
  '00250',
  '00260',
  '00265',
  '00271',
  '00282',
  '00282',
  '00384 ',
  '00384 ',
  '00394 ',
  '00427 ',
  '00427 ',
  '00447 ',
  '00458 ',
  '00459 ',
  '00460

### Creating Term-Document matrix

In [137]:
def term_document_matrix(inverted_index):
    tdm = {}
    for word in inverted_index.keys():
        tdm[word] = len([rec_num for rec_num in inverted_index[word]])
    return tdm

In [138]:
teste_tdm = term_document_matrix(teste)

teste_tdm

{'SIGNIFICANCE': 26,
 'PSEUDOMONAS': 124,
 'AERUGINOSA': 194,
 'INFECTION': 225,
 'RESPIRATORY': 149,
 'TRACT': 73,
 'CYSTIC': 2042,
 'FIBROSIS': 2058,
 'PATIENT': 2168,
 'STUDIED': 153,
 'MEAN': 218,
 'IMMUNOELECTROPHORETICAL': 1,
 'ANALYSIS': 96,
 'SERUM': 778,
 'NUMBER': 150,
 'PRECIPITIN': 86,
 'CONCENTRATION': 316,
 'PROTEIN': 290,
 'ADDITION': 65,
 'CLINICAL': 274,
 'RADIOGRAPHICAL': 1,
 'STATUS': 42,
 'LUNG': 249,
 'EVALUATED': 43,
 'USING': 124,
 'SCORING': 5,
 'SYSTEM': 110,
 'DEMONSTRATED': 81,
 'MAXIMUM': 27,
 'ONE': 330,
 'SIGNIFICANTLY': 177,
 'CHANGED': 7,
 'COMPARED': 192,
 'MATCHED': 40,
 'CONTROL': 568,
 'PERSON': 33,
 'NOTABLY': 5,
 'IGG': 86,
 'IGA': 37,
 'ELEVATED': 90,
 'ACUTE': 62,
 'PHASE': 45,
 'LATTER': 25,
 'SUGGESTING': 24,
 'ACTIVE': 42,
 'TISSUE': 83,
 'DAMAGE': 26,
 'HAPTOGLOBIN': 5,
 'CORRELATED': 52,
 'MANY': 94,
 'ACCOMPANIED': 14,
 'RESULT': 332,
 'INDICATE': 54,
 'PROTECTIVE': 7,
 'VALUE': 248,
 'SALIVARY': 75,
 'AMYLASE': 94,
 'LEVEL': 337,
 'DETERMI

#### TF-IDF weights

In [139]:
len(all_files_data_preprocessed)

1239

In [140]:
from typing import Counter

def tf_idf(preprocessed_data, tdm, type_of_tf='relative'):
    tf_idf = {}
    N = len(preprocessed_data)
    for rec_num, tokens in tqdm(preprocessed_data.items()):
        counter = Counter(tokens)
        words_count = len(tokens)
    
        for token in np.unique(tokens):
            # calculate tf
            if type_of_tf == 'relative':
                tf = counter[token]
                tf = tf / words_count
            else:
                tf = counter[token]
                tf = 1 + np.log(tf)
            
            # calculate idf
            if token in tdm:
                df = tdm[token]
            else:
                df = 0
            idf = 1 + np.log((N)/(df))
            
            # Calculate tf-idf        
            tf_idf[rec_num, token] = tf*idf
    return tf_idf

In [141]:
tfidf = tf_idf(all_files_data_preprocessed, teste_tdm)
tfidf

100%|██████████| 1239/1239 [00:00<00:00, 5829.70it/s]


{('00001 ', 'ACCOMPANIED'): 0.05895701668832133,
 ('00001 ', 'ACTIVE'): 0.04714398132629865,
 ('00001 ', 'ACUTE'): 0.08591237627062476,
 ('00001 ', 'ADDITION'): 0.04244809259928501,
 ('00001 ', 'AERUGINOSA'): 0.09207102330857464,
 ('00001 ', 'ANALYSIS'): 0.03825496441033663,
 ('00001 ', 'CHANGED'): 0.13282042435642644,
 ('00001 ', 'CLINICAL'): 0.026977761024097553,
 ('00001 ', 'COMPARED'): 0.03080176892044474,
 ('00001 ', 'CONCENTRATION'): 0.07633282800136226,
 ('00001 ', 'CONTROL'): 0.019139123257075175,
 ('00001 ', 'CORRELATED'): 0.04484748562416899,
 ('00001 ', 'CYSTIC'): 0.005380375084994965,
 ('00001 ', 'DAMAGE'): 0.10460136222812175,
 ('00001 ', 'DEMONSTRATED'): 0.040081835773728,
 ('00001 ', 'ELEVATED'): 0.038948927003213736,
 ('00001 ', 'EVALUATED'): 0.04689096522511376,
 ('00001 ', 'FIBROSIS'): 0.005296451239087608,
 ('00001 ', 'HAPTOGLOBIN'): 0.070028193217151,
 ('00001 ', 'IGA'): 0.048506902892310945,
 ('00001 ', 'IGG'): 0.03943776973522188,
 ('00001 ', 'IMMUNOELECTROPHORETI