In [1]:
import configparser
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import xml.etree.ElementTree as ET
import unicodedata
import re
from tqdm import tqdm
import numpy as np

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rayss\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
cfg_PC = configparser.ConfigParser()

### Reading file

In [3]:
f = open("data/cf74.xml", "r")        
doc_tree = ET.parse(f)
cf74_root = doc_tree.getroot()

def get_doc_data(doc_root):
    doc_data = {}
    for rec_element in doc_root.findall('RECORD'):
        rec_num = rec_element.find('RECORDNUM').text
        abstract_element = rec_element.find('ABSTRACT')
        if abstract_element is not None:
            abstract_content = abstract_element.text
        else:
            abstract_element = rec_element.find('EXTRACT')
            abstract_content = abstract_element.text if abstract_element is not None else "None"
        doc_data[rec_num] = abstract_content
    return doc_data


In [4]:
cf74_data = get_doc_data(cf74_root)
cf74_data

{'00001 ': 'The significance of Pseudomonas aeruginosa infection in the\nrespiratory tract of 9 cystic fibrosis patients have been studied\nby means of immunoelectrophoretical analysis of patients\' sera for\nthe number of precipitins against Pseudomonas aeruginosa and the\nconcentrations of 16 serum proteins.  In addition, the clinical and\nradiographical status of the lungs have been evaluated using 2\nscoring systems.  Precipitins against Pseudomonas aeruginosa were\ndemonstrated in all sera, the maximum number in one serum was 22.\nThe concentrations of 12 of the serum proteins were significantly\nchanged compared with matched control persons.  Notably IgG and IgA\nwere elevated and the "acute phase proteins" were changed, the\nlatter suggesting active tissue damage.  The concentrations of 3 of\nthe acute phase proteins, notably haptoglobin, were correlated to\nthe number of precipitins suggesting that the respiratory tract\ninfection in patients with many precipitins is accompanie

### Pre-processing texts

In [5]:
WORD_MIN_LENGTH = 2
STOP_WORDS_ENG = [stop_word.lower() for stop_word in nltk.corpus.stopwords.words('english')]

def strip_accents(text):
    nfkd = unicodedata.normalize('NFKD', text)
    stripped_text = u"".join([c for c in nfkd if not unicodedata.combining(c)])
    return re.sub('[^a-zA-Z]', ' ', stripped_text)

def tokenize(text):
    text = strip_accents(text)
    text = re.sub(re.compile('\n'),' ',text)
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    words = [word for word in words if word not in STOP_WORDS_ENG and len(word) >= WORD_MIN_LENGTH]
    return words

def lemmatize(text):
    lemmatized_text = []
    for (word) in text:
        lemmatizing = str(WordNetLemmatizer().lemmatize(word))
        lemmatized_text.append(lemmatizing)
    return lemmatized_text

def get_unique_words(preprocessed_data):
    all_words = []
    for rec_num, words in preprocessed_data.items():
        for word in words:
            all_words.append(word)
    
    unique_words = nltk.FreqDist(all_words).keys()
    unique_words = [word.upper() for word in unique_words]
    return unique_words

In [6]:
def preprocess(data):
    preprocessed_data = {}
    for rec_num, text in data.items():
        preprocessed_data[rec_num] = tokenize(text)
        preprocessed_data[rec_num] = lemmatize(preprocessed_data[rec_num])
        preprocessed_data[rec_num] = [word.upper() for word in preprocessed_data[rec_num]]
    return preprocessed_data

pp_data = preprocess(cf74_data)

###### Lista de palavras encontradas nos textos

In [7]:
unique_words_list = get_unique_words(pp_data)

In [8]:
pp_data

{'00001 ': ['SIGNIFICANCE',
  'PSEUDOMONAS',
  'AERUGINOSA',
  'INFECTION',
  'RESPIRATORY',
  'TRACT',
  'CYSTIC',
  'FIBROSIS',
  'PATIENT',
  'STUDIED',
  'MEAN',
  'IMMUNOELECTROPHORETICAL',
  'ANALYSIS',
  'PATIENT',
  'SERUM',
  'NUMBER',
  'PRECIPITIN',
  'PSEUDOMONAS',
  'AERUGINOSA',
  'CONCENTRATION',
  'SERUM',
  'PROTEIN',
  'ADDITION',
  'CLINICAL',
  'RADIOGRAPHICAL',
  'STATUS',
  'LUNG',
  'EVALUATED',
  'USING',
  'SCORING',
  'SYSTEM',
  'PRECIPITIN',
  'PSEUDOMONAS',
  'AERUGINOSA',
  'DEMONSTRATED',
  'SERUM',
  'MAXIMUM',
  'NUMBER',
  'ONE',
  'SERUM',
  'CONCENTRATION',
  'SERUM',
  'PROTEIN',
  'SIGNIFICANTLY',
  'CHANGED',
  'COMPARED',
  'MATCHED',
  'CONTROL',
  'PERSON',
  'NOTABLY',
  'IGG',
  'IGA',
  'ELEVATED',
  'ACUTE',
  'PHASE',
  'PROTEIN',
  'CHANGED',
  'LATTER',
  'SUGGESTING',
  'ACTIVE',
  'TISSUE',
  'DAMAGE',
  'CONCENTRATION',
  'ACUTE',
  'PHASE',
  'PROTEIN',
  'NOTABLY',
  'HAPTOGLOBIN',
  'CORRELATED',
  'NUMBER',
  'PRECIPITIN',
  'SUGG

### Building Inverted Index

In [9]:
def gli(words, preprocessed_data):
    inverted_index = {}
    for rec_num, text in preprocessed_data.items():
        for word in words:
            if word in text:
                if word not in inverted_index:
                    inverted_index[word] = []
                inverted_index[word].extend([rec_num] * text.count(word))
    return inverted_index


In [10]:
unique_words_list

['SIGNIFICANCE',
 'PSEUDOMONAS',
 'AERUGINOSA',
 'INFECTION',
 'RESPIRATORY',
 'TRACT',
 'CYSTIC',
 'FIBROSIS',
 'PATIENT',
 'STUDIED',
 'MEAN',
 'IMMUNOELECTROPHORETICAL',
 'ANALYSIS',
 'SERUM',
 'NUMBER',
 'PRECIPITIN',
 'CONCENTRATION',
 'PROTEIN',
 'ADDITION',
 'CLINICAL',
 'RADIOGRAPHICAL',
 'STATUS',
 'LUNG',
 'EVALUATED',
 'USING',
 'SCORING',
 'SYSTEM',
 'DEMONSTRATED',
 'MAXIMUM',
 'ONE',
 'SIGNIFICANTLY',
 'CHANGED',
 'COMPARED',
 'MATCHED',
 'CONTROL',
 'PERSON',
 'NOTABLY',
 'IGG',
 'IGA',
 'ELEVATED',
 'ACUTE',
 'PHASE',
 'LATTER',
 'SUGGESTING',
 'ACTIVE',
 'TISSUE',
 'DAMAGE',
 'HAPTOGLOBIN',
 'CORRELATED',
 'MANY',
 'ACCOMPANIED',
 'RESULT',
 'INDICATE',
 'PROTECTIVE',
 'VALUE',
 'SALIVARY',
 'AMYLASE',
 'LEVEL',
 'DETERMINED',
 'NORMAL',
 'SUBJECT',
 'BIRTH',
 'ADULT',
 'LIFE',
 'CHILD',
 'CONDITION',
 'SOMETIMES',
 'ASSOCIATED',
 'LOW',
 'PANCREATIC',
 'MALNUTRITION',
 'COELIAC',
 'DISEASE',
 'MIXED',
 'SALIVA',
 'COLLECTED',
 'CAREFULLY',
 'STANDARDISED',
 'MEASURED'

In [11]:
teste = gli(unique_words_list, pp_data)
print(teste)

{'SIGNIFICANCE': ['00001 ', '00074 ', '00078 ', '00121 ', '00147 ', '00157 '], 'PSEUDOMONAS': ['00001 ', '00001 ', '00001 ', '00007 ', '00008 ', '00018 ', '00018 ', '00061 ', '00061 ', '00062 ', '00062 ', '00062 ', '00062 ', '00062 ', '00079 ', '00080 ', '00081 ', '00081 ', '00081 ', '00081 ', '00082 ', '00082 ', '00082 ', '00123 ', '00123 ', '00123 ', '00135 ', '00148 ', '00152 ', '00159 ', '00160 ', '00161 '], 'AERUGINOSA': ['00001 ', '00001 ', '00001 ', '00006 ', '00006 ', '00006 ', '00006 ', '00006 ', '00006 ', '00007 ', '00007 ', '00007 ', '00007 ', '00007 ', '00007 ', '00008 ', '00008 ', '00008 ', '00008 ', '00008 ', '00018 ', '00018 ', '00062 ', '00079 ', '00080 ', '00080 ', '00081 ', '00110 ', '00123 ', '00123 ', '00135 ', '00148 ', '00159 ', '00159 ', '00159 ', '00160 ', '00161 ', '00161 ', '00161 '], 'INFECTION': ['00001 ', '00001 ', '00001 ', '00006 ', '00006 ', '00006 ', '00016 ', '00018 ', '00048 ', '00048 ', '00057 ', '00058 ', '00058 ', '00061 ', '00062 ', '00062 ', '000

### Creating Term-Document matrix

In [12]:
def term_document_matrix(inverted_index):
    tdm = {}
    for word in inverted_index.keys():
        tdm[word] = len([rec_num for rec_num in inverted_index[word]])
    return tdm

In [13]:
teste_tdm = term_document_matrix(teste)

teste_tdm

{'SIGNIFICANCE': 6,
 'PSEUDOMONAS': 32,
 'AERUGINOSA': 39,
 'INFECTION': 40,
 'RESPIRATORY': 33,
 'TRACT': 20,
 'CYSTIC': 255,
 'FIBROSIS': 256,
 'PATIENT': 274,
 'STUDIED': 22,
 'MEAN': 35,
 'IMMUNOELECTROPHORETICAL': 1,
 'ANALYSIS': 13,
 'SERUM': 103,
 'NUMBER': 37,
 'PRECIPITIN': 16,
 'CONCENTRATION': 46,
 'PROTEIN': 36,
 'ADDITION': 8,
 'CLINICAL': 36,
 'RADIOGRAPHICAL': 1,
 'STATUS': 6,
 'LUNG': 37,
 'EVALUATED': 9,
 'USING': 20,
 'SCORING': 2,
 'SYSTEM': 10,
 'DEMONSTRATED': 9,
 'MAXIMUM': 8,
 'ONE': 39,
 'SIGNIFICANTLY': 24,
 'CHANGED': 2,
 'COMPARED': 9,
 'MATCHED': 5,
 'CONTROL': 68,
 'PERSON': 4,
 'NOTABLY': 2,
 'IGG': 10,
 'IGA': 8,
 'ELEVATED': 16,
 'ACUTE': 6,
 'PHASE': 2,
 'LATTER': 5,
 'SUGGESTING': 5,
 'ACTIVE': 4,
 'TISSUE': 12,
 'DAMAGE': 3,
 'HAPTOGLOBIN': 1,
 'CORRELATED': 13,
 'MANY': 18,
 'ACCOMPANIED': 4,
 'RESULT': 35,
 'INDICATE': 5,
 'PROTECTIVE': 2,
 'VALUE': 29,
 'SALIVARY': 15,
 'AMYLASE': 9,
 'LEVEL': 49,
 'DETERMINED': 9,
 'NORMAL': 117,
 'SUBJECT': 31,
 

#### TF-IDF weights

In [14]:
from typing import Counter

def tf_idf(preprocessed_data, tdm, type_of_tf='relative'):
    tf_idf = {}
    N = len(preprocessed_data)
    for rec_num, tokens in tqdm(preprocessed_data.items()):
        counter = Counter(tokens)
        words_count = len(tokens)
    
        for token in np.unique(tokens):
            # calculate tf
            if type_of_tf == 'relative':
                tf = counter[token]
                tf = tf / words_count
            else:
                tf = counter[token]
                tf = 1 + np.log(tf)
            
            # calculate idf
            if token in tdm:
                df = tdm[token]
            else:
                df = 0
            idf = 1 + np.log((N)/(df))
            
            # Calculate tf-idf        
            tf_idf[rec_num, token] = tf*idf
    return tf_idf

In [15]:
tfidf = tf_idf(pp_data, teste_tdm)
tfidf

100%|██████████| 167/167 [00:00<00:00, 7575.70it/s]


{('00001 ', 'ACCOMPANIED'): 0.0508784887236222,
 ('00001 ', 'ACTIVE'): 0.0508784887236222,
 ('00001 ', 'ACUTE'): 0.0930372977029828,
 ('00001 ', 'ADDITION'): 0.04342529323373032,
 ('00001 ', 'AERUGINOSA'): 0.07917523117055189,
 ('00001 ', 'ANALYSIS'): 0.038204779085539986,
 ('00001 ', 'CHANGED'): 0.11666336842702818,
 ('00001 ', 'CLINICAL'): 0.027252417999576832,
 ('00001 ', 'COMPARED'): 0.042158808979360606,
 ('00001 ', 'CONCENTRATION'): 0.07385007793315033,
 ('00001 ', 'CONTROL'): 0.020413829110114502,
 ('00001 ', 'CORRELATED'): 0.038204779085539986,
 ('00001 ', 'CYSTIC'): 0.006201400723207839,
 ('00001 ', 'DAMAGE'): 0.10794368868276658,
 ('00001 ', 'DEMONSTRATED'): 0.042158808979360606,
 ('00001 ', 'ELEVATED'): 0.03597209774383843,
 ('00001 ', 'EVALUATED'): 0.042158808979360606,
 ('00001 ', 'FIBROSIS'): 0.00615931578427089,
 ('00001 ', 'HAPTOGLOBIN'): 0.06578487970340598,
 ('00001 ', 'IGA'): 0.04342529323373032,
 ('00001 ', 'IGG'): 0.04102590020884634,
 ('00001 ', 'IMMUNOELECTROPHOR