In [29]:
import configparser
import re
import unicodedata
import xml.etree.ElementTree as ET
from collections import Counter

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment: 'punkt' backs word_tokenize, 'stopwords' backs
# nltk.corpus.stopwords.words('english') and 'wordnet' backs WordNetLemmatizer.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' — confirm
# against the installed version.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rayss\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Reading file

In [30]:
# Parse the XML collection inside a context manager so the file handle is
# closed as soon as parsing finishes (the original left it open for the
# lifetime of the kernel).
with open("data/cf74.xml", "r") as f:
    doc_tree = ET.parse(f)
cf74_root = doc_tree.getroot()

def get_doc_data(doc_root):
    """Map each record number to the text of its abstract (or extract).

    For every ``RECORD`` child of ``doc_root`` the text of its ``ABSTRACT``
    element is used.  When that element is missing or empty, the ``EXTRACT``
    element is tried instead.  Records that provide neither receive the
    placeholder string ``"None"``, so every value in the result is a ``str``
    and can be fed to the text pre-processing functions.

    Args:
        doc_root: XML element whose direct children include ``RECORD``
            elements.

    Returns:
        dict: ``RECORDNUM`` text (kept verbatim, including any surrounding
        whitespace) mapped to the selected text.

    Raises:
        AttributeError: if a record has no ``RECORDNUM`` child.
    """
    doc_data = {}
    for rec_element in doc_root.findall('RECORD'):
        rec_num = rec_element.find('RECORDNUM').text
        # Placeholder used when neither source element carries any text.
        abstract_content = "None"
        for tag in ('ABSTRACT', 'EXTRACT'):
            text_element = rec_element.find(tag)
            # An empty element (<ABSTRACT/>) has text None; treat it like a
            # missing element instead of storing None in the result.
            if text_element is not None and text_element.text is not None:
                abstract_content = text_element.text
                break
        doc_data[rec_num] = abstract_content
    return doc_data


In [31]:
# Build the {record number: abstract/extract text} mapping for the collection.
cf74_data = get_doc_data(cf74_root)
# Spot-check one record; record-number keys are stored verbatim, hence the
# trailing space inside the key.
cf74_data['00037 ']

'Neonatal peritonitis remains a dangerous condition.  Factors\ncomplicating diagnosis and management include its low incidence, the\ndiversity of aetiological factors found, and the severity of\nunderlying or associated disease.  28 cases presenting since 1957 at\nSouthmead General Hospital were reviewed.  Thirteen aetiological\nfactors relating to mother or baby were found.  13 infants were\npremature, 10 had severe respiratory problems at birth.  Maternal\nhydramnios was present in 7.  6 babies had had exchange transfusion.\nThe commonest findings were gangrenous volvulus and cystic fibrosis.\nCombined factors were often present.  The usual presentation is with\nbilious vomiting, abdominal distension, and failure to pass normal\nmeconium.  4 cases had distended abdomens at birth.  Plain abdominal\nx-rays are of great diagnostic value.  Pneumoperitoneum is diagnostic\nbut absent from early films.  Treatment is surgical.  Close liaison\nwith the paediatrician who sees the baby first is

### Pre-processing texts

In [32]:
# Tokens with fewer characters than this are dropped by tokenize().
WORD_MIN_LENGTH = 2
# NLTK's English stop-word list, normalised to lower case for comparison
# against lower-cased tokens.
STOP_WORDS_ENG = list(map(str.lower, nltk.corpus.stopwords.words('english')))

def strip_accents(text):
    """Remove diacritics and replace disallowed characters with spaces.

    The text is NFKD-normalised so accented letters split into a base
    character plus combining marks, and the combining marks are discarded.
    Every remaining character that is not an ASCII letter, a digit, a space
    or an apostrophe (this includes newlines and punctuation) is then
    replaced by a single space.

    Args:
        text: the string to clean.

    Returns:
        str: the cleaned string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = []
    for char in decomposed:
        # combining() is non-zero for accent marks split off by NFKD.
        if unicodedata.combining(char):
            continue
        base_chars.append(char)
    without_marks = "".join(base_chars)
    return re.sub('[^a-zA-Z0-9 \\\']', ' ', without_marks)

def tokenize(text):
    """Split raw text into lower-cased, filtered word tokens.

    The text is cleaned with strip_accents(), newlines become spaces, and
    the result is split with NLTK's word_tokenize.  Tokens are lower-cased,
    and those found in STOP_WORDS_ENG or shorter than WORD_MIN_LENGTH
    characters are discarded.

    Args:
        text: the raw string to tokenize.

    Returns:
        list[str]: the surviving tokens in their original order.
    """
    cleaned = strip_accents(text).replace('\n', ' ')
    kept_tokens = []
    for raw_token in word_tokenize(cleaned):
        token = raw_token.lower()
        # Skip tokens that are too short or are stop words.
        if len(token) < WORD_MIN_LENGTH or token in STOP_WORDS_ENG:
            continue
        kept_tokens.append(token)
    return kept_tokens

def lemmatize(text):
    """Lemmatize every token with NLTK's WordNetLemmatizer.

    Each token is lemmatized without an explicit part-of-speech argument,
    so the lemmatizer's default applies.

    Args:
        text: iterable of word tokens.  Despite the parameter name this is
            not a raw string: iterating a string would lemmatize it one
            character at a time.

    Returns:
        list[str]: one lemma per input token, in input order.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return [str(lemmatizer.lemmatize(word)) for word in text]

def get_unique_words(preprocessed_data):
    """Return the upper-cased vocabulary of all records, without duplicates.

    Words are upper-cased *before* de-duplication, so inputs that differ
    only by case (e.g. ``'a'`` and ``'A'``) produce a single entry.  The
    result keeps first-occurrence order, walking the records in the
    mapping's iteration order.

    Args:
        preprocessed_data: mapping of record number -> iterable of word
            tokens.

    Returns:
        list[str]: the distinct upper-cased words.
    """
    # A dict preserves insertion order, giving an ordered de-duplication.
    unique_words = dict.fromkeys(
        word.upper()
        for words in preprocessed_data.values()
        for word in words
    )
    return list(unique_words)

In [33]:
def preprocess(data):
    """Run the full text pipeline on every record.

    Each record's text goes through tokenize(), then lemmatize(), and the
    resulting lemmas are upper-cased.

    Args:
        data: mapping of record number -> raw text.

    Returns:
        dict: record number -> list of upper-cased lemmas.
    """
    return {
        rec_num: [lemma.upper() for lemma in lemmatize(tokenize(text))]
        for rec_num, text in data.items()
    }

# Tokenize, lemmatize and upper-case the text of every record.
pp_data = preprocess(cf74_data)

In [34]:
# Vocabulary: the distinct words found across all preprocessed records.
unique_words_list = get_unique_words(pp_data)

In [35]:
# Display the full record number -> token list mapping (long output).
pp_data

{'00001 ': ['SIGNIFICANCE',
  'PSEUDOMONAS',
  'AERUGINOSA',
  'INFECTION',
  'RESPIRATORY',
  'TRACT',
  'CYSTIC',
  'FIBROSIS',
  'PATIENT',
  'STUDIED',
  'MEAN',
  'IMMUNOELECTROPHORETICAL',
  'ANALYSIS',
  'PATIENT',
  'SERUM',
  'NUMBER',
  'PRECIPITIN',
  'PSEUDOMONAS',
  'AERUGINOSA',
  'CONCENTRATION',
  '16',
  'SERUM',
  'PROTEIN',
  'ADDITION',
  'CLINICAL',
  'RADIOGRAPHICAL',
  'STATUS',
  'LUNG',
  'EVALUATED',
  'USING',
  'SCORING',
  'SYSTEM',
  'PRECIPITIN',
  'PSEUDOMONAS',
  'AERUGINOSA',
  'DEMONSTRATED',
  'SERUM',
  'MAXIMUM',
  'NUMBER',
  'ONE',
  'SERUM',
  '22',
  'CONCENTRATION',
  '12',
  'SERUM',
  'PROTEIN',
  'SIGNIFICANTLY',
  'CHANGED',
  'COMPARED',
  'MATCHED',
  'CONTROL',
  'PERSON',
  'NOTABLY',
  'IGG',
  'IGA',
  'ELEVATED',
  'ACUTE',
  'PHASE',
  'PROTEIN',
  'CHANGED',
  'LATTER',
  'SUGGESTING',
  'ACTIVE',
  'TISSUE',
  'DAMAGE',
  'CONCENTRATION',
  'ACUTE',
  'PHASE',
  'PROTEIN',
  'NOTABLY',
  'HAPTOGLOBIN',
  'CORRELATED',
  'NUMBER',

### Building Inverted Index

In [36]:
def gli(words, preprocessed_data):
    """Build an inverted index mapping each word to its postings.

    A record number appears in a word's posting list once per occurrence of
    that word in the record, so the list length encodes term frequency.
    Words that occur in no record are absent from the index.

    Tokens are counted once per record, so each vocabulary lookup is a
    constant-time dictionary access rather than two scans of the token list.

    Args:
        words: iterable of vocabulary words to index.
        preprocessed_data: mapping of record number -> list of word tokens.
            Token lists are expected; a plain string would be counted
            character by character.

    Returns:
        dict: word -> list of record numbers, ordered by record iteration
        order.  Keys are inserted in order of the first record containing
        them, then by their position in ``words``.
    """
    inverted_index = {}
    for rec_num, tokens in preprocessed_data.items():
        term_counts = Counter(tokens)
        for word in words:
            occurrences = term_counts.get(word, 0)
            if occurrences:
                inverted_index.setdefault(word, []).extend([rec_num] * occurrences)
    return inverted_index


In [37]:
# Display the whole vocabulary list (long output).
unique_words_list

['SIGNIFICANCE',
 'PSEUDOMONAS',
 'AERUGINOSA',
 'INFECTION',
 'RESPIRATORY',
 'TRACT',
 'CYSTIC',
 'FIBROSIS',
 'PATIENT',
 'STUDIED',
 'MEAN',
 'IMMUNOELECTROPHORETICAL',
 'ANALYSIS',
 'SERUM',
 'NUMBER',
 'PRECIPITIN',
 'CONCENTRATION',
 '16',
 'PROTEIN',
 'ADDITION',
 'CLINICAL',
 'RADIOGRAPHICAL',
 'STATUS',
 'LUNG',
 'EVALUATED',
 'USING',
 'SCORING',
 'SYSTEM',
 'DEMONSTRATED',
 'MAXIMUM',
 'ONE',
 '22',
 '12',
 'SIGNIFICANTLY',
 'CHANGED',
 'COMPARED',
 'MATCHED',
 'CONTROL',
 'PERSON',
 'NOTABLY',
 'IGG',
 'IGA',
 'ELEVATED',
 'ACUTE',
 'PHASE',
 'LATTER',
 'SUGGESTING',
 'ACTIVE',
 'TISSUE',
 'DAMAGE',
 'HAPTOGLOBIN',
 'CORRELATED',
 'MANY',
 'ACCOMPANIED',
 'RESULT',
 'INDICATE',
 'PROTECTIVE',
 'VALUE',
 'SALIVARY',
 'AMYLASE',
 'LEVEL',
 'DETERMINED',
 'NORMAL',
 'SUBJECT',
 'BIRTH',
 'ADULT',
 'LIFE',
 'CHILD',
 'CONDITION',
 'SOMETIMES',
 'ASSOCIATED',
 'LOW',
 'PANCREATIC',
 'MALNUTRITION',
 'COELIAC',
 'DISEASE',
 'MIXED',
 'SALIVA',
 'COLLECTED',
 'CAREFULLY',
 'STAND

In [38]:
# Build the inverted index over the full vocabulary and print it in full
# ("teste" is Portuguese for "test").
teste = gli(unique_words_list, pp_data)
print(teste)

{'SIGNIFICANCE': ['00001 ', '00074 ', '00078 ', '00121 ', '00147 ', '00157 '], 'PSEUDOMONAS': ['00001 ', '00001 ', '00001 ', '00007 ', '00008 ', '00018 ', '00018 ', '00061 ', '00061 ', '00062 ', '00062 ', '00062 ', '00062 ', '00062 ', '00079 ', '00080 ', '00081 ', '00081 ', '00081 ', '00081 ', '00082 ', '00082 ', '00082 ', '00123 ', '00123 ', '00123 ', '00135 ', '00148 ', '00152 ', '00159 ', '00160 ', '00161 '], 'AERUGINOSA': ['00001 ', '00001 ', '00001 ', '00006 ', '00006 ', '00006 ', '00006 ', '00006 ', '00006 ', '00007 ', '00007 ', '00007 ', '00007 ', '00007 ', '00007 ', '00008 ', '00008 ', '00008 ', '00008 ', '00008 ', '00018 ', '00018 ', '00062 ', '00079 ', '00080 ', '00080 ', '00081 ', '00110 ', '00123 ', '00123 ', '00135 ', '00148 ', '00159 ', '00159 ', '00159 ', '00160 ', '00161 ', '00161 ', '00161 '], 'INFECTION': ['00001 ', '00001 ', '00001 ', '00006 ', '00006 ', '00006 ', '00016 ', '00018 ', '00048 ', '00048 ', '00057 ', '00058 ', '00058 ', '00061 ', '00062 ', '00062 ', '000

In [39]:
cfg_PC = configparser.ConfigParser()
