In [11]:
import configparser
import nltk
from nltk.tokenize import word_tokenize
import xml.etree.ElementTree as ET
import unicodedata
import re

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rayss\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Reading file

In [12]:
f = open("data/cf74.xml", "r")        
doc_tree = ET.parse(f)
cf74_root = doc_tree.getroot()

def get_doc_data(doc_root):
    doc_data = {}
    for rec_element in doc_root.findall('RECORD'):
        rec_num = rec_element.find('RECORDNUM').text
        abstract_element = rec_element.find('ABSTRACT')
        if abstract_element is not None:
            abstract_content = abstract_element.text
        else:
            abstract_element = rec_element.find('EXTRACT')
            abstract_content = abstract_element.text if abstract_element is not None else "None"
        doc_data[rec_num] = abstract_content
    return doc_data


In [25]:
cf74_data = get_doc_data(cf74_root)
cf74_data['00037 ']

'Neonatal peritonitis remains a dangerous condition.  Factors\ncomplicating diagnosis and management include its low incidence, the\ndiversity of aetiological factors found, and the severity of\nunderlying or associated disease.  28 cases presenting since 1957 at\nSouthmead General Hospital were reviewed.  Thirteen aetiological\nfactors relating to mother or baby were found.  13 infants were\npremature, 10 had severe respiratory problems at birth.  Maternal\nhydramnios was present in 7.  6 babies had had exchange transfusion.\nThe commonest findings were gangrenous volvulus and cystic fibrosis.\nCombined factors were often present.  The usual presentation is with\nbilious vomiting, abdominal distension, and failure to pass normal\nmeconium.  4 cases had distended abdomens at birth.  Plain abdominal\nx-rays are of great diagnostic value.  Pneumoperitoneum is diagnostic\nbut absent from early films.  Treatment is surgical.  Close liaison\nwith the paediatrician who sees the baby first is

### Pre-processing texts

In [14]:
WORD_MIN_LENGTH = 2
STOP_WORDS_ENG = nltk.corpus.stopwords.words('english')

def strip_accents(text):
    nfkd = unicodedata.normalize('NFKD', text)
    stripped_text = u"".join([c for c in nfkd if not unicodedata.combining(c)])
    return re.sub('[^a-zA-Z0-9 \\\']', ' ', stripped_text)

def tokenize(text):
    text = strip_accents(text)
    text = re.sub(re.compile('\n'),' ',text)
    words = word_tokenize(text)
    words = [word.upper() for word in words]
    words = [word for word in words if word not in STOP_WORDS_ENG and len(word) >= WORD_MIN_LENGTH]
    return words

In [20]:
cf74_data.items()

dict_items([('00001 ', ['THE', 'SIGNIFICANCE', 'OF', 'PSEUDOMONAS', 'AERUGINOSA', 'INFECTION', 'IN', 'THE', 'RESPIRATORY', 'TRACT', 'OF', 'CYSTIC', 'FIBROSIS', 'PATIENTS', 'HAVE', 'BEEN', 'STUDIED', 'BY', 'MEANS', 'OF', 'IMMUNOELECTROPHORETICAL', 'ANALYSIS', 'OF', 'PATIENTS', 'SERA', 'FOR', 'THE', 'NUMBER', 'OF', 'PRECIPITINS', 'AGAINST', 'PSEUDOMONAS', 'AERUGINOSA', 'AND', 'THE', 'CONCENTRATIONS', 'OF', '16', 'SERUM', 'PROTEINS', 'IN', 'ADDITION', 'THE', 'CLINICAL', 'AND', 'RADIOGRAPHICAL', 'STATUS', 'OF', 'THE', 'LUNGS', 'HAVE', 'BEEN', 'EVALUATED', 'USING', 'SCORING', 'SYSTEMS', 'PRECIPITINS', 'AGAINST', 'PSEUDOMONAS', 'AERUGINOSA', 'WERE', 'DEMONSTRATED', 'IN', 'ALL', 'SERA', 'THE', 'MAXIMUM', 'NUMBER', 'IN', 'ONE', 'SERUM', 'WAS', '22', 'THE', 'CONCENTRATIONS', 'OF', '12', 'OF', 'THE', 'SERUM', 'PROTEINS', 'WERE', 'SIGNIFICANTLY', 'CHANGED', 'COMPARED', 'WITH', 'MATCHED', 'CONTROL', 'PERSONS', 'NOTABLY', 'IGG', 'AND', 'IGA', 'WERE', 'ELEVATED', 'AND', 'THE', 'ACUTE', 'PHASE', 'PRO

In [19]:
def preprocess(data):
    for rec_num, text in data.items():
        data[rec_num] = tokenize(text)
    return data

pp_test = preprocess(cf74_data)
pp_test

TypeError: normalize() argument 2 must be str, not None

In [17]:
def remove_stop_words(text, stop_words):
    frases = []
    for (words, emotion) in text:
        semstop = [p for p in words.split() if p not in stop_words]

        frases.append((semstop, emotion))

    return frases

def stem(text, ):
    stemmer = nltk.stem.RSLPStemmer()
    stemmed_phrases = []
    for (words, emotion) in text:
        with_stemming = [str(stemmer.stem(p)) for p in words.split() if p not in stop_words]
        stemmed_phrases.append((with_stemming, emotion))
    return stemmed_phrases

In [18]:
cfg_PC = configparser.ConfigParser()
