## Sentence Modality Notebook

### Imported libraries

In [1]:
import os
import urllib.request
import zipfile
import xml.etree.ElementTree as ET

import tensorflow as tf
from keras.layers import Embedding,LSTM,Dense
from keras.callbacks import TensorBoard,ModelCheckpoint,Callback
from keras import backend as K

  return f(*args, **kwds)
Using TensorFlow backend.


In [3]:
# Directory where downloaded and extracted data files are stored.
# Keep the trailing slash so that `dl_PATH + name` style concatenation
# still yields a valid path.
dl_PATH = './downloads/'

### Functions for downloading and unzipping modality data
All data is saved to the `downloads` directory so that it is not uploaded to GitHub.

In [4]:
def maybe_download(url, name, dest_dir=None):
    """Download ``url`` into ``dest_dir`` as ``name`` unless it is already there.

    Args:
        url: Address of the file to fetch.
        name: File name under which the download is stored.
        dest_dir: Directory to store the file in. Defaults to the
            module-level ``dl_PATH``. It is created when missing.

    Returns:
        None. Progress and failures are reported on stdout; a failed
        download does not raise, so the rest of the notebook keeps running.
    """
    if dest_dir is None:
        dest_dir = dl_PATH

    os.makedirs(dest_dir, exist_ok=True)
    target = os.path.join(dest_dir, name)

    if os.path.isfile(target):
        print(name+' already downloaded.')
        return

    try:
        urllib.request.urlretrieve(url, target)
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still work.
        # Remove any partially written file: otherwise the next run would
        # report it as "already downloaded" and use a truncated archive.
        if os.path.isfile(target):
            os.remove(target)
        print('Error downloading '+name+'.')
        print('Reason: {!r}'.format(err))
    else:
        print(name+' successfully downloaded.')
        
def maybe_unzip(zname, dest_dir=None):
    """Extract the archive ``zname`` unless its contents are already present.

    The archive is considered "already unzipped" when every member listed
    in it exists below ``dest_dir``; this works for any archive, not only
    the task-1 training data.

    Args:
        zname: File name of the zip archive, located inside ``dest_dir``.
        dest_dir: Directory that holds the archive and receives the
            extracted files. Defaults to the module-level ``dl_PATH``.

    Returns:
        None. Status is reported on stdout.
    """
    if dest_dir is None:
        dest_dir = dl_PATH

    zip_path = os.path.join(dest_dir, zname)
    if not os.path.isfile(zip_path):
        # Typically means the preceding download failed.
        print(zname+' not found, nothing to unzip.')
        return

    with zipfile.ZipFile(zip_path, 'r') as zipref:
        missing = [member for member in zipref.namelist()
                   if not os.path.exists(os.path.join(dest_dir, member))]
        if missing:
            zipref.extractall(dest_dir)
        else:
            print(zname+' already unzipped.')

In [5]:
# Fetch the task-1 "bio" training archive (skipped when it is already on disk).
maybe_download('http://rgai.inf.u-szeged.hu/~vinczev/conll2010st/task1_train_bio_rev2.zip', 'task1_train_bio.zip')
# The archive is expected to contain 'task1_train_bio_abstracts_rev2.xml'
# and 'task1_train_bio_fullarticles_rev2.xml'.
maybe_unzip('task1_train_bio.zip')

task1_train_bio.zip already downloaded.
task1_train_bio.zip already unzipped.


### Class and function definitions for handling .xml data files
The data is provided in XML format and is parsed here into an `ElementTree`. An element tree behaves much like a nested list structure with a few extra methods.


In [6]:
class TextData(object):
    """Thin wrapper around a parsed XML corpus file.

    The accessors rely on positional indexing into the tree:
    ``root[0]`` holds the documents; inside a document, the children from
    index 1 onward are the parts whose children are sentence elements.
    NOTE(review): this layout is inferred from the indexing used below;
    confirm it against the data files.
    """

    def __init__(self, xml):
        """Parse the file named ``xml`` inside the ``dl_PATH`` directory."""
        # Hand the path to the parser instead of a text-mode file object so
        # the encoding declared in the XML header is used rather than the
        # platform's default locale encoding.
        self.ETree = ET.parse(os.path.join(dl_PATH, xml))

    def totaldocNo(self):
        """Return the number of documents in the corpus."""
        return len(self.get_docs())

    def totsentNo(self):
        """Return the number of sentences in the corpus.

        Counts exactly the sentences returned by ``get_sentences`` so the
        two methods always agree.
        """
        return len(self.get_sentences())

    def get_docs(self, start=None, stop=None):
        """Return the document elements in the slice ``[start:stop]``.

        With both arguments left as ``None`` all documents are returned.
        """
        return self.ETree.getroot()[0][start:stop]

    @staticmethod
    def tosent(doc):
        """Return the children of the document's third child (``doc[2]``).

        Declared static so it can be called both as ``TextData.tosent(doc)``
        and on an instance.
        """
        return doc[2][:]

    def get_sentences(self):
        """Return a flat list of every sentence element in every document.

        The first child of each document (index 0) is skipped; the children
        of all remaining parts are collected in document order.
        """
        return [sent
                for doc in self.get_docs()
                for part in doc[1:]
                for sent in part]

def toString(sentElement):
    """Return the complete text of a sentence element as one string.

    Concatenates the element's own text with the text and tail of every
    descendant, in document order. Missing (``None``) text or tail pieces
    are skipped, so a cue at the very end of a sentence or a sentence that
    starts with a cue does not raise. An element without any text yields
    the empty string.

    Args:
        sentElement: An ``xml.etree.ElementTree.Element`` for one sentence.

    Returns:
        str: The sentence text with all markup removed.
    """
    # itertext() replaces the manual loop over getchildren(), which was
    # removed from ElementTree in Python 3.9.
    return ''.join(sentElement.itertext())

def toStrings(sentElements):
    """Convert an iterable of sentence elements to a list of plain strings.

    Args:
        sentElements: Iterable of sentence elements accepted by ``toString``.

    Returns:
        list: One string per element, in input order.
    """
    return [toString(element) for element in sentElements]
    
def isCertain(sentElement):
    """Tell whether a sentence element is annotated as certain.

    Returns ``True`` when the element's ``certainty`` attribute equals
    ``'certain'`` and ``False`` for any other value. A missing attribute
    raises ``KeyError``.
    """
    certainty = sentElement.attrib['certainty']
    return certainty == 'certain'
    
def get_cues(sentElement):
    """Return the direct child elements of a sentence element as a list.

    Uses ``list(element)`` because ``Element.getchildren()`` was removed
    in Python 3.9.
    """
    return list(sentElement)

def num_words(string):
    """Count whitespace-separated words, treating '/' as a separator too.

    Args:
        string: Text to count words in. ``None`` or an empty string counts
            as zero words, so ``Element.text`` / ``Element.tail`` values
            can be passed directly even when they are missing.

    Returns:
        int: Number of words.
    """
    if not string:
        return 0
    return len(string.replace('/', ' ').split())

def cue_positions(sentElement):
    """Return the word index at which each cue of a sentence starts.

    Starts from the total word count of the sentence and walks the cues
    from last to first, subtracting the words that follow each cue (its
    tail) and the cue's own words; what remains is the number of words
    preceding the cue, i.e. its 0-based start index.

    NOTE(review): the arithmetic assumes a cue is separated from the
    surrounding text by whitespace or '/'; a cue glued to a neighbouring
    token would be counted differently in the total than in its parts.

    Args:
        sentElement: Sentence element whose direct children are the cues.

    Returns:
        list: Start indices in sentence order; empty when there are no cues.
    """
    pos = num_words(toString(sentElement))
    positions = []
    for cue in reversed(get_cues(sentElement)):
        # text/tail are None when absent (e.g. a cue ending the sentence).
        pos -= num_words(cue.tail or '')
        pos -= num_words(cue.text or '')
        positions.append(pos)
    positions.reverse()
    return positions


In [7]:
# Parse the abstracts training file (looked up inside dl_PATH by TextData).
data = TextData('task1_train_bio_abstracts_rev2.xml')

In [8]:
# Sanity check: show the first document element of the corpus.
print(data.get_docs(0,1)[0])


<Element 'Document' at 0x10c424d68>


In [9]:
# Flat list of all sentence elements across all documents.
sentences = data.get_sentences()

In [12]:
# Spot-check the helper functions on individual sentences.
print(num_words(toString(sentences[2])))  # word count of the third sentence
print(toString(sentences[2]))             # its plain text
print(get_cues(sentences[2]))             # its cue elements (empty list if none)
print(cue_positions(sentences[2]))        # word indices at which the cues start
print(isCertain(sentences[3]))            # certainty flag of the fourth sentence

31
Electrophoretic mobility shift assays and Southwestern blotting experiments were used to detect the binding of cellular transactivation factor NF-KB to the double repeat-KB enhancer sequence located in the long terminal repeat.
[]
[]
True
