#### Read data and test for sentence-border annotations whose length is not 1

In [9]:
import json
import glob
from itertools import chain
from fn import F


def read_frames(fp) -> list:
    """Load a JSON file and return the value stored under its 'frames' key.

    Args:
        fp: Path of the JSON file to read.

    Returns:
        The decoded 'frames' value. Callers in this notebook flatten and
        iterate it as a list of frame dicts.

    Raises:
        KeyError: If the top-level JSON object has no 'frames' key.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which would garble non-ASCII text on some platforms.
    with open(fp, encoding='utf-8') as buffer:
        return json.loads(buffer.read())['frames']


# Flatten the frames of every JSON file into one list per collection.
# glob.glob() returns paths in a file-system dependent order, so the paths are
# sorted first to keep the frame order (and everything indexed from it)
# reproducible across machines and runs.
titles = (F(map, read_frames) >> chain.from_iterable >> list)(sorted(glob.glob('titles/*.json')))
bodies = (F(map, read_frames) >> chain.from_iterable >> list)(sorted(glob.glob('bodies/*.json')))

In [19]:
def ismalformed(annotation):
    """Tell whether an annotation spans anything other than exactly one position.

    ``annotation`` is unpacked as a three-item sequence: the first two items
    are used as start and end offsets, the third is ignored.
    """
    begin, finish, _ignored = annotation
    width = finish - begin
    return width != 1


def has_malformed(frame: dict) -> bool:
    """Return True if at least one annotation in ``frame['anno']`` is malformed.

    Args:
        frame: Mapping with an 'anno' entry holding the annotations to check;
            each one is passed to ``ismalformed``.

    Returns:
        True as soon as ``ismalformed`` accepts one annotation, else False
        (also False when there are no annotations).
    """
    return any(map(ismalformed, frame['anno']))


# Keep only the frames that contain at least one malformed annotation.
titles_malformed_borders = [frame for frame in titles if has_malformed(frame)]
bodies_malformed_borders = [frame for frame in bodies if has_malformed(frame)]

# Number of affected frames: titles first, then bodies.
print(len(titles_malformed_borders), len(bodies_malformed_borders))

0 3


In [23]:
# investigate malformed sentence borders in bodies_malformed_borders

def extract_annotations(text, annotations):
    """Collect the slice of ``text`` covered by each annotation.

    Each annotation is unpacked as (start, stop, <ignored>) and contributes
    ``text[start:stop]`` to the returned list, in iteration order.
    """
    snippets = []
    for begin, finish, _ in annotations:
        snippets.append(text[begin:finish])
    return snippets

# For every affected body frame, collect the text covered by its malformed
# annotations only.
malformed_borders = [
    extract_annotations(frame['text'], filter(ismalformed, frame['anno']))
    for frame in bodies_malformed_borders
]

# Print the list built above. The cell previously printed `malformed_sentences`,
# a name that is never defined and raises NameError on a fresh kernel.
print(malformed_borders)

[[': '], [': '], ['t ']]


It seems that these hanging (trailing) spaces are the only problem – we can live with that.

In [43]:
from typing import List, Tuple, Iterable
from functools import reduce
from itertools import takewhile, dropwhile
import re

# Matches a single whitespace character. Written as a raw string so that the
# backslash reaches the regex engine verbatim; '\s' in a normal string literal
# is an invalid escape sequence and triggers a warning on modern Python.
WS = re.compile(r'\s')

def borders(text, annotations) -> List[Tuple[int, int]]:
    """Turn sentence-border annotations into whitespace-trimmed sentence intervals.

    The text is cut at the end offset of every annotation, in the order given.
    Each segment starts where the previous trimmed interval ended (0 for the
    first one); leading and trailing whitespace is then excluded from the
    reported interval.

    Args:
        text: The string the annotations refer to.
        annotations: Iterable of three-item annotations; only the second item
            (the end offset) is used.

    Returns:
        One ``(begin, end)`` pair per annotation, such that ``text[begin:end]``
        has no surrounding whitespace. A segment consisting only of whitespace
        (or an empty one) yields an empty interval with ``begin == end``
        instead of an inverted one. Text after the last annotation's end
        offset is not covered.
    """
    def spacecount(characters: Iterable[str]) -> int:
        """Count the whitespace characters at the head of ``characters``."""
        # str.isspace is the whitespace definition the `re` module uses for
        # `\s` on str patterns, so this equals a per-character regex match.
        return sum(1 for _ in takewhile(str.isspace, characters))

    def accumulate(acc: List, border_end):
        # `border_end` deliberately avoids the name of the builtin breakpoint().
        start = acc[-1][1] if acc else 0
        sentence = text[start:border_end]
        begin = start + spacecount(sentence)
        # For a whitespace-only segment both counts equal len(sentence), which
        # would place the end before the beginning; clamp to an empty interval.
        end = max(begin, start + len(sentence) - spacecount(reversed(sentence)))
        acc.append((begin, end))
        return acc

    return reduce(accumulate, (end for _, end, _ in annotations), [])


In [52]:
# i = 2
# sentences = borders(bodies_malformed_borders[i]['text'], bodies_malformed_borders[i]['anno'])
# [bodies_malformed_borders[i]['text'][start:end] for start, end in sentences]


['Abstract',
 '1. Upamostat (Mesupron®) is a new small molecule serine protease inhibitor.',
 'The drug candidate was developed to inhibit the urokinase-type plasminogen activator (uPA) system, which plays a major role in tumor invasion and metastasis.',
 'Upamostat is currently in clinical development as an anti-metastatic and non-cytotoxic agent against pancreatic and breast cancer.',
 '2. Upamostat is the orally available amidoxime- (i.e. hydroxyamidine-) prodrug of the pharmacologically active form, WX-UK1.',
 'In this study, the reductive enzymatic activation of upamostat to its corresponding amidine WX-UK1 was analyzed.',
 '3. The recently discovered molybdenum enzyme "mitochondrial Amidoxime Reducing Component" (mARC) catalyses together with its electron transport proteins cytochrome b(5) and NADH cytochrome b(5) reductase the reduction of N-hydroxylated prodrugs.',
 'In vitro biotransformation assays with porcine subcellular fractions and the reconstituted human enzymes demonst

In [56]:
# Map every frame id to the trimmed sentence intervals of its text,
# separately for titles and for bodies.
title_borders = dict(
    (frame['id'], borders(frame['text'], frame['anno'])) for frame in titles
)
body_borders = dict(
    (frame['id'], borders(frame['text'], frame['anno'])) for frame in bodies
)

In [57]:
# Share of ids present in both mappings (intersection size over union size);
# 1.0 means titles and bodies cover exactly the same ids.
len(set(title_borders) & set(body_borders)) / len(set(title_borders) | set(body_borders))

1.0

In [58]:
# One (id, title intervals, body intervals) triple per id, in the iteration
# order of title_borders.
merged = [
    (key, intervals, body_borders[key])
    for key, intervals in title_borders.items()
]

In [60]:
# Write one tab-separated row per sentence interval: id, a section marker
# ('T' for title rows, 'A' for body rows) and the interval as 'start:end'.
with open('borders_all.tsv', 'w') as out:
    # The loop targets must not be called title_borders / body_borders: a `for`
    # target rebinds the name in the notebook namespace and would replace the
    # id -> intervals dicts of the same name, breaking any re-run of the
    # cells that read them.
    for id_, title_intervals, body_intervals in merged:
        for title in title_intervals:
            print(id_, 'T', '{}:{}'.format(*title), sep='\t', file=out)
        for body in body_intervals:
            print(id_, 'A', '{}:{}'.format(*body), sep='\t', file=out)
