In [1]:
import os
import annotations
from annotations import Entity, Relation
import utils
import warnings

In [2]:
# Directory holding the annotation files; get_annotations() reads
# "<file_id>.ann" from here.
DATA_DIR = 'data/'

In [3]:
# List the data directory through DATA_DIR so the listing cannot drift from
# the directory that get_annotations() reads from.
files = os.listdir(DATA_DIR)

# A document ID is the file name minus its final extension; the set collapses
# several files that share one ID into a single entry. Names without any
# extension (hidden files, sub-directories) produce an empty ID and are dropped.
# Sorting makes the order -- and therefore file_ids[0] -- reproducible across
# kernel restarts, which a bare set does not guarantee.
file_ids = sorted({name.rpartition('.')[0] for name in files} - {''})

In [4]:
def get_annotations(file_id, data_dir=None):
    """Parse the ``.ann`` file for ``file_id`` into Entity and Relation objects.

    Every non-empty line is split on tabs and dispatched on the first character
    of its ID:

    * ``T`` -- entity: ``ID<TAB>TYPE START END[;START END...]<TAB>TEXT``
    * ``R`` -- relation: ``ID<TAB>TYPE ROLE:ENTITY_ID ROLE:ENTITY_ID``

    Any other line produces a warning and is skipped. (The layout resembles the
    brat standoff format -- NOTE(review): confirm against the data source.)

    Parameters
    ----------
    file_id : str
        File name without the ``.ann`` extension.
    data_dir : str, optional
        Directory containing the file. Defaults to the notebook-level
        ``DATA_DIR`` when omitted.

    Returns
    -------
    dict
        ``{"entities": {entity_id: Entity}, "relations": {relation_id: Relation}}``.

    Raises
    ------
    AssertionError
        If an entity or relation line has an unexpected number of
        tab-separated fields.
    ValueError
        If a character range contains something that is not an integer.
    KeyError
        If a relation references an entity ID that never appears in the file.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    # The context manager closes the handle even if reading raises.
    with open(os.path.join(data_dir, file_id + '.ann')) as f:
        raw_data = f.read().split('\n')

    entities = {}
    relations = {}

    # Relations seen before both of their entities; resolved after the main pass.
    relation_backlog = []

    for raw_line in raw_data:
        # Drop empty fields (blank lines, trailing tabs).
        fields = [field for field in raw_line.split('\t') if field]

        if not fields:
            continue

        ann_id = fields[0]

        if ann_id[0] == 'T':
            assert len(fields) == 3, \
                "Entity line must have 3 tab-separated fields: " + str(fields)

            # The first word is the entity type, the rest are the character ranges.
            entity_type, _, char_ranges = fields[1].partition(' ')

            ent = Entity(entity_id=ann_id, entity_type=entity_type)

            # Discontinuous entities list several "start end" ranges separated by ';'.
            for char_range in char_ranges.split(';'):
                ent.add_range([int(pos) for pos in char_range.split(' ')])

            ent.set_text(fields[2])
            entities[ann_id] = ent

        elif ann_id[0] == 'R':
            assert len(fields) == 2, \
                "Relation line must have 2 tab-separated fields: " + str(fields)

            rel_details = fields[1].split(' ')
            rel_type = rel_details[0]
            # Arguments look like "Role:EntityID"; only the entity ID is kept.
            entity1 = rel_details[1].split(':')[-1]
            entity2 = rel_details[2].split(':')[-1]

            if entity1 in entities and entity2 in entities:
                relations[ann_id] = Relation(relation_id=ann_id, relation_type=rel_type,
                                             arg1=entities[entity1], arg2=entities[entity2])
            else:
                # At least one entity has not been parsed yet; resolve it later.
                relation_backlog.append((ann_id, rel_type, entity1, entity2))

        else:
            # If the annotation is not a relation or entity, warn user
            warnings.warn("Invalid annotation encountered: " + str(fields))

    for rel_id, rel_type, entity1, entity2 in relation_backlog:
        for entity_id in (entity1, entity2):
            if entity_id not in entities:
                raise KeyError("Relation " + rel_id +
                               " references unknown entity " + entity_id)

        relations[rel_id] = Relation(relation_id=rel_id, relation_type=rel_type,
                                     arg1=entities[entity1], arg2=entities[entity2])

    return {"entities": entities, "relations": relations}

In [5]:
# Parse a single file first as a quick check of the parser's output.
sample_ann = get_annotations(file_ids[0])

In [7]:
# Inspect one parsed entity, looked up by its annotation ID.
sample_ann['entities']['T1']


ID: T1
Entity name: Drug
Character ranges: 111 120
Entity text: Pronestyl

In [8]:
# Inspect one parsed relation, looked up by its annotation ID.
sample_ann['relations']['R1']


ID: R1
Relation type: Reason-Drug

Entity 1: 

ID: T10
Entity name: Reason
Character ranges: 2809 2817
Entity text: diuresis

Entity 2: 

ID: T11
Entity name: Drug
Character ranges: 2861 2866
Entity text: Lasix

In [9]:
# Parse every annotation file, keeping results in the same order as file_ids.
tags = []
total_files = len(file_ids)

for done, current_id in enumerate(file_ids, start=1):
    tags.append(get_annotations(current_id))
    # Report how many files have been processed so far.
    utils.drawProgressBar(done, total_files)







In [10]:
# Persist the parsed annotations for all files. Per the recorded cell output,
# the helper wrote them to processed_data/tags.pkl.
utils.save_pickle('processed_data/tags', tags)

Variable successfully saved in processed_data/tags.pkl
