In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import io
import sys
from collections import Counter

In [32]:
def read_conll(filepath):
    """Read a tab-separated, one-token-per-line CoNLL/IOB2 file.

    Each non-blank line is expected to hold ``token<TAB>label`` (extra
    tab-separated columns are ignored). Blank or whitespace-only lines
    separate sentences, and lines starting with ``-DOCSTART-`` are skipped.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sents, all_toks, all_cats)`` tuple where ``sents`` is a list of
        sentences (each a list of ``(token, label)`` tuples), ``all_toks`` is
        the flat list of every token and ``all_cats`` the flat list of every
        label, both in file order.

    Raises:
        IndexError: If a data line has no tab-separated label column.
    """
    all_toks = []
    all_cats = []
    sents = []
    with io.open(filepath, encoding='utf-8') as ip:
        sent = []
        for line in ip:
            line = line.strip()
            if not line:
                # Sentence boundary. Only flush a non-empty sentence so that
                # repeated blank lines do not yield empty sentences.
                if sent:
                    sents.append(sent)
                    sent = []
                continue
            if line.startswith('-DOCSTART-'):
                # Document header marker, not a token.
                continue
            fields = line.split('\t')
            token = fields[0]
            cat = fields[1]
            all_toks.append(token)
            all_cats.append(cat)
            sent.append((token, cat))
        # Flush the trailing sentence when the file does not end with a
        # blank line; otherwise it would be silently dropped.
        if sent:
            sents.append(sent)
    return sents, all_toks, all_cats

def get_tags_ents(all_cats):
    """Collect the distinct tag prefixes and entity types from labels.

    Every label is split on its FIRST hyphen only, so a label such as
    ``B-cell-line`` yields prefix ``B`` and entity type ``cell-line``.
    Labels without a hyphen (e.g. ``O``) contribute a prefix only.

    Args:
        all_cats: Iterable of label strings.

    Returns:
        A ``(tags, ents)`` tuple of sets: the distinct prefixes and the
        distinct entity types.
    """
    tags = set()
    ents = set()
    for cat in set(all_cats):
        prefix, hyphen, entity = cat.partition('-')
        tags.add(prefix)
        if hyphen:
            # Keep everything after the first hyphen so that entity names
            # containing hyphens are not truncated.
            ents.add(entity)

    return tags, ents

In [33]:
# Load the training split: per-sentence (token, label) lists plus the flat
# token and label lists returned by read_conll.
print('processing train file...')
(train_sents,
 train_all_toks,
 train_all_cats) = read_conll('../../../resources/data/NER/biomedical/train.iob2')

# Load the test split the same way.
print('processing testing file...')
(test_sents,
 test_all_toks,
 test_all_cats) = read_conll('../../../resources/data/NER/biomedical/test.iob2')

processing train file...
processing testing file...


In [34]:
import random

# Fraction of the training sentences to hold out as a validation set.
valid_prop = 0.1
valid_size = int(valid_prop*len(train_sents))

# Draw the held-out sentence indices from a private, seeded generator so the
# split is the same on every run and the global `random` state is untouched.
split_seed = 42
random_indices = random.Random(split_seed).sample(range(len(train_sents)), valid_size)

In [35]:
# Sentences at the sampled indices become the validation set; the rest stay
# in the training set. Filtering in plain Python avoids converting the
# variable-length sentence lists into a NumPy array, which newer NumPy
# versions reject as a ragged array.
valid_index_set = set(random_indices)
valid_sents = [train_sents[idx] for idx in random_indices]
# NOTE: train_sents is rebound here, so re-running this cell without
# reloading the data removes sentences from the already-reduced list.
train_sents = [sent for idx, sent in enumerate(train_sents) if idx not in valid_index_set]

In [36]:
# Report how many sentences ended up in each split.
for _label, _split in (('validation sents: ', valid_sents),
                       ('training sentences: ', train_sents)):
    print(_label, len(_split))

validation sents:  1854
training sentences:  16692


In [37]:
def write_conll(filepath, data):
    """Write sentences to a UTF-8, tab-separated, one-token-per-line file.

    The file starts with a ``-DOCSTART-`` line. Every sentence is preceded
    by a blank line and written as one ``token<TAB>label`` line per item.
    Progress messages are printed before and after writing.

    Args:
        filepath: Destination path; an existing file is overwritten.
        data: Iterable of sentences, each an iterable of indexable items
            whose first two elements are the token and label strings.
    """
    print('starting writing..')
    with io.open(filepath, 'w', encoding='utf-8') as out:
        out.write('-DOCSTART-\n')
        for sentence in data:
            # The blank separator goes BEFORE each sentence, so the file
            # does not end with a blank line.
            out.write('\n')
            out.writelines(pair[0] + '\t' + pair[1] + '\n' for pair in sentence)
    print('finished writing..')

In [38]:
# Persist the reduced training split.
# NOTE(review): this is the same path the training data was read from, so
# the source file is overwritten and a later run starts from the reduced
# set. Consider writing to a separate path or keeping a backup.
write_conll(
    filepath='../../../resources/data/NER/biomedical/train.iob2',
    data=train_sents,
)

starting writing..
finished writing..


In [39]:
# Persist the held-out validation split next to the training file.
write_conll(
    filepath='../../../resources/data/NER/biomedical/valid.iob2',
    data=valid_sents,
)

starting writing..
finished writing..
