In [1]:
import re
import pandas as pd
import sys
import random
from tqdm import tqdm
import spacy
from spacy.training.example import Example
from spacy.lang.en import English  # Or whichever language you need
from spacy.training import offsets_to_biluo_tags
import json
from spacy.tokens import DocBin
# Blank English pipeline created via spacy.blank.
# NOTE(review): get_tags() defines its own local `nlp`, so this global looks unused in the visible cells - confirm.
nlp = spacy.blank("en")
# Pretrained small English pipeline; get_tags() reads POS tags and sentence boundaries from it.
posnlp = spacy.load("en_core_web_sm")
# Empty DocBin container.
# NOTE(review): nothing in the visible cells adds to or saves `db` - confirm it is still needed.
db = DocBin()

In [2]:
# NCBI
# Put the corpus tooling directory on the import path so `ncbidisease` can be imported in this cell.
sys.path.insert(0, 'ncbi-disease/tools/')
# NOTE(review): the three data directories are also added to sys.path; presumably only tools/
# contains importable Python code - confirm whether these entries are required.
sys.path.insert(0, 'ncbi-disease/original-data/test/')
sys.path.insert(0, 'ncbi-disease/original-data/train/')
sys.path.insert(0, 'ncbi-disease/original-data/devel/')

from ncbidisease import load_ncbi_disease, read_ncbi_disease

In [3]:
# Relative paths of the development, training and test corpus files.
devdata, traindata, testdata = (
    'ncbi-disease/original-data/devel/NCBIdevelopset_corpus.txt',
    'ncbi-disease/original-data/train/NCBItrainset_corpus.txt',
    'ncbi-disease/original-data/test/NCBItestset_corpus.txt',
)

In [4]:
# Load each corpus split once; the short f_* names are aliases of the same objects
# and are what the dataset-building cells consume.
f_dev = dev_documents = load_ncbi_disease(devdata)
f_train = train_documents = load_ncbi_disease(traindata)
f_test = test_documents = load_ncbi_disease(testdata)

In [5]:
# Report how many items were loaded for each split (dev, train, test order).
print("dataset length: Dev, Train, Test", len(f_dev), len(f_train), len(f_test))

dataset length: Dev, Train, Test 100 593 100


In [6]:
def get_ner_input(text, annotation, labels=('DiseaseClass',)):
    """Convert standoff annotation lines into a spaCy-style NER training item.

    Each annotation line is tab-separated. Lines whose first field starts with
    'N' are skipped (presumably normalization records - confirm against the
    corpus tooling). For the remaining lines the second field is read as
    ``"<label> <start> <end>"`` separated by single spaces.

    Args:
        text: The document text the character offsets refer to.
        annotation: Iterable of standoff annotation lines (strings).
        labels: Entity label, or collection of labels, to keep. Defaults to
            only ``'DiseaseClass'``, which matches the previous hard-coded
            behaviour.

    Returns:
        ``[text, {"entities": [(start, end, label), ...]}]`` when at least one
        entity with a wanted label was found, otherwise an empty list.

    Raises:
        ValueError: If the offsets of a kept entity are not integers.
        IndexError: If a kept entity line has fewer than three space-separated
            parts in its second field.
    """
    # A bare string would otherwise be treated as a substring test by `in`.
    if isinstance(labels, str):
        labels = (labels,)

    entities = []
    for item in annotation:
        fields = item.split('\t')
        # Blank lines and lines without a tab carry no entity; skip them
        # instead of failing with IndexError.
        if len(fields) < 2 or not fields[0]:
            continue
        if fields[0][0] == 'N':
            continue
        parts = fields[1].split(' ')
        if parts[0] in labels:
            entities.append((int(parts[1]), int(parts[2]), parts[0]))

    if not entities:
        return []
    return [text, {"entities": entities}]

In [7]:
#
# Create dataset
#
def generate_dataset(f):
    """Build the list of NER training items for one corpus split.

    For every document in ``f`` the ``tiab`` text has its newlines replaced by
    single spaces, the standoff annotations are obtained from ``to_standoff()``
    and both are passed to ``get_ner_input``. Documents for which that call does
    not return a two-element ``[text, annotations]`` list are left out.

    Args:
        f: Iterable of documents exposing ``tiab`` and ``to_standoff()``.

    Returns:
        A list of ``[text, {"entities": [...]}]`` items.
    """
    dataset = []
    for document in f:
        # Swapping "\n" for " " keeps the text length unchanged.
        flat_text = ' '.join(document.tiab.split("\n"))
        standoff_lines = document.to_standoff()
        ner_item = get_ner_input(flat_text, standoff_lines)
        if len(ner_item) != 2:
            continue
        dataset.append(ner_item)
    return dataset

In [8]:
# Convert each loaded split into [text, {"entities": [...]}] items;
# documents without a kept entity are dropped by generate_dataset.
DEV_DATA = generate_dataset(f_dev)
TRAIN_DATA = generate_dataset(f_train)
TEST_DATA = generate_dataset(f_test)

In [9]:
# Select the split that the tagging and CSV export cells below operate on.
# The output file name used later ('NCBITraining_tagged.csv') is hard-coded, so change both together.
DATA = TRAIN_DATA

In [10]:
# Number of items in the selected split, i.e. documents that kept at least one entity.
len(DATA)

320

In [11]:
#DATA

In [12]:
def get_tags(DATA):
    """Produce token-level sentence ids, tokens, POS tags and BILUO tags.

    Every text is processed once with the module-level ``posnlp`` pipeline and
    all four outputs are read from that single Doc. They therefore share one
    tokenization and stay aligned position by position, which the DataFrame
    built from them relies on.

    Args:
        DATA: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
            items.

    Returns:
        A tuple ``(sentences, tokens, text_pos, text_tags)``:
          * ``sentences`` - flat list with one sentence number per token;
            numbering starts at 1 and continues across documents.
          * ``tokens`` - one list of token texts per document.
          * ``text_pos`` - one list of coarse POS tags per document.
          * ``text_tags`` - one list of BILUO entity tags per document.

    Note:
        spaCy tags tokens of an entity whose character offsets do not fall on
        token boundaries with '-' (and emits a warning) instead of a BILUO tag.
    """
    sentences = []
    tokens = []
    text_pos = []
    text_tags = []
    sentence_number = 0
    for text, annotations in DATA:
        doc = posnlp(text)
        tokens.append([token.text for token in doc])
        text_pos.append([token.pos_ for token in doc])
        text_tags.append(offsets_to_biluo_tags(doc, annotations["entities"]))

        # Sentence numbers are continuous over the whole dataset, not per document.
        for sent in doc.sents:
            sentence_number += 1
            sentences.extend([sentence_number] * len(sent))

    return sentences, tokens, text_pos, text_tags

In [13]:
# convert list of list to list
def flatten(t):
    """Return a new list holding the items of every sub-list of ``t``, in order."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [14]:
## testing code
#test_text = [['LEF/1 in the dividing tumour cells beta-catenin/LEF', {'entities': [(22,28,'Disease')]}]]
#a = test_text[0][0]
#print(a[22:28])

In [15]:
#for text, annotations in test_text:
#    doc = nlp(text)
#    docpos = posnlp(text)
#    offsets = annotations["entities"]
#    tags = offsets_to_biluo_tags(doc, offsets)
#    pos = [token.pos_ for token in docpos]

In [16]:
# `sentences` is already flat (one sentence number per token); the other three
# are lists of per-document lists and are flattened in the following cells.
sentences, text_tokens, text_pos, text_tags = get_tags(DATA)

  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,


In [17]:
# One sentence number per token, so this must match the token, tag and POS counts below.
len(sentences)

75861

In [18]:
# Flatten the per-document token lists into one list and show its length.
tokens = flatten(text_tokens)
len(tokens)

75861

In [19]:
# Flatten the per-document BILUO tag lists into one list and show its length.
tags = flatten(text_tags)
len(tags)

75861

In [20]:
# Flatten the per-document POS tag lists into one list and show its length.
pos = flatten(text_pos)
len(pos)

75861

In [21]:
# Column names (and column order) of the token table written to CSV below.
columns = ['sentence#', 'token', 'pos', 'tag']

In [22]:
# Start from an empty frame that only fixes the column layout; rows are added column by column.
df = pd.DataFrame(columns = columns)

In [23]:
# First column filled: this sets the number of rows, so the later columns must have the same length.
df['sentence#'] = sentences

In [24]:
# Token text for each row.
df['token'] = tokens

In [25]:
# BILUO entity tag for each row (the column position comes from `columns`, not from assignment order).
df['tag'] = tags

In [26]:
# POS tag for each row.
df['pos'] = pos

In [27]:
#df.loc[df['sentence#']==11]

In [28]:
# Write the token table to disk with a header row; index=False leaves the row index out of the file.
df.to_csv('NCBITraining_tagged.csv', header=True, index=False)

In [29]:
# Read the exported file back in to check that it loads.
f = pd.read_csv('NCBITraining_tagged.csv')

In [30]:
# Preview the first rows of the re-loaded table.
f.head()

Unnamed: 0,sentence#,token,pos,tag
0,1,A,DET,O
1,1,common,ADJ,O
2,1,human,ADJ,O
3,1,skin,NOUN,B-DiseaseClass
4,1,tumour,NOUN,L-DiseaseClass
