https://github.com/asmitapoddar/Clinical-Named-Entity-Recognition-for-EHR

In [1]:
import numpy as np
import random
from keras.preprocessing.sequence import pad_sequences

def readfile(filename):
    '''
    read file
    return format :
    [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ]
    '''
    f = open(filename)
    sentences = []
    sentence = []
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
            if len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
            continue
        splits = line.split(' ')
        sentence.append([splits[0],splits[-1]])

    if len(sentence) >0:
        sentences.append(sentence)
        sentence = []
    return sentences

def getCasing(word, caseLookup):   
    casing = 'other'
    
    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1
    
    if (len(word)==0):
        word=" "    
    digitFraction = numDigits / float(len(word))
    
    if word.isdigit(): #Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower(): #All lower case
        casing = 'allLower'
    elif word.isupper(): #All upper case
        casing = 'allUpper'
    elif word[0].isupper(): #is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'
    
   
    return caseLookup[casing]
    

def createBatches(data):
    l = []
    for i in data:
        l.append(len(i[0]))
    l = set(l)
    batches = []
    batch_len = []
    z = 0
    for i in l:
        for batch in data:
            if len(batch[0]) == i:
                batches.append(batch)
                z += 1
        batch_len.append(z)
    return batches,batch_len

def createBatches(data):
    l = []
    for i in data:
        l.append(len(i[0]))
    l = set(l)
    batches = []
    batch_len = []
    z = 0
    for i in l:
        for batch in data:
            if len(batch[0]) == i:
                batches.append(batch)
                z += 1
        batch_len.append(z)
    return batches,batch_len

def createMatrices(sentences, word2Idx, label2Idx, case2Idx,char2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
        
    dataset = []
    
    wordCount = 0
    unknownWordCount = 0
    
    for sentence in sentences:
        wordIndices = []    
        caseIndices = []
        charIndices = []
        labelIndices = []
        
        for word,char,label in sentence:  
            wordCount += 1
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
            charIdx = []
            for x in char:
                if x=='>' or x=='<':
                    continue
                charIdx.append(char2Idx[x])
            #Get the label and map to int            
            wordIndices.append(wordIdx)
            caseIndices.append(getCasing(word, case2Idx))
            charIndices.append(charIdx)
            labelIndices.append(label2Idx[label])
           
        dataset.append([wordIndices, caseIndices, charIndices, labelIndices]) 
        
    return dataset

def iterate_minibatches(dataset,batch_len): 
    start = 0
    for i in batch_len:
        tokens = []
        caseing = []
        char = []
        labels = []
        data = dataset[start:i]
        start = i
        for dt in data:
            t,c,ch,l = dt
            l = np.expand_dims(l,-1)
            tokens.append(t)
            caseing.append(c)
            char.append(ch)
            labels.append(l)
        yield np.asarray(labels),np.asarray(tokens),np.asarray(caseing),np.asarray(char)

def addCharInformatioin(Sentences):
    for i,sentence in enumerate(Sentences):
        for j,data in enumerate(sentence):
            chars = [c for c in data[0]]
            Sentences[i][j] = [data[0],chars,data[1]]
    return Sentences

def padding(Sentences):
    maxlen = 52
    for sentence in Sentences:
        char = sentence[2]
        for x in char:
            maxlen = max(maxlen,len(x))
    for i,sentence in enumerate(Sentences):
        Sentences[i][2] = pad_sequences(Sentences[i][2],52,padding='post')
    return Sentences

In [3]:
!pip install validation

Collecting validation
  Downloading validation-0.5.0.zip (33 kB)
Building wheels for collected packages: validation
  Building wheel for validation (setup.py) ... [?25ldone
[?25h  Created wheel for validation: filename=validation-0.5.0-py3-none-any.whl size=30926 sha256=9ae4f80aaded71eb16af51d30b892a4a64b8549676f6304f2bcfa3aea8f23d13
  Stored in directory: /Users/sdeshpande/Library/Caches/pip/wheels/69/a6/4c/4f4332e9fb272c68b4f64b31a569b2e3aa3c7ded49823bafcc
Successfully built validation
Installing collected packages: validation
Successfully installed validation-0.5.0


In [5]:
def compute_f1(predictions, correct, idx2Label): 
    label_pred = []    
    for sentence in predictions:
        label_pred.append([idx2Label[element] for element in sentence])
        
    label_correct = []    
    for sentence in correct:
        label_correct.append([idx2Label[element] for element in sentence])
            
    
    #print label_pred
    #print label_correct
    
    prec = compute_precision(label_pred, label_correct)
    rec = compute_precision(label_correct, label_pred)
    
    f1 = 0
    if (rec+prec) > 0:
        f1 = 2.0 * prec * rec / (prec + rec);
        
    return prec, rec, f1

def compute_precision(guessed_sentences, correct_sentences):
    assert(len(guessed_sentences) == len(correct_sentences))
    correctCount = 0
    count = 0
    
    
    for sentenceIdx in range(len(guessed_sentences)):
        guessed = guessed_sentences[sentenceIdx]
        correct = correct_sentences[sentenceIdx]
        assert(len(guessed) == len(correct))
        idx = 0
        while idx < len(guessed):
            if guessed[idx][0] == 'B': #A new chunk starts
                count += 1
                
                if guessed[idx] == correct[idx]:
                    idx += 1
                    correctlyFound = True
                    
                    while idx < len(guessed) and guessed[idx][0] == 'I': #Scan until it no longer starts with I
                        if guessed[idx] != correct[idx]:
                            correctlyFound = False
                        
                        idx += 1
                    
                    if idx < len(guessed):
                        if correct[idx][0] == 'I': #The chunk in correct was longer
                            correctlyFound = False
                        
                    
                    if correctlyFound:
                        correctCount += 1
                else:
                    idx += 1
            else:  
                idx += 1
    
    precision = 0
    if count > 0:    
        precision = float(correctCount) / count
        
    return precision

In [17]:
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize

class Parser:

    def __init__(self):
        # ::Hard coded char lookup ::
        self.char2Idx = {"PADDING":0, "UNKNOWN":1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
            self.char2Idx[c] = len(self.char2Idx)
        # :: Hard coded case lookup ::
        self.case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}

    def load_models(self, loc=None):
        if not loc:
            loc = os.path.join(os.path.expanduser('~'), '.ner_model')
        self.model = load_model(os.path.join(loc,"model.h5"))
        # loading word2Idx
        self.word2Idx = np.load(os.path.join(loc,"word2Idx.npy")).item()
        # loading idx2Label
        self.idx2Label = np.load(os.path.join(loc,"idx2Label.npy")).item()

    def getCasing(self,word, caseLookup):   
        casing = 'other'
        
        numDigits = 0
        for char in word:
            if char.isdigit():
                numDigits += 1
                
        digitFraction = numDigits / float(len(word))
        
        if word.isdigit(): #Is a digit
            casing = 'numeric'
        elif digitFraction > 0.5:
            casing = 'mainly_numeric'
        elif word.islower(): #All lower case
            casing = 'allLower'
        elif word.isupper(): #All upper case
            casing = 'allUpper'
        elif word[0].isupper(): #is a title, initial char upper, then all lower
            casing = 'initialUpper'
        elif numDigits > 0:
            casing = 'contains_digit'  
        return caseLookup[casing]

    def createTensor(self,sentence, word2Idx,case2Idx,char2Idx):
        unknownIdx = word2Idx['UNKNOWN_TOKEN']
    
        wordIndices = []    
        caseIndices = []
        charIndices = []
            
        for word,char in sentence:  
            word = str(word)
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]                 
            else:
                wordIdx = unknownIdx
            charIdx = []
            for x in char:
                if x in char2Idx.keys():
                    charIdx.append(char2Idx[x])
                else:
                    charIdx.append(char2Idx['UNKNOWN'])   
            wordIndices.append(wordIdx)
            caseIndices.append(self.getCasing(word, case2Idx))
            charIndices.append(charIdx)
            
        return [wordIndices, caseIndices, charIndices]

    def addCharInformation(self, sentence):
        return [[word, list(str(word))] for word in sentence]

    def padding(self,Sentence):
        Sentence[2] = pad_sequences(Sentence[2],52,padding='post')
        return Sentence

    def predict(self,Sentence):
        Sentence = words =  word_tokenize(Sentence)
        Sentence = self.addCharInformation(Sentence)
        Sentence = self.padding(self.createTensor(Sentence,self.word2Idx,self.case2Idx,self.char2Idx))
        tokens, casing,char = Sentence
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = self.model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1)
        pred = [self.idx2Label[x].strip() for x in pred]
        return list(zip(words,pred))

In [7]:
import numpy as np 
from keras.models import Model
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate
from keras.utils import Progbar
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import RandomUniform

epochs = 10
def tag_dataset(dataset):
    correctLabels = []
    predLabels = []
    b = Progbar(len(dataset))
    for i,data in enumerate(dataset):    
        tokens, casing,char, labels = data
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1) #Predict the classes            
        correctLabels.append(labels)
        predLabels.append(pred)
        b.update(i)
    b.update(i+1)
    return predLabels, correctLabels

In [13]:
trainSentences = readfile("/Users/sdeshpande/Desktop/bioinformatices/Clinical-Named-Entity-Recognition-for-EHR/NER-with-BidirectionalLSTM-CNN/data/train.txt")
'''
filehandle = open("data/train_download.txt", 'r')
tset = []
sentences = []
i = 0
while True:
    # read a single line
    i = i + 1
    line = filehandle.readline()
    if not line:
        break
    line_split = line.split()
    #print(line_split[0], line_split[3])
    if len(line_split)>=4:
        tset.append([line_split[0], line_split[3]])
    if i%10==0:
        sentences.append(tset)
        tset= []
print("Asmita\n", sentences)
filehandle.close()
trainSentences = sentences
'''
#trainSentences = readfile("data/train_download.txt")
print(trainSentences)
devSentences = readfile("/Users/sdeshpande/Desktop/bioinformatices/Clinical-Named-Entity-Recognition-for-EHR/NER-with-BidirectionalLSTM-CNN/data/valid.txt")
print(devSentences)
testSentences = readfile("/Users/sdeshpande/Desktop/bioinformatices/Clinical-Named-Entity-Recognition-for-EHR/NER-with-BidirectionalLSTM-CNN/data/test.txt")
print(testSentences)

[[['it', 'O\n'], ['was', 'O\n'], ['later', 'O\n'], ['believed', 'O\n'], ['that', 'O\n'], ['this', 'O\n'], ['episode', 'treatment\n'], ['was', 'O\n'], ['similar', 'O\n'], ['to', 'O\n'], ['other', 'O\n'], ['***384***s', 'O\n'], ['of', 'O\n'], ['shortness', 'problem\n'], ['of', 'O\n'], ['breath', 'O\n'], ['that', 'O\n'], ['you', 'O\n'], ['have', 'O\n'], ['experienced', 'O\n'], ['.', 'O\n'], ['those', 'O\n'], ['experiences', 'O\n'], ['were', 'O\n'], ['due', 'O\n'], ['to', 'O\n'], ['your', 'O\n'], ['heart', 'O\n'], ['being', 'O\n'], ['unable', 'O\n'], ['to', 'O\n'], ['beat', 'O\n'], ['strongly', 'O\n'], ['enough', 'O\n'], ['through', 'O\n'], ['your', 'O\n'], ['narrowed', 'O\n'], ['valve', 'O\n'], ['and', 'O\n'], ['your', 'O\n'], ['body', 'O\n'], ['having', 'O\n'], ['an', 'O\n'], ['excess', 'O\n'], ['of', 'O\n'], ['***1082***,', 'O\n'], ['which', 'O\n'], ['backed', 'O\n'], ['up', 'O\n'], ['into', 'O\n'], ['your', 'O\n'], ['***865***s', 'O\n'], ['.', 'O\n'], ['as', 'O\n'], ['a', 'O\n'], ['res

In [14]:
trainSentences = addCharInformatioin(trainSentences)
devSentences = addCharInformatioin(devSentences)
testSentences = addCharInformatioin(testSentences)

print(trainSentences)
print(devSentences)
print(testSentences)

[[['it', ['i', 't'], 'O\n'], ['was', ['w', 'a', 's'], 'O\n'], ['later', ['l', 'a', 't', 'e', 'r'], 'O\n'], ['believed', ['b', 'e', 'l', 'i', 'e', 'v', 'e', 'd'], 'O\n'], ['that', ['t', 'h', 'a', 't'], 'O\n'], ['this', ['t', 'h', 'i', 's'], 'O\n'], ['episode', ['e', 'p', 'i', 's', 'o', 'd', 'e'], 'treatment\n'], ['was', ['w', 'a', 's'], 'O\n'], ['similar', ['s', 'i', 'm', 'i', 'l', 'a', 'r'], 'O\n'], ['to', ['t', 'o'], 'O\n'], ['other', ['o', 't', 'h', 'e', 'r'], 'O\n'], ['***384***s', ['*', '*', '*', '3', '8', '4', '*', '*', '*', 's'], 'O\n'], ['of', ['o', 'f'], 'O\n'], ['shortness', ['s', 'h', 'o', 'r', 't', 'n', 'e', 's', 's'], 'problem\n'], ['of', ['o', 'f'], 'O\n'], ['breath', ['b', 'r', 'e', 'a', 't', 'h'], 'O\n'], ['that', ['t', 'h', 'a', 't'], 'O\n'], ['you', ['y', 'o', 'u'], 'O\n'], ['have', ['h', 'a', 'v', 'e'], 'O\n'], ['experienced', ['e', 'x', 'p', 'e', 'r', 'i', 'e', 'n', 'c', 'e', 'd'], 'O\n'], ['.', ['.'], 'O\n'], ['those', ['t', 'h', 'o', 's', 'e'], 'O\n'], ['experience

In [16]:
labelSet = set()
words = {}

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for token,char,label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

# :: Create a mapping for the labels ::
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)
print(label2Idx)

# :: Hard coded case lookup ::
case2Idx = {'numeric': 0, 'allLower':1, 'allUpper':2, 'initialUpper':3, 'other':4, 'mainly_numeric':5, 'contains_digit': 6, 'PADDING_TOKEN':7}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')


# :: Read in word embeddings ::
word2Idx = {}
wordEmbeddings = []

{'test\n': 0, 'problem\n': 1, 'B-NP\n': 2, 'O\n': 3, 'treatment\n': 4}


In [18]:
fEmbeddings = open("/Users/sdeshpande/Desktop/bioinformatices/Clinical-Named-Entity-Recognition-for-EHR/NER-with-BidirectionalLSTM-CNN/embeddings/glove.6B.100d.txt", encoding="utf-8")

In [20]:
for line in fEmbeddings:
    split = line.strip().split(" ")
    word = split[0]
    
    if len(word2Idx) == 0: #Add padding+unknown
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector vor 'PADDING' word
        wordEmbeddings.append(vector)
        
        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]] = len(word2Idx)
        
wordEmbeddings = np.array(wordEmbeddings)

char2Idx = {"PADDING":0, "UNKNOWN":1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)

train_set = padding(createMatrices(trainSentences,word2Idx,  label2Idx, case2Idx,char2Idx))
dev_set = padding(createMatrices(devSentences,word2Idx, label2Idx, case2Idx,char2Idx))
test_set = padding(createMatrices(testSentences, word2Idx, label2Idx, case2Idx,char2Idx))

idx2Label = {v: k for k, v in label2Idx.items()}
np.save("/Users/sdeshpande/Desktop/bioinformatices/Clinical-Named-Entity-Recognition-for-EHR/NER-with-BidirectionalLSTM-CNN/models/idx2Label.npy",idx2Label)
np.save("/Users/sdeshpande/Desktop/bioinformatices/Clinical-Named-Entity-Recognition-for-EHR/NER-with-BidirectionalLSTM-CNN/models/word2Idx.npy",word2Idx)

train_batch,train_batch_len = createBatches(train_set)
dev_batch,dev_batch_len = createBatches(dev_set)
test_batch,test_batch_len = createBatches(test_set)

In [21]:
words_input = Input(shape=(None,),dtype='int32',name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],  weights=[wordEmbeddings], trainable=False)(words_input)
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0], weights=[caseEmbeddings], trainable=False)(casing_input)
character_input=Input(shape=(None,52,),name='char_input')
embed_char_out=TimeDistributed(Embedding(len(char2Idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
dropout= Dropout(0.5)(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(52))(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)
output = concatenate([words, casing,char])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)
model = Model(inputs=[words_input, casing_input,character_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()
# plot_model(model, to_file='model.png')


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         [(None, None, 52)]   0                                            
__________________________________________________________________________________________________
char_embedding (TimeDistributed (None, None, 52, 30) 2850        char_input[0][0]                 
__________________________________________________________________________________________________
dropout (Dropout)               (None, None, 52, 30) 0           char_embedding[0][0]             
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, None, 52, 30) 2730        dropout[0][0]                    
_______________________________________________________________________________________

In [22]:
for epoch in range(epochs):    
    print("Epoch %d/%d"%(epoch,epochs))
    a = Progbar(len(train_batch_len))
    for i,batch in enumerate(iterate_minibatches(train_batch,train_batch_len)):
        labels, tokens, casing,char = batch       
        model.train_on_batch([tokens, casing,char], labels)
        a.update(i)
    a.update(i+1)
    print(' ')

model.save("/Users/sdeshpande/Desktop/bioinformatices/Clinical-Named-Entity-Recognition-for-EHR/NER-with-BidirectionalLSTM-CNN/models/model.h5")

Epoch 0/10
 
Epoch 1/10
 
Epoch 2/10
 
Epoch 3/10
 
Epoch 4/10
 
Epoch 5/10
 
Epoch 6/10
 
Epoch 7/10
 
Epoch 8/10
 
Epoch 9/10
 


In [27]:
#   Performance on dev dataset        
predLabels, correctLabels = tag_dataset(dev_batch)        
pre_dev, rec_dev, f1_dev = compute_f1(predLabels, correctLabels, idx2Label)
print("Dev-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_dev, rec_dev, f1_dev))

Dev-Data: Prec: 0.000, Rec: 0.000, F1: 0.000


In [28]:
#   Performance on test dataset       
predLabels, correctLabels = tag_dataset(test_batch)        
pre_test, rec_test, f1_test= compute_f1(predLabels, correctLabels, idx2Label)
print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.3f" % (pre_test, rec_test, f1_test))

Test-Data: Prec: 0.000, Rec: 0.000, F1: 0.000
