# Medical named entity recognition

In [137]:
def read_data(file_path):
    tokens = []
    tags = []
    
    tweet_tokens = []
    tweet_tags = []
    for line in open(file_path, encoding='utf-8'):
        line = line.strip()
        if not line:
            if tweet_tokens:
                tokens.append(tweet_tokens)
                tags.append(tweet_tags)
            tweet_tokens = []
            tweet_tags = []
        else:
            token, tag = line.split()
            tweet_tokens.append(token)
            tweet_tags.append(tag)
            
    return tokens, tags

In [138]:
import pandas as pd
import numpy as np
df = pd.read_csv('/home/sneha/Documents/bioNER/bioNER.csv', header = None)
df[0].replace('  ', np.nan, inplace=True)
df= df.dropna(subset=[0])
train_tokens, train_tags = df[0], df[1]
df = pd.read_csv('/home/sneha/Documents/bioNER/bioTEST.csv', header = None)
df[0].replace('  ', np.nan, inplace=True)
df= df.dropna(subset=[0])
test_tokens, test_tags = df[0], df[1]

In [139]:
df.head()

Unnamed: 0,0,1
0,Fr,Outside
1,MyKembangSepatu,Outside
2,Common,Outside
3,asthma,B_Disease
4,steroids,B_Drug


In [140]:
(train_tokens, train_tags)

(0                 App
 1         facilitates
 2               early
 3           detection
 4           treatment
 5                  of
 6                COPD
 7        exacerbation
 8            symptoms
 9                   A
 10            digital
 11             health
 12        application
 13                for
 14                rep
 15               http
 16                  t
 17                 co
 18         xoHWYCh0Ty
 19             Saving
 20              lives
 21           creating
 22         difference
 23               thru
 24           critical
 25               care
 26                  A
 27               case
 28                 of
 29               COPD
              ...     
 23831          health
 23832         hazards
 23833              of
 23834          global
 23835         warming
 23836            http
 23837               t
 23838              co
 23839      UxoKG0dXyB
 23840          asthma
 23842               A
 23843        Milpitas
 23844     

In [141]:
test_tokens, test_tags

(0                     Fr
 1        MyKembangSepatu
 2                 Common
 3                 asthma
 4               steroids
 5                 linked
 6                     to
 7                   side
 8                effects
 9                     in
 10               adrenal
 11                glands
 12                 After
 13              stopping
 14              steroids
 15                     c
 16                  http
 17                     t
 18                    co
 19            QmBrVhEzkm
 21             Nutrition
 22             treatment
 23                    in
 24          hospitalized
 25              patients
 26                  with
 27                  COPD
 28              improved
 29                health
 30              outcomes
               ...       
 10972                for
 10973            chronic
 10974              cough
 10975            related
 10976          laryngeal
 10977         sensations
 10978               http
 10979      

In [142]:
from collections import defaultdict

In [143]:
def build_dict(tokens_or_tags, special_tokens):
    """
        tokens_or_tags: a list of lists of tokens or tags
        special_tokens: some special tokens
    """
    # Create a dictionary with default value 0
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []
    
    # Create mappings from tokens to indices and vice versa
    # Add special tokens to dictionaries
    # The first special token must have index 0
    
    
    idx = 0
    for token in special_tokens:
        idx2tok.append(token)
        tok2idx[token] = idx
        idx += 1
    
    for token in tokens_or_tags:
        if token not in tok2idx:
            idx2tok.append(token)
            tok2idx[token] = idx
            idx += 1
    
    return tok2idx, idx2tok

In [144]:
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']

# Create dictionaries 
token2idx, idx2token = build_dict(train_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags, special_tags)

In [145]:
print(tag2idx)

defaultdict(<function build_dict.<locals>.<lambda> at 0x7f88d948fa60>, {'O': 0, 'Outside': 1, 'B_Disease': 2, 'B_Symptom-or-Side-Effect': 3, 'I_Disease': 4, 'I_Symptom-or-Side-Effect': 5, 'B_Drug': 6, 'I_Drug': 7})


The next additional functions will help you to create the mapping between tokens and ids for a sentence. 

In [146]:
print(idx2tag)

['O', 'Outside', 'B_Disease', 'B_Symptom-or-Side-Effect', 'I_Disease', 'I_Symptom-or-Side-Effect', 'B_Drug', 'I_Drug']


In [147]:
print(len(idx2tag))

8


In [148]:
def words2idxs(tokens_list):
    return [token2idx[word] for word in tokens_list]

def tags2idxs(tags_list):
    return [tag2idx[tag] for tag in tags_list]

def idxs2words(idxs):
    return [idx2token[idx] for idx in idxs]

def idxs2tags(idxs):
    return [idx2tag[idx] for idx in idxs]

In [149]:
words2idxs(train_tokens)

[2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 11,
 28,
 7,
 8,
 29,
 30,
 31,
 17,
 18,
 19,
 32,
 33,
 34,
 29,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 7,
 43,
 44,
 45,
 46,
 47,
 41,
 48,
 49,
 50,
 51,
 7,
 52,
 34,
 53,
 54,
 55,
 56,
 57,
 17,
 18,
 19,
 58,
 59,
 60,
 61,
 62,
 52,
 63,
 64,
 65,
 66,
 51,
 7,
 34,
 67,
 68,
 17,
 18,
 19,
 69,
 70,
 71,
 72,
 73,
 64,
 74,
 75,
 76,
 77,
 34,
 51,
 67,
 68,
 17,
 18,
 19,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 34,
 85,
 86,
 87,
 88,
 89,
 90,
 41,
 91,
 92,
 93,
 17,
 18,
 19,
 94,
 95,
 96,
 97,
 8,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 17,
 18,
 19,
 106,
 107,
 108,
 109,
 88,
 89,
 110,
 111,
 112,
 113,
 114,
 115,
 8,
 116,
 117,
 118,
 17,
 18,
 19,
 119,
 120,
 121,
 122,
 112,
 113,
 123,
 124,
 7,
 34,
 41,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 34,
 142,
 143,
 144,
 145

In [150]:
def batches_generator(batch_size, tokens, tags,
                      shuffle=True, allow_smaller_last_batch=True):
    """Generates padded batches of tokens and tags."""
    
    n_samples = len(tokens)
    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        order = np.arange(n_samples)

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min((k + 1) * batch_size, n_samples)
        current_batch_size = batch_end - batch_start
        x_list = []
        y_list = []
        max_len_token = 0
        for idx in order[batch_start: batch_end]:
            x_list.append(words2idxs(tokens[idx]))
            y_list.append(tags2idxs(tags[idx]))
            max_len_token = max(max_len_token, len(tags[idx]))
        print(x_list)
            
        # Fill in the data into numpy nd-arrays filled with padding indices.
        x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['<PAD>']
        y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']
        lengths = np.zeros(current_batch_size, dtype=np.int32)
        for n in range(current_batch_size):
            utt_len = len(x_list[n])
            print(utt_len)
            x[n, :utt_len] = x_list[n]
            print(x[n, :utt_len])
            lengths[n] = utt_len
            print(lengths[n])
            y[n, :utt_len] = y_list[n]
        yield x, y, lengths

In [151]:
import tensorflow as tf
import numpy as np

In [152]:
class BiLSTMModel():
    pass

Placeholders:
 - *input_batch* — sequences of words (the shape equals to [batch_size, sequence_len]);
 - *ground_truth_tags* — sequences of tags (the shape equals to [batch_size, sequence_len]);
 - *lengths* — lengths of not padded sequences (the shape equals to [batch_size]);
 - *dropout_ph* — dropout keep probability; this placeholder has a predefined value 1;
 - *learning_rate_ph* — learning rate; we need this placeholder because we want to change the value during training.

In [153]:
def declare_placeholders(self):
    """Specifies placeholders for the model."""

    # Placeholders for input and ground truth output.
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch') 
    self.ground_truth_tags = tf.placeholder(dtype=tf.int32, shape=[None, None], name='ground_truth_tags')
  
    # Placeholder for lengths of the sequences.
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths')
    
    # Placeholder for a dropout keep probability. If we don't feed
    # a value for this placeholder, it will be equal to 1.0.
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    
    # Placeholder for a learning rate (tf.float32).
    self.learning_rate_ph = tf.placeholder_with_default(1e4, shape=[])

In [154]:
BiLSTMModel.__declare_placeholders = classmethod(declare_placeholders)

In [155]:
def build_layers(self, vocabulary_size, embedding_dim, n_hidden_rnn, n_tags):
    """Specifies bi-LSTM architecture and computes logits for inputs."""
    
    # Create embedding variable (tf.Variable) with dtype tf.float32
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_matrix_variable = tf.Variable(initial_embedding_matrix, name='embeddings_matrix', dtype=tf.float32)
    
    # Create RNN cells (for example, tf.nn.rnn_cell.BasicLSTMCell) with n_hidden_rnn number of units 
    # and dropout (tf.nn.rnn_cell.DropoutWrapper), initializing all *_keep_prob with dropout placeholder.
    forward_cell = tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.BasicLSTMCell(num_units=n_hidden_rnn, forget_bias=3.0),
        input_keep_prob=self.dropout_ph,
        output_keep_prob=self.dropout_ph,
        state_keep_prob=self.dropout_ph
    )
    backward_cell = tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.BasicLSTMCell(num_units=n_hidden_rnn, forget_bias=3.0),
        input_keep_prob=self.dropout_ph,
        output_keep_prob=self.dropout_ph,
        state_keep_prob=self.dropout_ph
    )

    # Look up embeddings for self.input_batch (tf.nn.embedding_lookup).
    # Shape: [batch_size, sequence_len, embedding_dim].
    embeddings = tf.nn.embedding_lookup(embedding_matrix_variable, self.input_batch)
    
    # Pass them through Bidirectional Dynamic RNN (tf.nn.bidirectional_dynamic_rnn).
    # Shape: [batch_size, sequence_len, 2 * n_hidden_rnn]. 
    # Also don't forget to initialize sequence_length as self.lengths and dtype as tf.float32.
    (rnn_output_fw, rnn_output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw= forward_cell, cell_bw= backward_cell,
        dtype=tf.float32,
        inputs=embeddings,
        sequence_length=self.lengths
    )
    rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)

    # Dense layer on top.
    # Shape: [batch_size, sequence_len, n_tags].   
    self.logits = tf.layers.dense(rnn_output, n_tags, activation=None)

In [156]:
BiLSTMModel.__build_layers = classmethod(build_layers)

In [157]:
def compute_predictions(self):
    """Transforms logits to probabilities and finds the most probable tags."""
    
    # Create softmax (tf.nn.softmax) function
    softmax_output = tf.nn.softmax(self.logits)
    
    # Use argmax (tf.argmax) to get the most probable tags
    # Don't forget to set axis=-1
    # otherwise argmax will be calculated in a wrong way
    self.predictions = tf.argmax(softmax_output, axis=-1)

In [158]:
BiLSTMModel.__compute_predictions = classmethod(compute_predictions)

In [159]:
def compute_loss(self, n_tags, PAD_index):
    """Computes masked cross-entopy loss with logits."""
    
    # Create cross entropy function function (tf.nn.softmax_cross_entropy_with_logits)
    ground_truth_tags_one_hot = tf.one_hot(self.ground_truth_tags, n_tags)
    loss_tensor = tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_tags_one_hot, logits=self.logits)
    
    # Create loss function which doesn't operate with <PAD> tokens (tf.reduce_mean)
    mask = tf.cast(tf.not_equal(loss_tensor, PAD_index), tf.float32)
    self.loss =  tf.reduce_mean(tf.reduce_sum(tf.multiply(loss_tensor, mask), axis=-1) / tf.reduce_sum(mask, axis=-1))


In [160]:
BiLSTMModel.__compute_loss = classmethod(compute_loss)

In [161]:
def perform_optimization(self):
    """Specifies the optimizer and train_op for the model."""
    
    # Create an optimizer (tf.train.AdamOptimizer)
    self.optimizer = tf.train.AdamOptimizer(self.learning_rate_ph)
    self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
    
    # Gradient clipping (tf.clip_by_norm) for self.grads_and_vars
    # Pay attention that you need to apply this operation only for gradients 
    # because self.grads_and_vars contains also variables.
    # list comprehension might be useful in this case.
    clip_norm = tf.cast(1.0, tf.float32)
    self.grads_and_vars = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in self.grads_and_vars]
    
    self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

In [162]:
BiLSTMModel.__perform_optimization = classmethod(perform_optimization)

In [163]:
def init_model(self, vocabulary_size, n_tags, embedding_dim, n_hidden_rnn, PAD_index):
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden_rnn, n_tags)
    self.__compute_predictions()
    self.__compute_loss(n_tags, PAD_index)
    self.__perform_optimization()

In [164]:
BiLSTMModel.__init__ = classmethod(init_model)

## Train the network and predict tags

In [165]:
def train_on_batch(self, session, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability):
    feed_dict = {self.input_batch: x_batch,
                 self.ground_truth_tags: y_batch,
                 self.learning_rate_ph: learning_rate,
                 self.dropout_ph: dropout_keep_probability,
                 self.lengths: lengths}
    
    session.run(self.train_op, feed_dict=feed_dict)

In [166]:
BiLSTMModel.train_on_batch = classmethod(train_on_batch)

In [167]:
def predict_for_batch(self, session, x_batch, lengths):
    
    predictions = session.run(self.predictions, feed_dict={self.input_batch:x_batch, self.lengths:lengths})
    return predictions

In [168]:
BiLSTMModel.predict_for_batch = classmethod(predict_for_batch)

In [169]:
from evaluation import precision_recall_f1

In [170]:
def predict_tags(model, session, token_idxs_batch, lengths):
    """Performs predictions and transforms indices to tokens and tags."""
    
    tag_idxs_batch = model.predict_for_batch(session, token_idxs_batch, lengths)
    
    tags_batch, tokens_batch = [], []
    for tag_idxs, token_idxs in zip(tag_idxs_batch, token_idxs_batch):
        tags, tokens = [], []
        for tag_idx, token_idx in zip(tag_idxs, token_idxs):
            tags.append(idx2tag[tag_idx])
            tokens.append(idx2token[token_idx])
        tags_batch.append(tags)
        tokens_batch.append(tokens)
    return tags_batch, tokens_batch
    
    
def eval_conll(model, session, tokens, tags, short_report=True):
    """Computes NER quality measures using CONLL shared task script."""
    
    y_true, y_pred = [], []
    for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):
        tags_batch, tokens_batch = predict_tags(model, session, x_batch, lengths)
        if len(x_batch[0]) != len(tags_batch[0]):
            raise Exception("Incorrect length of prediction for the input, "
                            "expected length: %i, got: %i" % (len(x_batch[0]), len(tags_batch[0])))
        predicted_tags = []
        ground_truth_tags = []
        for gt_tag_idx, pred_tag, token in zip(y_batch[0], tags_batch[0], tokens_batch[0]): 
            if token != '<PAD>':
                ground_truth_tags.append(idx2tag[gt_tag_idx])
                predicted_tags.append(pred_tag)

        # We extend every prediction and ground truth sequence with 'O' tag
        # to indicate a possible end of entity.
        y_true.extend(ground_truth_tags + ['O'])
        y_pred.extend(predicted_tags + ['O'])
        
    results = precision_recall_f1(y_true, y_pred, print_results=True, short_report=short_report)
    return results

In [171]:
tf.reset_default_graph()

model = BiLSTMModel(20505, 21, 200, 200, token2idx['<PAD>'])

batch_size = 32
n_epochs = 25
learning_rate = 0.02
learning_rate_decay = 1.4
dropout_keep_probability = 0.5

In [172]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

print('Start training... \n')
for epoch in range(n_epochs):
    # For each epoch evaluate the model on train data
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)
    print('Train data evaluation:')
    eval_conll(model, sess, train_tokens, train_tags, short_report=True)
    
    
    # Train the model
    for x_batch, y_batch, lengths in batches_generator(batch_size, train_tokens, train_tags):
        model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_probability)
        
    # Decaying the learning rate
    learning_rate = learning_rate / learning_rate_decay
    
print('...training finished.')

Start training... 

-------------------- Epoch 1 of 25 --------------------
Train data evaluation:
[[213, 403, 18, 386, 213]]
5
[213 403  18 386 213]
5


ValueError: cannot copy sequence with size 7 to array axis with dimension 5

In [None]:
print('-' * 20 + ' Train set quality: ' + '-' * 20)
train_results = eval_conll(model, sess, train_tokens, train_tags, short_report=False)

print('-' * 20 + ' Test set quality: ' + '-' * 20)
test_results = eval_conll(model, sess, test_tokens, test_tags, short_report=False)