In [None]:
import os

import pandas as pd
import numpy as np

# Location of the training CSV. The default is the original machine-specific
# path; set the NCBI_TRAIN_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get("NCBI_TRAIN_CSV", "C:/Users/Dell/Documents/ncbi_train_v2.csv")

# latin1 is kept from the original load call.
data = pd.read_csv(DATA_PATH, encoding="latin1")


In [None]:
# Forward-fill missing cells from the previous row.
# DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated in
# recent pandas releases.
data = data.ffill()


In [None]:
# Preview the first ten token rows.
data.head(10)


Unnamed: 0,Sentence,Word,Tag
0,Sentence:1,Selegiline,O
1,Sentence:1,-,O
2,Sentence:1,induced,O
3,Sentence:1,postural,B-Disease
4,Sentence:1,hypotension,E-Disease
5,Sentence:1,in,O
6,Sentence:1,Parkinson,B-Disease
7,Sentence:1,',I-Disease
8,Sentence:1,s,I-Disease
9,Sentence:1,disease,E-Disease


In [None]:
# Vocabulary in first-appearance order. Series.unique() keeps the order in
# which values occur, so the word -> index mapping built from this list is the
# same on every run (iterating a set of strings can change order between
# kernel sessions).
words = data["Word"].unique().tolist()
words.append("ENDPAD")
n_words = len(words)
n_words


9566

In [None]:
# Tag inventory in first-appearance order, so the tag -> index mapping built
# from this list is identical on every run (iterating a set of strings can
# change order between kernel sessions).
tags = data["Tag"].unique().tolist()
n_tags = len(tags)

tags

['O', 'E-Disease', 'I-Disease', 'S-Disease', 'B-Disease']

In [None]:
class SentenceGetter(object):
    """Group a token-per-row frame into sentences of (word, tag) pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide "Sentence", "Word" and "Tag" columns; rows sharing a
        "Sentence" value form one sentence.

    Attributes
    ----------
    grouped : pandas.Series
        Indexed by sentence id; each value is a list of (word, tag) tuples.
    sentences : list
        The values of `grouped` in groupby's key order. Keys are sorted as
        strings, so "Sentence:10" comes before "Sentence:2".
    n_sent : int
        Number of the sentence the next `get_next()` call will return.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every word of the group with its tag, preserving row order.
            return list(zip(s["Word"].values.tolist(), s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence by number, or None when there is none.

        Sentences are looked up under the key "Sentence:<n>", starting at 1.
        Only a missing key ends the iteration; any other error propagates
        instead of being silently turned into None.
        """
        key = "Sentence:{}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            return None
        # Advance only after a successful lookup.
        self.n_sent += 1
        return s


In [None]:
# Group the token rows into per-sentence lists of (word, tag) pairs.
getter = SentenceGetter(data)


In [None]:
# Fetch the next sentence from the getter (numbering starts at "Sentence:1").
sent = getter.get_next()


In [None]:
# Show the (word, tag) pairs of the fetched sentence.
print(sent)


[('Selegiline', 'O'), ('-', 'O'), ('induced', 'O'), ('postural', 'B-Disease'), ('hypotension', 'E-Disease'), ('in', 'O'), ('Parkinson', 'B-Disease'), ("'", 'I-Disease'), ('s', 'I-Disease'), ('disease', 'E-Disease'), (':', 'O'), ('a', 'O'), ('longitudinal', 'O'), ('study', 'O'), ('on', 'O'), ('the', 'O'), ('effects', 'O'), ('of', 'O'), ('drug', 'O'), ('withdrawal', 'O'), ('.', 'O'), ('.', 'O')]


In [None]:
# All sentences, each a list of (word, tag) tuples.
sentences = getter.sentences



In [None]:
# Sequence-length limit used when padding below.
max_len = 75
# Not referenced by the cells visible in this notebook.
max_len_char = 10

# Word indices start at 1 (0 is left free for padding); tag indices start at 0.
word2idx = {word: position for position, word in enumerate(words, start=1)}
tag2idx = dict(zip(tags, range(len(tags))))


In [None]:
from keras.preprocessing.sequence import pad_sequences
# Encode every sentence as the list of its word indices.
X = [[word2idx[token] for token, _ in sentence] for sentence in sentences]


Using TensorFlow backend.


In [None]:
# Bring every index sequence to length max_len, padding at the end with 0
# (an index that word2idx never assigns, since it starts at 1).
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)
X

array([[3672, 4507, 1854, ...,    0,    0,    0],
       [5273, 3440, 4074, ...,    0,    0,    0],
       [9419, 8681, 3560, ...,    0,    0,    0],
       ...,
       [5073, 9165, 9298, ...,    0,    0,    0],
       [5359, 7694, 3816, ...,    0,    0,    0],
       [2839, 5921, 7236, ...,    0,    0,    0]])

In [None]:
# Encode every sentence as the list of its tag indices.
y = [[tag2idx[tag] for _, tag in sentence] for sentence in sentences]


In [None]:
# Pad the tag sequences to max_len as well, filling padded positions with the
# index of the "O" tag.
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])


In [None]:
from keras.utils import to_categorical


In [None]:
# One-hot encode each padded tag sequence over the n_tags classes.
y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 10% of the sentences for evaluation. A fixed random_state makes the
# split itself repeatable across runs (model training is still unseeded).
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, random_state=42)


In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF


In [None]:
# Network: word indices -> embedding -> BiLSTM -> per-token dense -> CRF.
input = Input(shape=(max_len,))

# 20-dimensional word embeddings; input_dim leaves room for padding index 0.
embedded = Embedding(
    input_dim=n_words + 1,
    output_dim=20,
    input_length=max_len,
)(input)

# Bidirectional LSTM that emits one vector per token.
encoded = Bidirectional(
    LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)
)(embedded)

# Dense layer applied independently at every time step.
model = TimeDistributed(Dense(50, activation="relu"))(encoded)

# CRF output layer over the n_tags labels.
crf = CRF(n_tags)
out = crf(model)


In [None]:
# Wrap the input tensor and the CRF output into a Keras model.
model = Model(input, out)


In [None]:
# Train with RMSprop, using the loss and accuracy exposed by the CRF layer.
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [None]:
print(X_tr.shape)

# x_train / y_train are kept for inspection only; fit() below is called with
# X_tr and y_tr directly.
y_train = y_tr
x_train = X_tr.reshape(X_tr.shape[0], X_tr.shape[1], 1)
print(x_train.shape)

# 10% of the training split is used as validation data during fitting.
history = model.fit(
    X_tr,
    np.array(y_tr),
    batch_size=32,
    epochs=5,
    validation_split=0.1,
    verbose=1,
)



(3831, 75)
(3831, 75, 1)

Train on 3447 samples, validate on 384 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Training history as a DataFrame built from the history.history dict.
hist = pd.DataFrame(history.history)


In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [None]:
# Run the trained model on the held-out split.
test_pred = model.predict(X_te, verbose=1)





In [None]:
# Inverse of tag2idx: tag index -> tag string.
idx2tag = {index: tag for tag, index in tag2idx.items()}

def pred2label(pred, index_to_tag=None):
    """Convert per-token score vectors into tag strings.

    Parameters
    ----------
    pred : iterable of sequences
        One entry per sentence; each entry is a sequence of per-token score
        (or one-hot) vectors.
    index_to_tag : dict, optional
        Maps a class index to its tag string. Defaults to the module-level
        `idx2tag`, which keeps existing one-argument calls working.

    Returns
    -------
    list of list of str
        For every sentence, the tag of the highest-scoring class per token.
        Any "PAD" substring in a tag is replaced by "O".
    """
    if index_to_tag is None:
        # Looked up at call time, so the global is only needed when no
        # mapping is passed explicitly.
        index_to_tag = idx2tag
    return [
        [index_to_tag[np.argmax(scores)].replace("PAD", "O") for scores in sentence]
        for sentence in pred
    ]
    
# Decode both the model output and the one-hot ground truth into tag strings.
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)

In [None]:
# seqeval F1 over the decoded tag sequences of the held-out split.
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))


F1-score: 65.4%


In [None]:
# seqeval precision / recall / F1 report for the same sequences.
print(classification_report(test_labels, pred_labels))


           precision    recall  f1-score   support

  Disease       0.68      0.63      0.65       395

micro avg       0.68      0.63      0.65       395
macro avg       0.68      0.63      0.65       395



In [None]:
# Spot-check one training sentence: true vs. predicted tag for each token.
i = 2345
p = np.argmax(model.predict(np.array([X_tr[i]])), axis=-1)
true = np.argmax(y_tr[i], -1)

print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_tr[i], true, p[0]):
    if w == 0:
        # Index 0 is padding, not a word.
        continue
    # word2idx is offset by one, hence words[w-1].
    print("{:25}: {:5} {}".format(words[w-1], tags[t], tags[pred]))


Word           ||True ||Pred
Doxorubicin              : O     O
is                       : O     O
an                       : O     O
effective                : O     O
anticancer               : O     O
chemotherapeutic         : O     O
agent                    : O     O
known                    : O     O
to                       : O     O
cause                    : O     O
acute                    : O     O
and                      : O     O
chronic                  : O     B-Disease
cardiomyopathy           : S-Disease E-Disease
.                        : O     O
.                        : O     O
