In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [1]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['pos_tags'].values.tolist(), 
                                                           s['tag'].values.tolist())]
        self.grouped = self.data.groupby('Sent_ID').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [3]:
train = pd.read_pickle('train_df.pkl')
test = pd.read_pickle('test_df.pkl')

In [4]:
getter = SentenceGetter(train)
sentences = getter.sentences

In [8]:
train.groupby('Sent_ID').count()['Word'].sort_values(ascending=False)

Sent_ID
96020     3899
101264    3881
129375     574
96019      411
131946     410
101263     386
145363     369
168460     361
142445     343
168759     341
185352     339
17671      313
91001      307
94554      292
35965      284
32688      268
166917     262
69992      237
163729     233
126362     228
167493     228
159514     227
166422     220
150244     214
16124      204
984        204
78377      203
117350     202
17113      198
124379     197
          ... 
56221        1
95911        1
466          1
127061       1
157426       1
136075       1
97785        1
39192        1
46532        1
72159        1
564          1
96772        1
44481        1
131524       1
16678        1
90992        1
93691        1
16308        1
90993        1
101074       1
79395        1
92523        1
16052        1
100825       1
111418       1
90464        1
146718       1
23284        1
93833        1
94074        1
Name: Word, Length: 191282, dtype: int64

In [9]:
### Remove the top 2
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['pos_tags'].values.tolist(), 
                                                           s['tag'].values.tolist())]
        self.data_reduced = self.data[~self.data['Sent_ID'].isin([96020, 101264])]
        self.grouped = self.data_reduced.groupby('Sent_ID').apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [11]:
getter = SentenceGetter(train)
sentences = getter.sentences

In [12]:
len(sentences)

191280

In [13]:
maxlen = max([len(s) for s in sentences])

In [15]:
words = list(set(train["Word"].values))
words.append("ENDPAD")
n_words = len(words)
tags = list(set(train["tag"].values))
n_tags = len(tags)

In [16]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [18]:
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=maxlen, sequences=X, padding="post",value=n_words - 1)

In [22]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [25]:
input = Input(shape=(maxlen,))
model = Embedding(input_dim=n_words, output_dim=maxlen, input_length=maxlen)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer

In [26]:
model = Model(input, out)

In [27]:
def f1_score(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return 0

    # How many selected items are relevant?
    precision = c1 / c2

    # How many relevant items are selected?
    recall = c1 / c3

    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [28]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy", f1_score])

In [31]:
history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=1, validation_split=0.2, verbose=1)

  num_elements)


Train on 102525 samples, validate on 25632 samples
Epoch 1/1
  1440/102525 [..............................] - ETA: 5:39:45 - loss: 0.0832 - acc: 0.9986 - f1_score: nan

KeyboardInterrupt: 

In [33]:
train['tag'].value_counts(True)

O                0.978514
B-indications    0.011665
I-indications    0.009821
Name: tag, dtype: float64