In [1]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
plt.style.use("ggplot")
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="3";  

In [3]:
df_train = pd.read_csv("train.csv", usecols=["Sent_ID","Word","tag"])
df_train["Word"] = df_train.Word.fillna("NAN")

In [4]:
df_train.tail()

Unnamed: 0,Sent_ID,Word,tag
4543828,191282,translational,O
4543829,191282,or,O
4543830,191282,post-translational,O
4543831,191282,level,O
4543832,191282,.,O


In [5]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced',
                                             np.unique(df_train.tag.values),
                                             df_train.tag.values)

In [6]:
 np.unique(df_train.tag.values)

array(['B-indications', 'I-indications', 'O'], dtype=object)

In [7]:
dict_w = {'B-indications':28.57594853, 'I-indications': 33.94162334, 'O':0.34065246}

In [8]:
class_weight

array([28.57594853, 33.94162334,  0.34065246])

In [9]:
df_test = pd.read_csv("test.csv", usecols=["Sent_ID","Word"])
df_test["Word"] = df_test.Word.fillna("NAN")
df = pd.concat([df_train, df_test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
df_test.shape

(2994463, 2)

In [11]:
df.tail()

Unnamed: 0,Sent_ID,Word,tag
2994458,317122,/a,
2994459,317122,>,
2994460,317122,<,
2994461,317122,/p,
2994462,317122,>,


In [12]:
words = list(set(df["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

257203

In [13]:
tags = list(set(df_train["tag"].values))
n_tags = len(tags); n_tags

3

In [14]:
print(df_train['Sent_ID'].nunique(), df_train.Word.nunique(), df_train.tag.nunique(),  df_train.Word.count())

191282 184506 3 4543833


In [15]:
print(df_test['Sent_ID'].nunique(), df_test.Word.nunique(), df_test.Word.count())

125840 139892 2994463


In [16]:
print(df['Sent_ID'].nunique(), df.Word.nunique(), df.tag.nunique(),  df.Word.count())

317122 257202 3 7538296


In [17]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,t) for w,t in zip(s['Word'].values.tolist(),
                                                     s['tag'].values.tolist())]
        self.grouped = self.data.groupby(["Sent_ID",data.index // 100 * 100]).apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s 
        except:
            return None

In [18]:
getter_train = SentenceGetter(df_train)

sentences_train = getter_train.sentences

In [19]:
len(sentences_train)

234843

In [20]:
getter = SentenceGetter(df)

sentences = getter.sentences

In [21]:
max_len = max([len(s) for s in sentences])
print ('Maximum sequence length:', max_len)

Maximum sequence length: 100


In [22]:
len(sentences)

389383

In [23]:
sentences[23]

[('Finally', 'O'),
 (',', 'O'),
 ('we', 'O'),
 ('observed', 'O'),
 ('significant', 'O'),
 ('correlation', 'O'),
 ('between', 'O'),
 ('simultaneous', 'O'),
 ('LDF', 'O'),
 ('and', 'O'),
 ('LSCI', 'O'),
 ('measurements', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('PORH', 'O'),
 ('peak', 'O'),
 ('CVC', 'O'),
 ('(', 'O'),
 ('R=0.54', 'O'),
 (';', 'O'),
 ('p=0.001', 'O'),
 (')', 'O'),
 ('.', 'O')]

In [24]:
sentences_test = sentences[234843:]

In [25]:
len(sentences_test)

154540

In [26]:
tag2idx = {t: i for i, t in enumerate(tags)}
#tag2idx.pop(np.nan)
print(tag2idx)

{'O': 0, 'I-indications': 1, 'B-indications': 2}


In [27]:
X_train = [[w[0] for w in s] for s in sentences_train]

In [28]:
new_X = []
for seq in X_train:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("__PAD__")
    new_X.append(new_seq)
X_train = new_X

In [29]:
X_train[1]

['We',
 'have',
 'reviewed',
 'the',
 'distinctive',
 'features',
 'of',
 'excess',
 'weight',
 ',',
 'its',
 'causes',
 ',',
 'and',
 'related',
 'prevention',
 'and',
 'management',
 'efforts',
 ',',
 'as',
 'well',
 'as',
 'data',
 'gaps',
 'and',
 'recommendations',
 'for',
 'future',
 'research',
 'in',
 'low-',
 'and',
 'middle-income',
 'countries',
 '(',
 'LMICs',
 ')',
 '.',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 '__PAD__',
 

In [30]:
y_train = [[tag2idx[w[1]] for w in s] for s in sentences_train]

In [31]:
y_train[2]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [32]:
tag2idx["O"]

0

In [33]:
from keras.preprocessing.sequence import pad_sequences
y_train = pad_sequences(maxlen=max_len, sequences=y_train, padding="post", value=tag2idx["O"])

In [34]:
y_train[2]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [35]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.1, random_state=2018)

In [48]:
batch_size = 256

In [49]:
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

In [50]:
sess = tf.Session()
K.set_session(sess)

In [51]:
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [52]:
def ElmoEmbedding(x):
    return elmo_model(inputs={
                            "tokens": tf.squeeze(tf.cast(x, tf.string)),
                            "sequence_len": tf.constant(batch_size*[max_len])
                      },
                      signature="tokens",
                      as_dict=True)["elmo"]

In [53]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

In [54]:
input_text = Input(shape=(max_len,), dtype=tf.string)
embedding = Lambda(ElmoEmbedding, output_shape=(None, 1024))(input_text)
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0324 23:08:11.755050 139996224579328 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [55]:
model = Model(input_text, out)

In [56]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [57]:
y_tr2 = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)

In [59]:
X_tr2 = np.array(X_tr)

In [60]:
history = model.fit(X_tr2, y_tr2 , batch_size=batch_size, epochs=5, validation_split=0.2, verbose=1)

Train on 169086 samples, validate on 42272 samples
Epoch 1/5
   512/169086 [..............................] - ETA: 2:57:09 - loss: 0.0511 - acc: 0.9954

KeyboardInterrupt: 