Next: https://github.com/arunarn2/HierarchicalAttentionNetworks/blob/master/HierarchicalAttn.py

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt
# from IPython import display

In [54]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score as scikit_f1_score

In [5]:
import tensorflow as tf
import tensorflow_hub as hub

In [6]:
from keras import backend as K

from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Lambda
from keras.regularizers import l2

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


#### Custom Functions

In [7]:
from src.callbacks import PlotCurves
from src.custom_functions import f1_macro, f1_micro 
from src.load_data import load_data

### Load Data

In [8]:
train_data, valid_data, test_data, metadata = load_data()

### Prepare data

In [20]:
ww = 3
n_tags = 2

In [22]:
def get_input(data_, ww, n_tags, is_test=False):
    
    padding_sent = {
        'sentence': 'ENDPAD',
        'label': 0
    }

    X = []
    y = []

    for article in data_:

        for si, sentence in enumerate(article['sentences']):

            prev_ = article['sentences'][si-1] if si-1 >= 0 else padding_sent
            next_ = article['sentences'][si+1] if si+1 < len(article['sentences']) else padding_sent

            X.append([
                prev_['sentence'], 
                sentence['sentence'],
                next_['sentence']
            ])
            
            if not is_test:
                y.append([
                    to_categorical(prev_['label'], num_classes=n_tags), 
                    to_categorical(sentence['label'], num_classes=n_tags),
                    to_categorical(next_['label'], num_classes=n_tags)
                ])
            else:
                y.append(sentence['label'])
            
    X = np.array(X)
    y = np.array(y)
            
    return X, y

In [23]:
X_tra, y_tra = get_input(train_data, ww, n_tags, False)
X_val, y_val = get_input(valid_data, ww, n_tags, False)
X_test, y_test = get_input(test_data, ww, n_tags, True)

#### Limit Data

In [24]:
X_tra = X_tra[:3582]
y_tra = y_tra[:3582]

In [25]:
X_tra.shape, type(X_tra)

((3582, 3), numpy.ndarray)

In [26]:
X_val = X_val[:398]
y_val = y_val[:398]

In [27]:
X_val.shape, y_val.shape, type(X_val)

((398, 3), (398, 3, 2), numpy.ndarray)

In [41]:
X_test = X_test[:440]
y_test = y_test[:440]

In [42]:
X_test.shape, y_test.shape, type(X_test)

((440, 3), (440,), numpy.ndarray)

### Load ELMo

In [28]:
sess = tf.Session()
K.set_session(sess)

In [29]:
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

## Build Model

In [30]:
def ELMoEmbedding(x):
    embeds = []
    for art in tf.unstack(tf.transpose(x, (1, 0))):
        embeds.append(elmo(tf.squeeze(tf.cast(art, tf.string)), signature="default", as_dict=True)["default"])
    return tf.stack(embeds, 1)

In [33]:
def build_lstm_model(ww, n_tags):
    
    input_text = Input(shape=(ww,), dtype="string")
    
    embedding = Lambda(ELMoEmbedding, output_shape=(None, None, ww, 1024))(input_text)
    
    dns = Dense(512, activation='relu')(embedding)
    
    dns = Dense(256, activation='relu')(dns)
    
    x = Bidirectional(LSTM(units=128, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(dns)

    x_rnn = Bidirectional(LSTM(units=128, return_sequences=True,
                               recurrent_dropout=0.2, dropout=0.2))(x)

    x = add([x, x_rnn])  # residual connection to the first biLSTM

    out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
    
    return Model(input_text, outputs=out)

In [35]:
model = build_lstm_model(ww, n_tags)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 3)            0                                            
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, None, None, 3 0           input_3[0][0]                    
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, None, None, 3 524800      lambda_2[0][0]                   
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, None, None, 3 131328      dense_4[0][0]                    
__________________________________________________________________________________________________
bidirectio

In [37]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Train Model

In [38]:
## Sequential Model
model.fit(X_tra, y_tra, epochs=20, batch_size=2, validation_data=(X_val, y_val)) 
#            callbacks=[PlotCurves(model_name='elmo_sentence_sequence')])

Train on 3582 samples, validate on 398 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
  62/3582 [..............................] - ETA: 3:58 - loss: 0.0353 - acc: 0.9892

KeyboardInterrupt: 

### Test

In [43]:
model_preds = model.predict(X_test, batch_size=2)

In [44]:
y_preds = [[np.argmax(lab) for lab in art] for art in model_preds]

In [51]:
y_preds_unpad = [lab[1] for lab in y_preds]

In [53]:
### Sequential Model results
print(classification_report(y_test, y_preds_unpad))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89       325
           1       0.76      0.46      0.57       115

   micro avg       0.82      0.82      0.82       440
   macro avg       0.79      0.70      0.73       440
weighted avg       0.81      0.82      0.80       440



In [56]:
scikit_f1_score(y_test, y_preds_unpad, average='macro')

0.7296519541123858