In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np

In [3]:
import tensorflow as tf
import tensorflow_hub as hub

In [4]:
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Activation, Conv1D, \
                                    MaxPooling1D, Flatten, Reshape, Bidirectional, \
                                    BatchNormalization, Dropout, add, LSTM, \
                                    TimeDistributed
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax, SGD
from tensorflow.keras.regularizers import l2

In [5]:
from src.keras_bert import convert_single_example, \
                           convert_text_to_examples, \
                           create_tokenizer_from_hub_module, \
                           convert_examples_to_features, \
                           InputExample, \
                           initialize_vars, \
                           BertLayer

In [6]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [7]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score as scikit_f1_score

In [8]:
# Initialize session
# Explicit TF1-compat session; it is passed to initialize_vars(sess) before training.
sess = tf.compat.v1.Session()

In [9]:
# Report whether TensorFlow can actually see a GPU. Entering a tf.device()
# context creates no ops and validates nothing, so printing inside it would
# claim "GPU enabled" even on a CPU-only machine.
if tf.test.is_gpu_available():
    print("GPU enabled")
else:
    print("WARNING: no GPU detected, running on CPU")

GPU enabled


#### Custom Functions

In [10]:
from src.callbacks import PlotCurvesTF as PlotCurves
from src.eval_metrics import f1_macro, f1_micro 
from src.load_data import load_data

### Load Data

In [39]:
# Load the three data splits plus metadata through the project's load_data helper.
# NOTE(review): later cells treat each split as a sequence of articles carrying a
# 'sentences' list - confirm against src.load_data.
train_data, valid_data, test_data, metadata = load_data()

In [46]:
# Drop validation sentences longer than `max_sentence_words` whitespace-separated words.
# Each article's list is rebuilt in one pass (slice assignment keeps the same list
# object). Deleting entries while enumerating the list shifts the remaining items
# left, which silently skips the sentence following every deletion, so runs of
# over-long sentences were only partially removed.
max_sentence_words = 90
for article in valid_data:
    article['sentences'][:] = [sent for sent in article['sentences']
                               if len(sent['sentence'].split()) <= max_sentence_words]

### Load BERT

In [13]:
# Point tensorflow_hub's module cache at a local directory via its TFHUB_CACHE_DIR variable.
os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'

In [14]:
# TF-Hub handle of the BERT module; used for both the tokenizer (get_input) and the model.
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [15]:
# Instantiate the hub module once at notebook level; BERTEmbeddingStack calls this global.
# trainable=True is requested here.
# NOTE(review): model.summary() reports only the LSTM weights as trainable - confirm
# whether BERT variables are actually being fine-tuned.
bert = hub.Module(bert_path, trainable=True)

### Prepare data

In [16]:
# Number of context sentences taken on each side of the target sentence
# (each sample window therefore holds 2*ww+1 sentences).
ww = 1
# Samples per batch; BERTEmbeddingStack reshapes its input to exactly this many samples.
batch_size = 2

# Sequence length handed to the BERT feature converters and to the model's Input layer.
max_seq_length = 512
# Clamp to 512, the maximum the warning below says BERT accepts.
if max_seq_length > 512:
    print('!!!!!!! WARNING: BERT does not accept length > 512')
    max_seq_length = 512

In [17]:
def get_padding_sentence(max_seq_length, tokenizer, padding_text='ENDPAD'):
    """Build the BERT feature dict for the placeholder ("padding") sentence.

    The result stands in for neighbours that fall outside an article when
    context windows are assembled.

    Args:
        max_seq_length: sequence length forwarded to ``convert_single_example``.
        tokenizer: tokenizer forwarded to ``convert_single_example``.
        padding_text: the placeholder text. A string is used verbatim; any
            other iterable of strings is joined with single spaces.

    Returns:
        dict with keys "input_ids", "input_mask", "segment_ids" and "label"
        (the label is always 0).
    """
    # " ".join() applied to a str interleaves spaces between its characters
    # ("ENDPAD" -> "E N D P A D"), so only join real token sequences.
    if isinstance(padding_text, str):
        text = padding_text
    else:
        text = " ".join(padding_text)

    example_sent = InputExample(guid=None, text_a=text, text_b=None, label=0)

    # The converter also returns a label; it is ignored in favour of the fixed 0 below.
    input_ids, input_mask, segment_ids, _ = \
        convert_single_example(tokenizer, example_sent, max_seq_length=max_seq_length)

    return {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids, "label": 0}

In [18]:
def get_input(data_, ww, max_seq_length, batch_size, limit=None):
    """Convert articles into windowed BERT inputs, one sample per sentence.

    For every sentence a window of ``2 * ww + 1`` sentences is assembled:
    ``ww`` preceding sentences, the sentence itself, then ``ww`` following
    sentences. Window positions that fall outside the article are filled
    with the features of the 'ENDPAD' padding sentence.

    Args:
        data_: iterable of articles; each article has a 'sentences' list whose
            items provide 'sentence' (text) and 'label'.
        ww: number of context sentences on each side.
        max_seq_length: forwarded to the BERT feature converters; also caps
            the number of whitespace-separated words kept per sentence.
        batch_size: when ``limit`` is falsy, the number of samples is cut
            down to a multiple of this value.
        limit: optional explicit number of samples to keep.

    Returns:
        ``(X, y)`` as numpy arrays. Every X sample stacks the window's input
        ids, input masks and segment ids (in that order); every y sample is a
        one-element list holding the centre sentence's label.
    """
    tokenizer = create_tokenizer_from_hub_module(bert_path)
    pad_feats = get_padding_sentence(max_seq_length, tokenizer, padding_text='ENDPAD')

    # Relative positions visited for each window: previous, current, next.
    offsets = list(range(-ww, 0)) + [0] + list(range(1, ww + 1))

    X, y = [], []
    for article in data_:
        sentences = article['sentences']
        n_sents = len(sentences)

        # One single-element row per sentence: newlines flattened, whitespace
        # normalised, and at most max_seq_length words kept.
        texts = np.array(
            [[" ".join(s['sentence'].replace('\n', ' ').strip().split()[:max_seq_length])]
             for s in sentences],
            dtype=object)
        labels = [s['label'] for s in sentences]

        examples_ = convert_text_to_examples(texts, labels)
        ids, masks, segments, feat_labels = convert_examples_to_features(
            tokenizer, examples_, max_seq_length=max_seq_length)

        for centre in range(n_sents):
            ids_win, mask_win, seg_win = [], [], []

            for offset in offsets:
                pos = centre + offset
                if 0 <= pos < n_sents:
                    ids_win.append(ids[pos])
                    mask_win.append(masks[pos])
                    seg_win.append(segments[pos])
                else:
                    # Neighbour lies outside the article: use the padding sentence.
                    ids_win.append(pad_feats['input_ids'])
                    mask_win.append(pad_feats['input_mask'])
                    seg_win.append(pad_feats['segment_ids'])

            X.append((np.array(ids_win), np.array(mask_win), np.array(seg_win)))
            y.append([feat_labels[centre][0]])

    # Without an explicit limit, keep only whole batches.
    if not limit:
        limit = len(X) - len(X) % batch_size

    return np.array(X[:limit]), np.array(y[:limit])

In [19]:
# Build windowed BERT inputs for the training and validation splits.
# With no explicit limit, get_input trims each result to a multiple of batch_size.
X_tra, y_tra = get_input(train_data, ww, max_seq_length, batch_size)
X_val, y_val = get_input(valid_data, ww, max_seq_length, batch_size)

Converting examples to features: 100%|██████████| 10/10 [00:00<00:00, 2088.59it/s]
Converting examples to features: 100%|██████████| 16/16 [00:00<00:00, 2619.29it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 1694.13it/s]
Converting examples to features: 100%|██████████| 13/13 [00:00<00:00, 2244.79it/s]
Converting examples to features: 100%|██████████| 33/33 [00:00<00:00, 2340.18it/s]
Converting examples to features: 100%|██████████| 11/11 [00:00<00:00, 2130.37it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 3319.85it/s]
Converting examples to features: 100%|██████████| 12/12 [00:00<00:00, 1257.91it/s]
Converting examples to features: 100%|██████████| 17/17 [00:00<00:00, 1741.99it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 2498.99it/s]
Converting examples to features: 100%|██████████| 22/22 [00:00<00:00, 2665.43it/s]
Converting examples to features: 100%|██████████| 18/18 [00:00<00:00, 2543.89it/s]
Converti

Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 1516.07it/s]
Converting examples to features: 100%|██████████| 9/9 [00:00<00:00, 1137.52it/s]
Converting examples to features: 100%|██████████| 11/11 [00:00<00:00, 1507.66it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 1444.57it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 1555.03it/s]
Converting examples to features: 100%|██████████| 20/20 [00:00<00:00, 2618.41it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2784.14it/s]
Converting examples to features: 100%|██████████| 6/6 [00:00<00:00, 2317.94it/s]
Converting examples to features: 100%|██████████| 7/7 [00:00<00:00, 2012.76it/s]
Converting examples to features: 100%|██████████| 53/53 [00:00<00:00, 2912.71it/s]
Converting examples to features: 100%|██████████| 11/11 [00:00<00:00, 2255.11it/s]
Converting examples to features: 100%|██████████| 15/15 [00:00<00:00, 2033.77it/s]
Converting e

Converting examples to features: 100%|██████████| 3/3 [00:00<00:00, 2291.55it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 2158.81it/s]
Converting examples to features: 100%|██████████| 20/20 [00:00<00:00, 2323.01it/s]
Converting examples to features: 100%|██████████| 19/19 [00:00<00:00, 2867.54it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 3007.75it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2469.83it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 2378.73it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 2058.81it/s]
Converting examples to features: 100%|██████████| 17/17 [00:00<00:00, 2471.26it/s]
Converting examples to features: 100%|██████████| 12/12 [00:00<00:00, 2768.97it/s]
Converting examples to features: 100%|██████████| 9/9 [00:00<00:00, 2616.35it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 2337.70it/s]
Converting example

In [20]:
# Sanity check; axes are (samples, ids/mask/segments, window of 2*ww+1 sentences, max_seq_length).
X_tra.shape, X_val.shape

((3582, 3, 3, 512), (398, 3, 3, 512))

In [21]:
# Sanity check; one label per sample, for the centre sentence of each window.
y_tra.shape, y_val.shape

((3582, 1), (398, 1))

## Build Model

In [22]:
def BERTEmbeddingStack(x):
    """Embed every sentence of every window with BERT and mean-pool its tokens.

    Relies on the notebook globals ``batch_size``, ``ww``, ``max_seq_length``
    and ``bert``.

    Args:
        x: tensor that can be reshaped to
            ``(batch_size, 3, 2*ww+1, max_seq_length)``; axis 1 holds input
            ids, input mask and segment ids, in that order.

    Returns:
        Tensor stacking, per sample, the masked token-mean of BERT's
        "sequence_output" for each sentence in the window.
    """
    def masked_reduce_mean(values, mask):
        # Zero out padded positions, sum over tokens, then divide by the number
        # of real tokens; the epsilon guards against an all-zero mask.
        summed = tf.reduce_sum(values * tf.expand_dims(mask, axis=-1), axis=1)
        return summed / (tf.reduce_sum(mask, axis=1, keepdims=True) + 1e-10)

    # Use the configured sequence length rather than a literal 512 so this stays
    # consistent with the Input layer built from max_seq_length.
    windows = tf.reshape(x, (batch_size, 3, 2 * ww + 1, max_seq_length))

    embeds = []
    for art in tf.unstack(windows):
        art = tf.cast(art, dtype="int32")
        # Packing into a dict does not change the tensors' shapes.
        bert_inputs = dict(
            input_ids=art[0],
            input_mask=art[1],
            segment_ids=art[2]
        )
        token_embeddings = bert(bert_inputs, signature="tokens", as_dict=True)["sequence_output"]
        input_mask = tf.cast(art[1], tf.float32)
        embeds.append(masked_reduce_mean(token_embeddings, input_mask))

    return tf.stack(embeds, 0)

In [23]:
def build_model_0(ww, max_seq_length):
    """Build the many-to-one model: BERT sentence embeddings -> BiLSTM -> LSTM(1).

    Args:
        ww: number of context sentences on each side; the window holds
            ``2 * ww + 1`` sentences.
        max_seq_length: token length of each sentence's BERT features.

    Returns:
        An uncompiled Keras ``Model`` taking inputs of shape
        ``(3, 2*ww+1, max_seq_length)`` per sample and producing one value
        per sample.
    """
    inp_size = 2 * ww + 1
    # Axis of size 3 carries input ids, input mask and segment ids.
    input_text = Input(shape=(3, inp_size, max_seq_length))

    # A tuple output_shape must exclude the batch axis: BERTEmbeddingStack yields
    # one 768-d pooled vector per sentence in the window.
    bert_output = Lambda(BERTEmbeddingStack, output_shape=(inp_size, 768))(input_text)

    x = Bidirectional(LSTM(units=256, return_sequences=True))(bert_output)

    # NOTE(review): softmax over a single unit always evaluates to 1.0, so it adds
    # no non-linearity here; a sigmoid output is the usual pairing with
    # binary_crossentropy. Left unchanged - confirm intent before altering the model.
    pred = LSTM(1, activation='softmax')(x)

    return Model(inputs=[input_text], outputs=pred)

In [24]:
# Optimisation settings; learningrate, optimizer_str and loss are also embedded in model_name.
learningrate = 2e-5
optimizer = Adam(lr=learningrate)
# Short optimizer tag used only when composing model_name.
optimizer_str = 'adam'
loss = 'binary_crossentropy'
# Accuracy plus the project's macro/micro F1 metrics from src.eval_metrics.
metrics = ['acc', f1_macro, f1_micro]

In [25]:
# Build the windowed BERT + LSTM model and print its layer summary.
model = build_model_0(ww, max_seq_length)
model.summary()

# Attach the loss, optimizer and metrics configured above.
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

W1229 23:50:03.362170 140563095122048 deprecation.py:506] From /home/aorus/workspaces/simge/Master_Thesis/.env/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1229 23:50:03.365644 140563095122048 deprecation.py:506] From /home/aorus/workspaces/simge/Master_Thesis/.env/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1229 23:50:03.366473 140563095122048 deprecation.py:506] From /home/aorus/workspaces/simge/Master_Thesis/.env/lib/python3.6/site-packag

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 3, 3, 512)]       0         
_________________________________________________________________
lambda (Lambda)              (2, 3, 768)               0         
_________________________________________________________________
bidirectional (Bidirectional (2, 3, 512)               2099200   
_________________________________________________________________
lstm_1 (LSTM)                (2, 1)                    2056      
Total params: 2,101,256
Trainable params: 2,101,256
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Epoch count for this run; the value is embedded in model_name.
epochs = 1

In [27]:
# Encode the main hyper-parameters in the run name so result folders are self-describing.
model_name = 'RQ2_test_bert_many_to_one_model_01' + \
             '_ww_' + str(ww) + \
             '_' + optimizer_str + \
             '_lr_' + str(learningrate) + \
             '_epochs_' + str(epochs) + \
             '_loss_' + loss + \
             '_onehot' + \
             '_softmax'

# Results live under ./Model/<prefix up to 'model'>model/<model_name>/.
model_dir = './Model/' + model_name.split('model')[0] + 'model/' + model_name
# Create the directory up front; otherwise opening results_file for writing
# fails with FileNotFoundError on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)
results_file = os.path.join(model_dir, 'model_results_file.txt')

### Train Model

In [28]:
# Instantiate variables
# Runs the project's initialize_vars helper on the session created at the top of the notebook.
# NOTE(review): presumably this initialises TF variables and registers the session
# with Keras - confirm in src.keras_bert.
initialize_vars(sess)

In [29]:
# Train with the configured `epochs` value so the run matches what model_name records
# (a literal 1 here would silently ignore changes to the setting).
model.fit(X_tra, y_tra,
          epochs=epochs,
          batch_size=batch_size,
          validation_data=(X_val, y_val))

Train on 3582 samples, validate on 398 samples


<tensorflow.python.keras.callbacks.History at 0x7fd2eb536470>

### Load the best Model

In [31]:
# Display the composed run name for reference.
model_name

'RQ2_test_bert_many_to_one_model_01_ww_1_adam_lr_2e-05_epochs_1_loss_binary_crossentropy_onehot_softmax'

In [32]:
# No checkpoint is read here: best_model is simply another name for the in-memory model
# trained above.
best_model = model

### Evaluation

In [33]:
def get_scores(model, batch_size, ww, max_seq_length, data_=None, X=None, y_true=None, results_file=None, print_out=False):
    """Score a binary classifier with scikit-learn metrics.

    Inputs come either from ``data_`` (converted with ``get_input``) or from
    pre-built ``X`` / ``y_true``. Model outputs are thresholded at 0.5.

    Args:
        model: object exposing ``predict(X, batch_size=...)``.
        batch_size: batch size for prediction (and for ``get_input``).
        ww: context window size forwarded to ``get_input``.
        max_seq_length: sequence length forwarded to ``get_input``.
        data_: optional raw articles; when truthy it overrides ``X``/``y_true``.
        X: optional pre-built model inputs.
        y_true: optional targets, each wrapped in a one-element sequence.
        results_file: optional path; the report is appended to it.
        print_out: when True, print the report and F1 scores.

    Returns:
        The macro-averaged F1 score.
    """
    if data_:
        X, y_true = get_input(data_, ww, max_seq_length, batch_size, limit=None)

    # Unwrap the one-element targets.
    gold = [target[0] for target in y_true]

    # Scores below 0.5 become class 0, everything else class 1.
    predicted = []
    for score in model.predict(X, batch_size=batch_size):
        predicted.append(0 if score[0] < 0.5 else 1)

    report = classification_report(gold, predicted)
    positive_f1 = scikit_f1_score(gold, predicted)
    macro_f1 = scikit_f1_score(gold, predicted, average='macro')

    if print_out:
        print(report)
        print('\nScikit_F1_Macro:', macro_f1)
        print('\nScikit_F1_1:', positive_f1)

    if results_file:
        summary = "".join(['\n', report,
                           '\nF1_Macro: ', str(macro_f1),
                           '\nF1_1: ', str(positive_f1),
                           '\n\n'])
        with open(results_file, 'a') as out_fh:
            out_fh.write(summary)

    return macro_f1

#### Validation Set

In [34]:
# Start a fresh results file (mode 'w' truncates it) with the validation header.
with open(results_file, 'w') as results_fh:
    results_fh.write('\n---------------- Validation ----------------\n')

# Score the already-built validation arrays; the report is printed and appended to the file.
val_f1 = get_scores(
    model, batch_size, ww, max_seq_length,
    X=X_val,
    y_true=y_val,
    results_file=results_file,
    print_out=True,
)

#### Test Set

In [None]:
# Build the windowed BERT inputs for the held-out test split. The notebook has no
# global named `data_` (it is only a parameter name inside the helper functions),
# so the split is passed explicitly.
X_test, y_test = get_input(test_data, ww, max_seq_length, batch_size, limit=None)

In [38]:
# Append the test header to the existing results file.
with open(results_file, 'a') as results_fh:
    results_fh.write('\n---------------- Test ----------------\n')

# Score the test split; get_scores converts the raw articles itself via data_.
test_f1 = get_scores(
    best_model, batch_size, ww, max_seq_length,
    data_=test_data,
    results_file=results_file,
    print_out=True,
)

Converting examples to features: 100%|██████████| 23/23 [00:00<00:00, 3031.42it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 1768.90it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2260.73it/s]
Converting examples to features: 100%|██████████| 32/32 [00:00<00:00, 2675.05it/s]
Converting examples to features: 100%|██████████| 23/23 [00:00<00:00, 2501.98it/s]
Converting examples to features: 100%|██████████| 12/12 [00:00<00:00, 3252.03it/s]
Converting examples to features: 100%|██████████| 4/4 [00:00<00:00, 1505.09it/s]
Converting examples to features: 100%|██████████| 9/9 [00:00<00:00, 2073.08it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 2337.70it/s]
Converting examples to features: 100%|██████████| 34/34 [00:00<00:00, 3001.61it/s]
Converting examples to features: 100%|██████████| 10/10 [00:00<00:00, 2639.42it/s]
Converting examples to features: 100%|██████████| 10/10 [00:00<00:00, 1756.41it/s]
Converting

              precision    recall  f1-score   support

           0       0.79      0.98      0.87       325
           1       0.81      0.26      0.39       115

   micro avg       0.79      0.79      0.79       440
   macro avg       0.80      0.62      0.63       440
weighted avg       0.79      0.79      0.75       440


Scikit_F1_Macro: 0.6341816078658185

Scikit_F1_1: 0.39473684210526316
