In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np

In [3]:
import tensorflow as tf
import tensorflow_hub as hub

In [4]:
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Activation, Conv1D, \
                                    MaxPooling1D, Flatten, Reshape, Bidirectional, \
                                    BatchNormalization, Dropout, add, LSTM, \
                                    TimeDistributed
from tensorflow.keras.optimizers import RMSprop, Adam, Adamax, SGD
from tensorflow.keras.regularizers import l2

In [5]:
from src.keras_bert import convert_single_example, \
                           convert_text_to_examples, \
                           create_tokenizer_from_hub_module, \
                           convert_examples_to_features, \
                           InputExample, \
                           initialize_vars, \
                           BertLayer

In [6]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [7]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score as scikit_f1_score

In [8]:
# Initialize session
sess = tf.compat.v1.Session()

In [9]:
with tf.device("gpu:0"):
    print("GPU enabled")

GPU enabled


#### Custom Functions

In [10]:
from src.callbacks import PlotCurvesTF as PlotCurves
from src.eval_metrics import f1_macro, f1_micro 
from src.load_data import load_data

### Load Data

In [11]:
train_data, valid_data, test_data, metadata = load_data()

### Load BERT

In [12]:
os.environ["TFHUB_CACHE_DIR"] = '/tmp/tfhub'

In [13]:
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [14]:
bert = hub.Module(bert_path, trainable=True)

### Prepare data

In [15]:
ww = 1
batch_size = 32

max_seq_length = 512
if max_seq_length > 512:
    print('!!!!!!! WARNING: BERT does not accept length > 512')
    max_seq_length = 512

In [16]:
def get_padding_sentence(max_seq_length, tokenizer, padding_text='ENDPAD'):

    example_sent = InputExample(guid=None, text_a=" ".join(padding_text), text_b=None, label=0)

    (input_ids, input_mask, segment_ids, label) = \
        convert_single_example(tokenizer, example_sent, max_seq_length=max_seq_length)

    return {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids, "label": 0}

In [17]:
def get_input(data_, ww, max_seq_length, batch_size, one_hot=False, limit=None):

    tokenizer = create_tokenizer_from_hub_module(bert_path)

    padding_sent = get_padding_sentence(max_seq_length, tokenizer, padding_text='ENDPAD')

    X = []
    y = []
    for article in data_:

        X_art = np.array([[" ".join(sentence['sentence'].replace('\n', ' ').strip().split()[0:max_seq_length])]
                          for sentence in article['sentences']], dtype=object)

        y_art = [sentence['label'] for sentence in article['sentences']]

        examples_ = convert_text_to_examples(X_art, y_art)

        (input_ids, input_masks, segment_ids, labels_) = \
            convert_examples_to_features(tokenizer, examples_, max_seq_length=max_seq_length)

        for si, _ in enumerate(article['sentences']):

            input_ids_seq = []
            input_mask_seq = []
            segment_ids_seq = []
            y_seq = []

            # Prev
            for i in reversed(range(ww)):

                if si - i - 1 >= 0:
                    sent_obj_prev = {"input_ids": input_ids[si - i - 1],
                                     "input_mask": input_masks[si - i - 1],
                                     "segment_ids": segment_ids[si - i - 1]}
                else:
                    sent_obj_prev = padding_sent

                input_ids_seq.append(sent_obj_prev['input_ids'])
                input_mask_seq.append(sent_obj_prev['input_mask'])
                segment_ids_seq.append(sent_obj_prev['segment_ids'])

            # Curr
            sent_obj = {"input_ids": input_ids[si],
                        "input_mask": input_masks[si],
                        "segment_ids": segment_ids[si]}

            input_ids_seq.append(sent_obj['input_ids'])
            input_mask_seq.append(sent_obj['input_mask'])
            segment_ids_seq.append(sent_obj['segment_ids'])
            if one_hot:
                y_seq = to_categorical(labels_[si][0], num_classes=2)
            else:
                y_seq.append(labels_[si][0])

            # Next
            for i in range(ww):

                if si + i + 1 < len(article['sentences']):
                    sent_obj_next = {"input_ids": input_ids[si + i + 1],
                                     "input_mask": input_masks[si + i + 1],
                                     "segment_ids": segment_ids[si + i + 1]}
                else:
                    sent_obj_next = padding_sent

                input_ids_seq.append(sent_obj_next['input_ids'])
                input_mask_seq.append(sent_obj_next['input_mask'])
                segment_ids_seq.append(sent_obj_next['segment_ids'])

            X_seq = (np.array(input_ids_seq),
                     np.array(input_mask_seq),
                     np.array(segment_ids_seq))

            X.append(X_seq)
            y.append(y_seq)

    # limit data if not an even number when batch_size=2
    if not limit:
        limit = len(X) if len(X) % batch_size == 0 else len(X) - len(X) % batch_size
    X = X[:limit]
    y = y[:limit]

    return np.array(X), np.array(y)

In [18]:
X_tra, y_tra = get_input(train_data, ww, max_seq_length, batch_size, one_hot=False, limit=None)
X_val, y_val = get_input(valid_data, ww, max_seq_length, batch_size, one_hot=False, limit=None)

Converting examples to features: 100%|██████████| 10/10 [00:00<00:00, 1983.12it/s]
Converting examples to features: 100%|██████████| 16/16 [00:00<00:00, 2492.81it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2241.92it/s]
Converting examples to features: 100%|██████████| 13/13 [00:00<00:00, 2252.67it/s]
Converting examples to features: 100%|██████████| 33/33 [00:00<00:00, 2753.70it/s]
Converting examples to features: 100%|██████████| 11/11 [00:00<00:00, 2238.37it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 2804.43it/s]
Converting examples to features: 100%|██████████| 12/12 [00:00<00:00, 1258.61it/s]
Converting examples to features: 100%|██████████| 17/17 [00:00<00:00, 1644.10it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 2423.33it/s]
Converting examples to features: 100%|██████████| 22/22 [00:00<00:00, 2543.68it/s]
Converting examples to features: 100%|██████████| 18/18 [00:00<00:00, 2373.16it/s]
Converti

Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2682.39it/s]
Converting examples to features: 100%|██████████| 9/9 [00:00<00:00, 2075.59it/s]
Converting examples to features: 100%|██████████| 11/11 [00:00<00:00, 2754.63it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2623.08it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 2489.94it/s]
Converting examples to features: 100%|██████████| 20/20 [00:00<00:00, 2585.49it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2373.49it/s]
Converting examples to features: 100%|██████████| 6/6 [00:00<00:00, 2281.37it/s]
Converting examples to features: 100%|██████████| 7/7 [00:00<00:00, 2026.79it/s]
Converting examples to features: 100%|██████████| 53/53 [00:00<00:00, 2815.29it/s]
Converting examples to features: 100%|██████████| 11/11 [00:00<00:00, 2206.26it/s]
Converting examples to features: 100%|██████████| 15/15 [00:00<00:00, 2118.34it/s]
Converting e

Converting examples to features: 100%|██████████| 3/3 [00:00<00:00, 2265.56it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 2085.16it/s]
Converting examples to features: 100%|██████████| 20/20 [00:00<00:00, 2415.66it/s]
Converting examples to features: 100%|██████████| 19/19 [00:00<00:00, 2917.94it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 3031.94it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2430.88it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 2275.34it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 2095.97it/s]
Converting examples to features: 100%|██████████| 17/17 [00:00<00:00, 2460.85it/s]
Converting examples to features: 100%|██████████| 12/12 [00:00<00:00, 2670.26it/s]
Converting examples to features: 100%|██████████| 9/9 [00:00<00:00, 2548.52it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 2254.03it/s]
Converting example

In [19]:
X_tra.shape, X_val.shape

((3552, 3, 3, 512), (384, 3, 3, 512))

In [20]:
y_tra.shape, y_val.shape

((3552, 1), (384, 1))

## Build Model

In [21]:
def BERTEmbeddingStack(x):
    embeds = []
    for art in tf.unstack(tf.reshape(x, (batch_size, 3, 2*ww+1, 512))):
        art = tf.cast(art, dtype="int32")
        # Below does not change the shape of segment_ids etc.
        # Only puts them into a dictionary
        bert_inputs = dict(
            input_ids=art[0],
            input_mask=art[1],
            segment_ids=art[2]
        )
        # Pooling
        result = bert(bert_inputs, signature="tokens", as_dict=True)["sequence_output"]
        mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                             tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
        input_mask = tf.cast(art[1], tf.float32)
        pooled = masked_reduce_mean(result, input_mask)
        embeds.append(pooled)
    # print(tf.stack(embeds, 0))
    return tf.stack(embeds, 0)

In [22]:
def build_model_0(ww, max_seq_length):

    inp_size = 2 * ww + 1
    input_text = Input(shape=(3, inp_size, max_seq_length))

    bert_output = Lambda(BERTEmbeddingStack, output_shape=(None, None, inp_size, 768))(input_text)

    x = Bidirectional(LSTM(units=256, return_sequences=True))(bert_output)

    pred = LSTM(1, activation='softmax')(x)

    return Model(inputs=[input_text], outputs=pred)

In [23]:
learningrate = 2e-5
optimizer = Adam(lr=learningrate)
optimizer_str = 'adam'
loss = 'binary_crossentropy'
metrics = ['acc', f1_macro, f1_micro]

In [24]:
model = build_model_0(ww, max_seq_length)
model.summary()

model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

W1225 21:23:23.379186 139648198795392 deprecation.py:506] From /home/aorus/workspaces/simge/Master_Thesis/.env/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1225 21:23:23.382309 139648198795392 deprecation.py:506] From /home/aorus/workspaces/simge/Master_Thesis/.env/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1225 21:23:23.382824 139648198795392 deprecation.py:506] From /home/aorus/workspaces/simge/Master_Thesis/.env/lib/python3.6/site-packag

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 3, 3, 512)]       0         
_________________________________________________________________
lambda (Lambda)              (32, 3, 768)              0         
_________________________________________________________________
bidirectional (Bidirectional (32, 3, 512)              2099200   
_________________________________________________________________
lstm_1 (LSTM)                (32, 1)                   2056      
Total params: 2,101,256
Trainable params: 2,101,256
Non-trainable params: 0
_________________________________________________________________


In [25]:
epochs = 1

In [26]:
model_name = 'RQ2_test_bert_many_to_one_model_01' + \
             '_ww_' + str(ww) + \
             '_' + optimizer_str + \
             '_lr_' + str(learningrate) + \
             '_epochs_' + str(epochs) + \
             '_loss_' + loss + \
             '_onehot' + \
             '_softmax'

model_dir = './Model/' + model_name.split('model')[0] + 'model/' + model_name
results_file = os.path.join(model_dir, 'model_results_file.txt')

### Train Model

In [27]:
# Instantiate variables
initialize_vars(sess)

In [28]:
model.fit(X_tra, y_tra, 
          epochs=1,
          batch_size=batch_size, 
          validation_data=(X_val, y_val),
          callbacks=[
            PlotCurves(model_name=model_name, 
                       model_dir=model_dir, jnote=True)
          ])



<tensorflow.python.keras.callbacks.History at 0x7f00dd035f98>

<Figure size 720x360 with 0 Axes>

### Load the best Model

In [29]:
model_name

'RQ2_test_bert_many_to_one_model_01_ww_1_adam_lr_2e-05_epochs_1_loss_binary_crossentropy_onehot_softmax'

In [30]:
best_model = model

### Evaluation

In [31]:
def get_scores(model, data_, batch_size, ww, max_seq_length, results_file=None, print_out=False):

    X, y_true = get_input(data_, ww, max_seq_length, batch_size, limit=None)
    y_true = [y[0] for y in y_true]

    y_preds = model.predict(X, batch_size=batch_size)
    y_preds = [0 if y[0] < 0.5 else 1 for y in y_preds]

    clsrpt = classification_report(y_true, y_preds)
    sf1 = scikit_f1_score(y_true, y_preds)
    sfm = scikit_f1_score(y_true, y_preds, average='macro')

    if print_out:
        print(clsrpt)
        print('\nScikit_F1_Macro:', sfm)
        print('\nScikit_F1_1:', sf1)

    if results_file:
        with open(results_file, 'a') as f:
            f.write('\n' + clsrpt + '\nF1_Macro: ' + str(sfm) + '\nF1_1: ' + str(sf1) + '\n\n')
    return sfm


#### Validation Set

In [32]:
with open(results_file, 'w') as f:
    f.write('\n---------------- Validation ----------------\n')
val_f1 = get_scores(best_model, valid_data, batch_size, ww, max_seq_length, 
                    results_file, print_out=True)

Converting examples to features: 100%|██████████| 4/4 [00:00<00:00, 1434.20it/s]
Converting examples to features: 100%|██████████| 20/20 [00:00<00:00, 2376.31it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 2168.27it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2289.56it/s]
Converting examples to features: 100%|██████████| 16/16 [00:00<00:00, 2621.54it/s]
Converting examples to features: 100%|██████████| 18/18 [00:00<00:00, 2667.76it/s]
Converting examples to features: 100%|██████████| 8/8 [00:00<00:00, 2488.65it/s]
Converting examples to features: 100%|██████████| 15/15 [00:00<00:00, 1871.51it/s]
Converting examples to features: 100%|██████████| 9/9 [00:00<00:00, 1873.67it/s]
Converting examples to features: 100%|██████████| 12/12 [00:00<00:00, 2635.30it/s]
Converting examples to features: 100%|██████████| 12/12 [00:00<00:00, 2743.02it/s]
Converting examples to features: 100%|██████████| 7/7 [00:00<00:00, 2641.01it/s]
Converting exa

              precision    recall  f1-score   support

           0       0.67      1.00      0.80       258
           1       0.00      0.00      0.00       126

   micro avg       0.67      0.67      0.67       384
   macro avg       0.34      0.50      0.40       384
weighted avg       0.45      0.67      0.54       384


Scikit_F1_Macro: 0.40186915887850466

Scikit_F1_1: 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Test Set

In [33]:
with open(results_file, 'a') as f:
    f.write('\n---------------- Test ----------------\n')
test_f1 = get_scores(best_model, test_data, batch_size, ww, max_seq_length,
                     results_file, print_out=True)

Converting examples to features: 100%|██████████| 23/23 [00:00<00:00, 3144.16it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 1988.36it/s]
Converting examples to features: 100%|██████████| 14/14 [00:00<00:00, 2223.75it/s]
Converting examples to features: 100%|██████████| 32/32 [00:00<00:00, 2430.07it/s]
Converting examples to features: 100%|██████████| 23/23 [00:00<00:00, 2447.33it/s]
Converting examples to features: 100%|██████████| 12/12 [00:00<00:00, 2078.45it/s]
Converting examples to features: 100%|██████████| 4/4 [00:00<00:00, 1725.87it/s]
Converting examples to features: 100%|██████████| 9/9 [00:00<00:00, 2131.97it/s]
Converting examples to features: 100%|██████████| 5/5 [00:00<00:00, 2025.06it/s]
Converting examples to features: 100%|██████████| 34/34 [00:00<00:00, 3161.09it/s]
Converting examples to features: 100%|██████████| 10/10 [00:00<00:00, 2505.71it/s]
Converting examples to features: 100%|██████████| 10/10 [00:00<00:00, 1837.11it/s]
Converting

              precision    recall  f1-score   support

           0       0.73      1.00      0.85       305
           1       0.00      0.00      0.00       111

   micro avg       0.73      0.73      0.73       416
   macro avg       0.37      0.50      0.42       416
weighted avg       0.54      0.73      0.62       416


Scikit_F1_Macro: 0.42302357836338417

Scikit_F1_1: 0.0
