In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt
# from IPython import display

In [4]:
from sklearn.metrics import classification_report

In [5]:
import tensorflow as tf
import tensorflow_hub as hub

In [6]:
from keras import backend as K

from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Lambda
from keras.regularizers import l2

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


#### Custom Functions

In [7]:
from src.callbacks import PlotCurves
from src.custom_functions import f1_macro, f1_micro 
from src.load_data import load_data

### Load Data

In [8]:
# Load the train / validation / test splits plus metadata (load_data comes from src.load_data).
# The cells below read each split as a collection of articles shaped like
# {'sentences': [{'sentence': <text>, 'label': <label>}, ...]}.
train_data, valid_data, test_data, metadata = load_data()

### Prepare data (sentence-level BERT inputs)

In [1]:
# Sequence length handed to the BERT feature conversion below.
max_seq_length = 512

# Clamp to 512 (the limit named in the warning) and tell the user when a
# larger value had been requested.
if not max_seq_length <= 512:
    print('!!!!!!! WARNING: BERT does not accept lenght > 512')
    max_seq_length = 512

In [14]:
# TF-Hub URL of the BERT module; split_data passes it on to
# create_tokenizer_from_hub_module to build the tokenizer.
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [15]:
def split_data(data_, max_seq_length, bert_path, to_categorize, n_tags=None):
    """Flatten articles into per-sentence BERT inputs and their labels.

    NOTE(review): `create_tokenizer_from_hub_module`, `convert_text_to_examples`
    and `convert_examples_to_features` are not imported in any visible cell of
    this notebook; they must already be in scope for this function to run.
    A second `split_data` with a different signature is defined further down
    and shadows this one as soon as its cell is executed.

    Parameters
    ----------
    data_ : iterable of dict
        Articles; each has a 'sentences' list whose items carry a 'sentence'
        string and a 'label'.
    max_seq_length : int
        Maximum number of whitespace-separated words kept per sentence; also
        forwarded to `convert_examples_to_features`.
    bert_path : str
        Forwarded to `create_tokenizer_from_hub_module`.
    to_categorize : bool
        When true, the labels are one-hot encoded with `to_categorical`.
    n_tags : int, optional
        Number of classes for the one-hot encoding. With the default (None)
        `to_categorical` infers it from the largest label present, so a split
        that lacks the highest class gets a narrower matrix than the others;
        pass the real class count to keep all splits the same width.

    Returns
    -------
    tuple
        `([input_ids, input_masks, segment_ids], labels_)` as produced by
        `convert_examples_to_features` (labels one-hot encoded on request).
    """
    tokenizer = create_tokenizer_from_hub_module(bert_path)

    # Walk the articles once and keep the sentence records in document order.
    sentences = [sentence
                 for article in data_
                 for sentence in article['sentences']]

    # !!! For BERT input, each sentence should be in an array
    # str.split() without arguments splits on every whitespace run, newlines
    # included. Deleting '\n' beforehand (as was done previously) glued the
    # words on both sides of a line break into a single token.
    X = np.array([[" ".join(sentence['sentence'].split()[:max_seq_length])]
                  for sentence in sentences], dtype=object)

    y = [sentence['label'] for sentence in sentences]

    examples_ = convert_text_to_examples(X, y)

    (input_ids, input_masks, segment_ids, labels_) = \
            convert_examples_to_features(tokenizer, examples_, max_seq_length=max_seq_length)

    if to_categorize:
        labels_ = to_categorical(labels_, num_classes=n_tags)

    return [input_ids, input_masks, segment_ids], labels_

In [16]:
# Build BERT inputs for every split. Train and validation labels are one-hot
# encoded; the test labels stay as plain class ids.
X_tra, y_tra = split_data(
    data_=train_data,
    max_seq_length=max_seq_length,
    bert_path=bert_path,
    to_categorize=True,
)
X_val, y_val = split_data(
    data_=valid_data,
    max_seq_length=max_seq_length,
    bert_path=bert_path,
    to_categorize=True,
)
X_test, y_test = split_data(
    data_=test_data,
    max_seq_length=max_seq_length,
    bert_path=bert_path,
    to_categorize=False,
)

W0901 23:56:44.645421 140486611460224 deprecation_wrapper.py:119] From /home/aorus/workspaces/simge/Master_Thesis/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

Converting examples to features: 100%|██████████| 3582/3582 [00:01<00:00, 2470.84it/s]
Converting examples to features: 100%|██████████| 399/399 [00:00<00:00, 2543.04it/s]
Converting examples to features: 100%|██████████| 441/441 [00:00<00:00, 2548.65it/s]


### Prepare data (article-level sentence sequences)

In [12]:
def split_data(data_, max_len, n_tags, is_test=False):
    """Turn articles into fixed-length sentence sequences and label arrays.

    NOTE(review): this redefines `split_data`; an earlier cell in this notebook
    defines a function of the same name with a different signature, which is
    shadowed once this cell runs.

    Parameters
    ----------
    data_ : sequence of dict
        Articles; each has a 'sentences' list whose items carry a 'sentence'
        string and a 'label'.
    max_len : int
        Number of sentences per article in `X`. Shorter articles are padded
        with "ENDPAD"; longer ones keep only their first `max_len` sentences.
    n_tags : int
        Number of classes used for the one-hot encoding of the labels.
    is_test : bool, optional
        When true, labels are returned as one flat array of class ids rather
        than a padded one-hot tensor.

    Returns
    -------
    X : np.ndarray
        Sentence strings, one row of `max_len` entries per article.
    y : np.ndarray
        `is_test=False`: one-hot labels of shape (articles, max_len, n_tags),
        padded with class 0. `is_test=True`: flat labels of all sentences.

    Raises
    ------
    KeyError
        If an article or sentence record lacks an expected key. The previous
        bare `except:` silently replaced such records with "ENDPAD".
    """
    X = []
    for article in data_:
        # Keep the leading max_len sentences, then pad up to max_len.
        row = [sent['sentence'] for sent in article['sentences'][:max_len]]
        row.extend(["ENDPAD"] * (max_len - len(row)))
        X.append(row)
    X = np.array(X)

    if not is_test:
        y = [[sent['label'] for sent in article['sentences']] for article in data_]
        # truncating="post" keeps the FIRST max_len labels, matching the
        # sentences kept in X. The pad_sequences default ('pre') keeps the
        # last max_len labels, which misaligns labels and sentences for any
        # article longer than max_len.
        y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                          truncating="post", value=0)
        y = np.array([[to_categorical(tag, num_classes=n_tags) for tag in seq]
                      for seq in y])
    else:
        # NOTE(review): labels of ALL sentences are returned here, including
        # those beyond max_len that X drops; confirm the evaluation code
        # accounts for that when articles can exceed max_len.
        y = np.array([sent['label'] for article in data_ for sent in article['sentences']])

    return X, y

In [15]:
# NOTE(review): `max_len` and `n_tags` are not assigned in any visible cell of
# this notebook; the recorded output shape (251, 60, 2) suggests max_len=60
# and n_tags=2 -- confirm and define them before this cell.
X_tra, y_tra = split_data(data_=train_data, max_len=max_len, n_tags=n_tags, is_test=False)
X_val, y_val = split_data(data_=valid_data, max_len=max_len, n_tags=n_tags, is_test=False)
X_test, y_test = split_data(data_=test_data, max_len=max_len, n_tags=n_tags, is_test=True)

In [16]:
# Shapes of the label arrays for the train / validation / test splits.
tuple(labels.shape for labels in (y_tra, y_val, y_test))

((251, 60, 2), (32, 60, 2), (441,))