In [1]:
import numpy as np 
import pandas as pd
import os
import numpy as np
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [12]:
import tokenization

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.utils import shuffle

In [3]:
tf.__version__

'2.8.0'

In [4]:
device_name = tf.test.gpu_device_name()
print(device_name)
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

/device:GPU:0
Found GPU at: /device:GPU:0


In [21]:
train_data = pd.read_csv('Processed_Input.tsv', encoding='utf-8', sep = '\t')

In [22]:
train_data.fillna("", inplace = True)

In [23]:
# Shuffle data so that there is a higher chance of the train and test data being from the same distribution.
train_data = shuffle(train_data)

In [24]:
# Now read the rows, convert them into strings and then only keep the unique ones.
sentences_and_lables = np.array([[' '.join(map(str, row[:-1].tolist())).strip(), row[-1]] for row in train_data.iloc[:,:].values])
print(sentences_and_lables.shape)
sentences = sentences_and_lables[:, 0]
#labels = sentences_and_lables[:, 1]

(100120, 2)


In [8]:
label = preprocessing.LabelEncoder()
y = label.fit_transform(train_data['relation'])
# y = to_categorical(y) # doing this later.

In [25]:
print(y[:5])

[4 4 4 4 4]


In [26]:
print(sentences[:5])

['Awards  Best Production Design Metro Manila Film Festival Award for Best Production Design  Mimi Sanson Viola'
 'Filmography  Sher Dil Sher Dil Pakistani film  Punjabi'
 'Awards and nominations Best Crew Fiction Chinna Thambi'
 'Film Laura Harring Georgina Preston'
 'Discography  Uthaman Uthaman 1976 film   Tamil Tamil language']


In [11]:
m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(m_url, trainable=True)

In [12]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
        
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len-len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
        
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [13]:
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")
    
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    
    print(tf.shape(sequence_output))
    clf_output = sequence_output[:, :, :]
    print(tf.shape(clf_output))
    
#     lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)
#     lay = tf.keras.layers.Dropout(0.2)(lay)
#     lay = tf.keras.layers.Dense(32, activation='relu')(lay)
#     lay = tf.keras.layers.Dropout(0.2)(lay)
#     out = tf.keras.layers.Dense(2, activation='softmax')(lay)
    
#     clf_output = tf.reshape(clf_output.shape[1],1,)
    lay = tf.keras.layers.Conv1D(filters=32, kernel_size=5, strides=1, padding="same", activation="relu")(clf_output)
    lay = tf.keras.layers.MaxPooling1D(2, 2)(lay)
    lay = tf.keras.layers.LSTM(60, return_sequences=True, dropout=0.2)(lay)
    lay = tf.keras.layers.Flatten()(lay)
    out = tf.keras.layers.Dense(6, activation='softmax')(lay)
    
    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [14]:
max_len = 250
train_input = bert_encode(sentences, tokenizer, max_len=max_len)
train_labels = y

### Obtaining Train, test splits.
###### In the train splits, we will have a separate validation split.

In [None]:
splits = 5 # For five fold cross-validation.
seeds = [i for i in range(splits)]  # Fix the seed value for reproducibility.

data_dict = {}

# First get random train-test splits. Doesn't include validation, which will be obtained from the train set.
for seed in seeds:
    x_t, y_t, x_test, y_test = train_test_split(sentences, y, random_state=seed, test_size=0.2)   # Global training and test sets.
    
    # Now get validation sets from each training set.
    kf = KFold(n_splits=5, shuffle=False) # Setting shuffle=False because shuffled dataset already before.
    for train_index, val_index in kf.split(x_t):
        x_train, x_val = x_t[train_index], x_t[val_index]   # Training and validation features.
        y_train, y_val = y_t[train_index], y_t[val_index]   # Training and validation labels.
        
        # Validation sets can be used for hyperparamter tuning.
    

In [19]:
with tf.device('/device:GPU:0'):
    model = build_model(bert_layer, max_len=max_len)
    model.summary()
    checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
    earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)
    train_sh = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=1,
    callbacks=[checkpoint, earlystopping],
    batch_size=4,
    verbose=1)

KerasTensor(type_spec=TensorSpec(shape=(3,), dtype=tf.int32, name=None), inferred_value=[None, 250, 768], name='tf.compat.v1.shape_8/Shape:0', description="created by layer 'tf.compat.v1.shape_8'")
KerasTensor(type_spec=TensorSpec(shape=(3,), dtype=tf.int32, name=None), inferred_value=[None, 250, 768], name='tf.compat.v1.shape_9/Shape:0', description="created by layer 'tf.compat.v1.shape_9'")


ResourceExhaustedError: OOM when allocating tensor with shape[5,768,32] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

 328/2503 [==>...........................] - ETA: 21:19:22 - loss: 0.4703 - accuracy: 0.8000