In [1]:
# Notebook-wide configuration constants.
PADDING_SIZE = 100  # fixed number of token rows per sentence (model input length)
BATCH_SIZE = 32  # batch size used for fit / evaluate / predict
EPOCHS = 5  # epochs per training run; also the divisor for the per-epoch LR decay
CHECKPOINT_PATH_REL = "training_rel/weights.ckpt"  # where trained weights are saved and reloaded
SAMPLE_SIZE = 0.5  # fraction of the training corpus that gets preprocessed
EMBEDDING_DIMENSIONS = 50  # GloVe vector width; selects the glove.6B.<dim>d.txt file
PATIENCE = 5  # early-stopping patience handed to CustomCallback
VECTOR_VALUE = 4.0  # magnitude of the hand-made case and subject/object features
DATASET = "webnlg"  # sub-directory of data/ holding train/valid/test JSON files

In [2]:
import sys
from tqdm import tqdm
import numpy as np
import json
import time
import random
import tensorflow as tf
import tensorflow.keras.layers as layers

2023-09-20 21:57:42.536361: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-20 21:57:42.760003: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Metric objects handed to model.compile(); their names determine the
# "val_precision" / "val_recall" keys read by the training callback.
precision_metric = tf.keras.metrics.Precision(name="precision")
recall_metric = tf.keras.metrics.Recall(name="recall")

class_counter = {}

# word -> list of EMBEDDING_DIMENSIONS floats, filled from the GloVe text file.
vectors = {}
with tf.device('/CPU:0'):
    # The encoding is given explicitly so that non-ASCII vocabulary entries
    # load the same way regardless of the platform's default encoding.
    with open(
        "glove.6B." + str(EMBEDDING_DIMENSIONS) + "d.txt", "r", encoding="utf-8"
    ) as f:
        for line in tqdm(f):
            # Each line is "<token> <v1> <v2> ..." separated by single spaces.
            vals = line.rstrip().split(" ")
            vectors[vals[0]] = [float(x) for x in vals[1:]]

# Snapshot of the vocabulary order; used to pick fallback vectors for
# characters that have no GloVe entry.
dict_keys = list(vectors.keys())
print("### vocab size", len(vectors.keys()))

2023-09-20 21:57:46.920480: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-20 21:57:46.973745: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-20 21:57:46.973955: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

### vocab size 400000
3.2582





In [None]:
def get_glove_embedding(word):
    """Return the GloVe vector for ``word`` with a trailing capitalisation feature.

    The result is a new list of ``EMBEDDING_DIMENSIONS + 1`` numbers: the
    embedding followed by ``+VECTOR_VALUE`` when the first character of
    ``word`` is upper-case and ``-VECTOR_VALUE`` otherwise.

    Lookup is done on the lower-cased word. Words missing from ``vectors``
    are represented by the element-wise mean of their characters' vectors.
    A character that has no vector of its own is assigned (and cached in
    ``vectors``) the vector of ``dict_keys[ord(c) % 100]``.

    An empty token yields an all-zero embedding with the lower-case flag
    instead of raising ``IndexError``.
    """
    text = str(word)
    # bool(text) guards the empty token, which has no first character.
    starts_upper = bool(text) and text[0].isupper()
    case_vector = VECTOR_VALUE if starts_upper else (-1 * VECTOR_VALUE)

    wrd = text.lower()
    if wrd in vectors:
        return vectors[wrd] + [case_vector]

    char_embeddings = []
    for c in wrd:
        if c not in vectors:
            # Only compute the fallback when it is needed, so characters that
            # already have a vector never touch dict_keys.
            vectors[c] = vectors[dict_keys[ord(c) % 100]]
        char_embeddings.append(vectors[c])

    n_chars = len(char_embeddings)
    if n_chars == 0:
        averaged_embeddings = [0.0] * EMBEDDING_DIMENSIONS
    else:
        # Per-dimension mean: the sum restarts for every dimension and is
        # divided by the number of characters.
        averaged_embeddings = [
            sum(emb[i] for emb in char_embeddings) / n_chars
            for i in range(EMBEDDING_DIMENSIONS)
        ]

    return averaged_embeddings + [case_vector]

In [None]:
def preprocess_relations(paths):
    """Collect the distinct relation labels found in the given corpus files.

    Args:
        paths: iterable of paths to JSON files. Each file holds a list of
            sentence records with a ``"relation_list"`` of triples, each
            carrying a ``"predicate"`` string.

    Returns:
        Alphabetically sorted list of unique predicates. The sort keeps the
        label -> index mapping identical across interpreter runs; a plain
        ``list(set(...))`` changes order with string hash randomisation,
        which would mismatch weights reloaded in a fresh kernel.
    """
    all_relations = set()
    for path in paths:
        # Context manager closes the file even if JSON parsing fails.
        with open(path, encoding="utf-8") as f:
            corpus = json.load(f)
        for sentence_data in corpus:
            for trpl in sentence_data["relation_list"]:
                all_relations.add(trpl["predicate"])

    relations_list = sorted(all_relations)
    print(f"### Number of relations: {len(relations_list)} \t\t ###")
    return relations_list

In [None]:
def _entity_token_span(tokens, entity_text):
    """Return ``[head, tail]`` token indices bracketing ``entity_text``.

    ``head`` is the first occurrence of the entity's first word and ``tail``
    the first occurrence of its last word at or after ``head``.
    ``list.index`` raises ``ValueError`` when a word is absent from the
    sentence.
    """
    entity_tokens = entity_text.split(" ")
    head = tokens.index(entity_tokens[0])
    tail = tokens.index(entity_tokens[-1], head)
    return [head, tail]


def preprocess_data(path: str, size = 1.0):
    """Turn a JSON corpus into padded model inputs and multi-hot relation labels.

    Args:
        path: JSON file holding a list of records with ``"text"`` and
            ``"relation_list"`` (triples with ``"subject"``, ``"predicate"``
            and ``"object"``).
        size: fraction of the corpus to use; the first
            ``int(len(corpus) * size)`` records are processed.

    Returns:
        ``(X, Y)`` dictionaries of numpy arrays:
        ``X["sentences"]`` raw sentence strings;
        ``X["embeddings"]`` one ``PADDING_SIZE x (EMBEDDING_DIMENSIONS + 2)``
        matrix per sentence (GloVe vector, case feature, and a role feature
        that is ``+VECTOR_VALUE`` inside a subject span, ``-VECTOR_VALUE``
        inside an object span, ``0`` elsewhere);
        ``Y["relations"]`` multi-hot vectors over ``relations_vocab``;
        ``Y["relations_text"]`` the sentence's unique predicates joined by
        ``"**"`` in sorted order.

    Raises:
        AssertionError: a predicate is missing from ``relations_vocab``.
        ValueError: a subject/object boundary word is not a token of the text.

    Sentences longer than ``PADDING_SIZE`` tokens are truncated so every
    matrix has the same number of rows; without truncation the per-sentence
    matrices would be ragged and could not form one rectangular array.
    """
    # Context manager closes the file even if JSON parsing fails.
    with open(path, encoding="utf-8") as f:
        corpus = json.load(f)

    X = {
        "sentences": [],
        "embeddings": [],  # for relations task
    }
    Y = {
        "relations": [],
        "relations_text": [],
    }

    feature_width = EMBEDDING_DIMENSIONS + 1 + 1  # GloVe + case flag + role flag

    for i in tqdm(range(int(len(corpus) * size))):
        sentence_data = corpus[i]
        tokens = sentence_data["text"].split(" ")
        X["sentences"].append(sentence_data["text"])
        relations_tagged = [0] * len(relations_vocab)
        relations_text = []
        sub_ranges = []
        obj_ranges = []

        for trpl in sentence_data["relation_list"]:
            rel = trpl["predicate"]
            relations_text.append(rel)
            assert rel in relations_vocab, "Relation should be found."
            relations_tagged[relations_vocab.index(rel)] = 1

            sub_ranges.append(_entity_token_span(tokens, trpl["subject"]))
            obj_ranges.append(_entity_token_span(tokens, trpl["object"]))

        Y["relations"].append(relations_tagged)
        # Remove duplicates; sorting makes the joined string reproducible.
        Y["relations_text"].append("**".join(sorted(set(relations_text))))

        # Tokens past PADDING_SIZE would be cut anyway, so they are not embedded.
        embeddings = []
        for j, tkn in enumerate(tokens[:PADDING_SIZE]):
            in_subject = any(lo <= j <= hi for lo, hi in sub_ranges)
            in_object = any(lo <= j <= hi for lo, hi in obj_ranges)

            # Subject membership takes precedence when spans overlap.
            if in_subject:
                role = VECTOR_VALUE
            elif in_object:
                role = -1 * VECTOR_VALUE
            else:
                role = 0
            embeddings.append(get_glove_embedding(tkn) + [role])

        # Right-pad with all-zero rows up to the fixed sequence length.
        embeddings.extend(
            (PADDING_SIZE - len(embeddings)) * [feature_width * [0.0]]
        )
        X["embeddings"].append(embeddings)

    # to numpy to get memory space
    for k in X.keys():
        X[k] = np.array(X[k])
        print(
            f"{k}\tshape\t{ X[k].shape }\tsize\t{sys.getsizeof(X[k]) / 1024 / 1024:.4} MB",
        )

    for k in Y.keys():
        Y[k] = np.array(Y[k])
        print(
            f"{k}\tshape\t{ Y[k].shape }\tsize\t{sys.getsizeof(Y[k]) / 1024 / 1024:.4} MB",
        )

    return X, Y

In [None]:
# Label vocabulary is built from all three splits so every predicate seen in
# validation or test data has an output index.
relations_vocab = preprocess_relations(
    [f"data/{DATASET}/{split}.json" for split in ("train", "valid", "test")]
)

# Only a SAMPLE_SIZE fraction of the training split is used; the validation
# and test splits are preprocessed in full.
train_X, train_Y = preprocess_data(f"data/{DATASET}/train.json", SAMPLE_SIZE)
valid_X, valid_Y = preprocess_data(f"data/{DATASET}/valid.json", 1)
test_X, test_Y = preprocess_data(f"data/{DATASET}/test.json", 1)

In [None]:
class DiceLoss(tf.keras.losses.Loss):
    """Dice-style loss with a separate value for low-scoring negative labels.

    Args:
        smooth: additive constant used in both numerator and denominator.
        gama: exponent applied to predictions and labels in the denominator.

    NOTE(review): the model in this notebook feeds this loss from a Dense
    layer with ``activation=None``, so ``y_pred`` appears to be unbounded
    rather than a probability -- confirm that this is intended.
    """

    def __init__(self, smooth=1e-6, gama=2):
        super().__init__()
        self.name = 'NDL'
        self.smooth = smooth
        self.gama = gama

    def call(self, y_true, y_pred):
        """Return the element-wise loss tensor.

        Every sum runs over all elements of the tensors passed in. Elements
        whose label is 0 and whose prediction lies strictly between -1.0 and
        0.5 receive ``smooth**2 / denominator``; all other elements receive
        ``1 - (2 * sum(y_pred * y_true) + smooth) / denominator``.
        """
        y_true = tf.cast(y_true, dtype=tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)

        # Shared denominator: sum(pred^gama) + sum(true^gama) + smooth.
        power_sum = (
            tf.reduce_sum(y_pred ** self.gama)
            + tf.reduce_sum(y_true ** self.gama)
            + self.smooth
        )
        overlap = 2 * tf.reduce_sum(tf.multiply(y_pred, y_true)) + self.smooth

        # Label is 0 and the prediction lies in the open interval (-1.0, 0.5).
        low_scoring_negative = tf.logical_and(
            tf.equal(y_true, tf.constant(0.0)),
            tf.logical_and(
                tf.less(y_pred, tf.constant(0.5)),
                tf.greater(y_pred, tf.constant(-1.0)),
            ),
        )

        return tf.where(
            low_scoring_negative,
            tf.divide(self.smooth ** 2, power_sum),
            1 - tf.divide(overlap, power_sum),
        )

    def call2(self, y_true, y_pred):
        """Dice loss without the special case: ``1 - dice`` as a single value."""
        y_true = tf.cast(y_true, dtype=tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)

        overlap = 2 * tf.reduce_sum(tf.multiply(y_pred, y_true)) + self.smooth
        power_sum = (
            tf.reduce_sum(y_pred ** self.gama)
            + tf.reduce_sum(y_true ** self.gama)
            + self.smooth
        )
        return 1 - tf.divide(overlap, power_sum)


In [None]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Linear learning-rate decay with F1-based early stopping and best-weight restore.

    After every epoch the optimizer's learning rate is reduced by ``decay``
    (never below zero) and validation F1 is computed from the
    ``val_precision`` / ``val_recall`` log entries. The weights of the
    best-F1 epoch are remembered and put back when training ends.

    Args:
        patience: number of consecutive epochs without an F1 improvement
            after which training is stopped.
        decay: amount subtracted from the learning rate after each epoch.
    """

    def __init__(self, patience, decay):
        # Let the base class set up its own attributes.
        super().__init__()
        self.best_f1 = 0.0
        self.best_weights = None
        self.noprogress_counter = 0
        self.patience = patience
        self.decay = decay

    def _restore_best_weights(self):
        """Load the best recorded weights, if any epoch ever improved on F1 = 0."""
        # best_weights stays None when F1 never rose above zero; passing None
        # to set_weights would raise, so keep the current weights instead.
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}

        # Read and write through the same `learning_rate` attribute, and clamp
        # so floating-point drift cannot produce a negative rate.
        optimizer = self.model.optimizer
        lr = float(tf.keras.backend.get_value(optimizer.learning_rate))
        lr = max(lr - self.decay, 0.0)
        tf.keras.backend.set_value(optimizer.learning_rate, lr)
        print("\nLearning rate now is %6.6f." % (lr))

        # The small constant keeps the division defined when both are zero.
        f1 = (2 * logs["val_precision"] * logs["val_recall"]) / (
            logs["val_precision"] + logs["val_recall"] + 0.000001
        )
        if f1 > self.best_f1:
            self.best_f1 = f1
            print(f"\nValidation f1 {f1:.4} epoch {epoch}")
            self.best_weights = self.model.get_weights()
            self.noprogress_counter = 0
        else:
            # Count this stale epoch first, then compare, so training stops
            # after exactly `patience` epochs without improvement.
            self.noprogress_counter += 1
            if self.noprogress_counter >= self.patience:
                self.model.stop_training = True

    def on_train_end(self, logs=None):
        self._restore_best_weights()

In [None]:
def create_relations_model(params):
    """Build and compile the BiLSTM relation classifier.

    Args:
        params: sequence ``[lstm_units, pool_size, pool_strides,
            dropout_rate, learning_rate]``.

    Returns:
        A compiled ``tf.keras.Model`` mapping a
        ``(PADDING_SIZE, EMBEDDING_DIMENSIONS + 2)`` input to
        ``len(relations_vocab)`` outputs. It is trained with ``DiceLoss`` and
        tracks the module-level ``precision_metric`` / ``recall_metric``.

    NOTE(review): the output layer has no activation while the metrics are
    created with their default settings -- confirm that scoring raw outputs
    this way is intended.
    """
    inputs = layers.Input(shape=(PADDING_SIZE, EMBEDDING_DIMENSIONS + 1 + 1))

    bilstm = layers.Bidirectional(
        layers.LSTM(
            params[0],
            return_sequences=True,
            activation="tanh",
        )
    )(inputs)

    # value = SelfAttention(is_residual = True,
    # attention_activation='relu'
    #  )(bilstm)

    # Average neighbouring time steps before flattening.
    avg1 = layers.AveragePooling1D(
        pool_size=params[1], strides=params[2], padding="same"
    )(bilstm)

    flt = layers.Flatten()(avg1)

    dropout_layer = layers.Dropout(params[3])(flt)

    output_layer = layers.Dense(
        len(relations_vocab), name="rel_output", activation=None
    )(dropout_layer)

    model = tf.keras.Model(
        inputs=[inputs],
        outputs=[
            output_layer,
        ],
    )
    model.compile(
        optimizer=tf.optimizers.Adam(
            learning_rate=params[4],
        ),
        loss=DiceLoss(),
        # loss = 'binary_crossentropy',
        # loss = 'categorical_crossentropy',
        # loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=[precision_metric, recall_metric],
    )
    # summary() prints the table itself and returns None, so it is called on
    # its own line; passing it to print() would emit a stray "None".
    model.summary()
    print(params)
    return model

In [None]:
def run_relations_training(params, evaluate_on_test=False):
    """Train one relation model, save its weights and print the training history.

    Args:
        params: hyper-parameter list for ``create_relations_model``;
            ``params[4]`` (the learning rate) also sets the per-epoch decay
            ``params[4] / EPOCHS``.
        evaluate_on_test: when True, also evaluate on the test split and print
            the test F1. Defaults to False, which matches the previous
            behaviour where this step was never reached.

    Returns:
        None.

    The three RNG seeds are themselves drawn at random, so runs are
    intentionally not reproducible.
    """
    np.random.seed(random.randint(100, 9999))
    random.seed(random.randint(100, 9999))
    tf.random.set_seed(random.randint(100, 9999))
    relations_model = create_relations_model(params)
    relationsCallback = CustomCallback(PATIENCE, (params[4] / EPOCHS))
    history = relations_model.fit(
        x=train_X["embeddings"],
        y=train_Y["relations"],
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(
            valid_X["embeddings"],
            valid_Y["relations"],
        ),
        verbose=1,
        # `callbacks` is documented as a list of callbacks.
        callbacks=[relationsCallback],
    )

    # save model
    relations_model.save_weights(CHECKPOINT_PATH_REL)

    # Prints a running index followed by each (metric name, per-epoch values) pair.
    for position, metric_item in enumerate(history.history.items()):
        print(position)
        print(metric_item)

    if not evaluate_on_test:
        return None

    # res = [loss, precision, recall], following the compiled metric order.
    res = relations_model.evaluate(
        x=test_X["embeddings"],
        y=test_Y["relations"],
        batch_size=BATCH_SIZE,
    )

    # F1 is defined as 0 when the model made no correct positive predictions.
    pr_sum = res[1] + res[2]
    f1 = 2 * res[1] * res[2] / pr_sum if pr_sum else 0.0

    print(f"test f1 {f1:.4}")
    return None

In [None]:
def tune_relations():
    """Grid-search the relation model and print test F1 for every configuration.

    For each combination of LSTM units, pooling size, pooling stride, dropout
    and learning rate, a fresh model is trained on the training split and
    evaluated on the test split. The parameters, elapsed minutes and test F1
    are printed. Configurations listed in ``done_list`` are skipped.
    """
    import itertools

    # Fill in with [units, pool, stride, dropout, lr] lists to skip finished runs.
    done_list = []

    bilstm = [
        500,
    ]
    avg1 = [50, 80, 100]
    avg2 = [2, 3, 5, 7, 10]
    drop = [
        0.15,
    ]
    lr = [
        0.0015,
        0.001,
        0.0005,
    ]

    # product() varies the last iterable fastest, reproducing the order of the
    # original nesting: units -> pool size -> learning rate -> dropout -> stride.
    for a, b, e, d, c in itertools.product(bilstm, avg1, lr, drop, avg2):
        params = [a, b, c, d, e]

        if params in done_list:
            continue

        start_time = time.time()

        np.random.seed(random.randint(100, 9999))
        random.seed(random.randint(100, 9999))
        tf.random.set_seed(random.randint(100, 9999))
        relations_model = create_relations_model(params)
        relationsCallback = CustomCallback(PATIENCE, (e / EPOCHS))
        relations_model.fit(
            x=train_X["embeddings"],
            y=train_Y["relations"],
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_data=(
                valid_X["embeddings"],
                valid_Y["relations"],
            ),
            verbose=0,
            # `callbacks` is documented as a list of callbacks.
            callbacks=[relationsCallback],
        )

        # res = [loss, precision, recall], following the compiled metric order.
        res = relations_model.evaluate(
            x=test_X["embeddings"],
            y=test_Y["relations"],
            batch_size=BATCH_SIZE,
        )
        finish_time = time.time()

        # A configuration that predicts nothing correctly scores 0 rather than
        # aborting the sweep with a ZeroDivisionError.
        pr_sum = res[1] + res[2]
        test_f1 = (2 * res[1] * res[2] / pr_sum) if pr_sum else 0.0

        print("params", params)
        print("time", str((finish_time - start_time) / 60))
        print("test f1 ", test_f1)

In [None]:
def evaluate(params):
    """Score the saved relation model on the test split, counting by relation name.

    The test split is preprocessed again, a model is rebuilt from ``params``,
    the weights at ``CHECKPOINT_PATH_REL`` are loaded, and every output that
    is >= 0.5 counts as a predicted relation.

    Args:
        params: hyper-parameter list; must match the configuration the
            checkpoint was trained with.

    Returns:
        ``(precision, recall, f1_score)``. Each value is 0.0 when its
        denominator is zero. The same numbers are printed.
    """
    test_X, test_Y = preprocess_data("data/" + DATASET + "/test.json", 1)

    relations_model = create_relations_model(params)
    relations_model.load_weights(CHECKPOINT_PATH_REL)
    predictions = relations_model.predict(
        x=test_X["embeddings"],
        batch_size=BATCH_SIZE,
    )

    ground_truth_count = predicted_count = correct_count = 0

    for i, ground_truth in enumerate(test_Y["relations_text"]):
        # "".split("**") gives [""], so drop empty entries; otherwise a
        # sentence without relations would count as one ground-truth relation.
        ground_truth_rel_list = [rel for rel in ground_truth.split("**") if rel]
        ground_truth_count += len(ground_truth_rel_list)

        for j, pred_probability in enumerate(predictions[i]):
            if pred_probability >= 0.5:
                predicted_count += 1
                rel = relations_vocab[j]
                if rel in ground_truth_rel_list:
                    correct_count += 1

    # Guard each ratio so an empty prediction set or an empty gold set
    # reports 0.0 instead of raising ZeroDivisionError.
    precision = correct_count / predicted_count if predicted_count else 0.0
    recall = correct_count / ground_truth_count if ground_truth_count else 0.0
    pr_sum = precision + recall
    f1_score = 2 * precision * recall / pr_sum if pr_sum else 0.0

    print("\n\n\n\n")
    print(
        f"correct_count:{correct_count}, predicted_count:{predicted_count}, ground_truth_count:{ground_truth_count}"
    )
    print(f"precision :{precision}, recall :{recall}, f1 :{f1_score}")
    print("\n\n\n\n")

    return precision, recall, f1_score

In [None]:
# Hyper-parameters, in the order create_relations_model reads them:
# [LSTM units, pooling size, pooling stride, dropout rate, learning rate]
rel_model_params = [500, 80, 2, 0.15, 0.0015]

# Train and checkpoint the model, then score the checkpoint on the test split.
with tf.device('/GPU:0'):
    run_relations_training(rel_model_params)
    # tune_relations()
    evaluate(rel_model_params)

In [None]:
# import tensorflow as tf
# mnist = tf.keras.datasets.mnist

# (x_train, y_train),(x_test, y_test) = mnist.load_data()
# x_train, x_test = x_train / 255.0, x_test / 255.0

# model = tf.keras.models.Sequential([
#   tf.keras.layers.Flatten(input_shape=(28, 28)),
#   tf.keras.layers.Dense(128, activation='relu'),
#   tf.keras.layers.Dropout(0.2),
#   tf.keras.layers.Dense(10, activation='softmax')
# ])

# model.compile(optimizer='adam',
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])

# model.fit(x_train, y_train, epochs=5)
# model.evaluate(x_test, y_test)