In [1]:
from networkx.algorithms.traversal.depth_first_search import dfs_tree

import tensorflow as tf

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import json
import logging
import os
import numpy as np
import pandas as pd

import networkx as nx
import tensorflow_addons as tfa

from tensorflow.keras.preprocessing.sequence import pad_sequences  # type: ignore

from keras.models import Model
from keras.layers import Input, Embedding, Dense, TimeDistributed, Dropout,\
    Bidirectional, concatenate, SpatialDropout1D, GRU
from tensorflow.keras.utils import to_categorical  # type: ignore

In [2]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(
    filename='craft.log',
    level=logging.INFO,
    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

In [8]:
# BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath("__file__")))
BASE_DIR = '/content/drive/MyDrive/NEMO'
DATA_DIR = os.path.join(BASE_DIR, "data")
MODEL_DIR = os.path.join(BASE_DIR, "model_output", "Experiments")
DATASET_LOC = os.path.join(DATA_DIR, "model_input", "dataset")
direct_parent = os.path.join(DATA_DIR, "GO_Category", "GO_DirectParents.tsv")

go_category = ["GO:0008150", "GO:0005575", "GO:0003674"]

In [4]:
def get_sim(term1, term2):
    if "GO" in term1 and "GO" in term2:
        term1 = term1.replace("B-", "").replace("I-", "")
        term2 = term2.replace("B-", "").replace("I-", "")
        t1 = set(subsumers.get(term1, term1))
        t2 = set(subsumers.get(term2, term2))
        if len(set.union(t1, t2)) > 0:
            simj = len(set.intersection(t1, t2)) / len(set.union(t1, t2))
        else:
            simj = 0.0
    else:
        simj = 0.0
    return simj

In [5]:
def get_optimizer(opt, lr):
    if opt == 'adam':
        return tf.keras.optimizers.Adam(
            learning_rate=lr, beta_1=0.9, beta_2=0.999)
    elif opt == 'adamw':
        step = tf.Variable(0, trainable=False)
        schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            [10000, 15000], [1e-0, 1e-1, 1e-2])
        # lr and wd can be a function or a tensor
        lr = lr * schedule(step)

        def wd():
            return 1e-4 * schedule(step)
        # wd = lambda: 1e-4 * schedule(step)  # type: ignore
        return tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
    elif opt == 'rmsprop':
        return tf.keras.optimizers.RMSprop(learning_rate=lr)

In [6]:
def get_loss(loss):
    if 'categoricalCE' in loss:
        if 'logits' in loss:
            return tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        else:
            return tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    elif 'sigfocalCE' in loss:
        if 'logits' in loss:
            return tfa.losses.SigmoidFocalCrossEntropy(from_logits=True)
        else:
            return tfa.losses.SigmoidFocalCrossEntropy(from_logits=False)

In [9]:
logger.info('Creating ontology heirarchy')
direct_data = pd.read_csv(
    direct_parent, delimiter="\t",
    names=['Child', 'Parent']).replace({"_": ":"}, regex=True)
direct_data = direct_data.drop(0).reset_index(drop=True)
onto_digraph = nx.from_pandas_edgelist(
    direct_data, source='Child', target='Parent',
    create_using=nx.classes.digraph.DiGraph)
onto_info = "Number of nodes: {0}\nNumber of edges: {1}".format(
    onto_digraph.number_of_nodes(),
    onto_digraph.number_of_edges(),
)
print(onto_info)
logger.info(onto_info)

Number of nodes: 50860
Number of edges: 77512


In [10]:
logger.info('Creating list of subsumers')
subsumers = dict(
    (i, list(
        set(np.array(dfs_tree(onto_digraph, i).edges()).flatten().tolist()
            + [i]) - set(["owl:Thing"])
        )) for i in onto_digraph.nodes())

In [11]:
logger.info('Parameters and hyperparameters for model training')
config = {
    "weight": 0.5,
    "learning_rate": 0.001,
    "epochs": 200,
    "batch_size": 64,
    "activation": 'softmax', 
    "rdropout": 0.3,
    "optimizer": 'adamw',
    "loss": 'sigfocalCE',
    "callbacks": [
        {
            "early_stop": tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=10,
                verbose=1,
                restore_best_weights=True)
        },
    ],
    "learning_rate_func": 'cosinedecay',
    "max_len": 71,
    "max_char_len": 15,
    "min_sent_len": 3,
    "project": "Intelligent_OA",
    "extra_info": "CRAFT, inputs: Word(30D), POS(100D)",
    "dropout": 0.5,
    "name": "CRAFT"
}
print(config)
logger.info(config)

{'weight': 0.5, 'learning_rate': 0.001, 'epochs': 200, 'batch_size': 64, 'activation': 'softmax', 'rdropout': 0.3, 'optimizer': 'adamw', 'loss': 'sigfocalCE', 'callbacks': [{'early_stop': <keras.callbacks.EarlyStopping object at 0x7f02f7b8c1d0>}], 'learning_rate_func': 'cosinedecay', 'max_len': 71, 'max_char_len': 15, 'min_sent_len': 3, 'project': 'Intelligent_OA', 'extra_info': 'CRAFT, inputs: Word(30D), POS(100D)', 'dropout': 0.5, 'name': 'CRAFT'}


In [12]:
train_data = json.load(open(os.path.join(DATASET_LOC, "train.json"), "r"))
train_data = [
    i for i in train_data if len(i['tokens']) >= config.get("min_sent_len")
]

test_data = json.load(open(os.path.join(DATASET_LOC, "test.json"), "r"))
test_data = [
    i for i in test_data if len(i['tokens']) >= config.get("min_sent_len")
]
input_data = train_data + test_data

all_data = {
    "tokens": [i['tokens'] for i in input_data],
    "tags": [i['iob_tags'] for i in input_data],
    "pos": [i['pos_tags'] for i in input_data],
}
assert (
    len(all_data['tokens'])
    == len(all_data['tags'])
    == len(all_data['pos'])
)

In [13]:
print('Creating training and test dataset')
logger.info('Creating training and test dataset')
words = ["PAD"] + sorted(
    set([j for i in all_data['tokens'] for j in i] + ["UNK", "O"])
    - set(["PAD"]))
tags = ["PAD"] + sorted(
    set([j for i in all_data['tags'] for j in i] + ["UNK", "O"])
    - set(["PAD"]))
chars = ["PAD"] + sorted(
    set([j for i in words for j in i] + ["UNK", "O"])
    - set(["PAD"]))
pos = ["PAD"] + sorted(
    set([j for i in all_data['pos'] for j in i] + ["UNK", "O"])
    - set(["PAD"]))

n_words, n_tags, n_chars, = len(words), len(tags), len(chars)
n_pos = len(pos)

corpus_info = (
    "\nNumber of Observations:{0}\nNumber of words:{1}"
    "\nNumber of tags:{2}\nNumber of characters: {3}"
    "\nNumber of pos: {4}".format(
        len(all_data['tokens']), n_words, n_tags, n_chars, n_pos)
    )
print(corpus_info)
logger.info(corpus_info)

Creating training and test dataset

Number of Observations:28880
Number of words:34194
Number of tags:1775
Number of characters: 158
Number of pos: 50


In [14]:
word_to_idx = dict((i, idx) for idx, i in enumerate(words))
idx_to_word = dict((v, k) for k, v in word_to_idx.items())

tag_to_idx = dict((i, idx) for idx, i in enumerate(tags))
idx_to_tag = dict((v, k) for k, v in tag_to_idx.items())

char_to_idx = dict((i, idx) for idx, i in enumerate(chars))
idx_to_char = dict((v, k) for k, v in char_to_idx.items())

pos_to_idx = dict((i, idx) for idx, i in enumerate(pos))
idx_to_pos = dict((v, k) for k, v in pos_to_idx.items())

In [15]:
logger.info('Creating output labels: one hot encodings')
print('Creating output labels: one hot encodings')
Y_tags = [[tag_to_idx.get(i) for i in sent] for sent in all_data['tags']]
Y_tags = pad_sequences(
    maxlen=config.get("max_len"), sequences=Y_tags,
    value=tag_to_idx.get("PAD"), padding='post',
    truncating='post', dtype='float16')
Y_tags = to_categorical(Y_tags, num_classes=n_tags, dtype='float16')

logger.info('Creating semantic embedding from subsumers information')
print('Creating semantic embedding from subsumers information')
sem_dist = dict(
    [(i, to_categorical(i, num_classes=n_tags)) for i in range(n_tags)])
factor = 0
for i in range(n_tags):
    iob_i = None
    term_i = idx_to_tag.get(i)
    if "B-" in term_i or "I-" in term_i:
        iob_i = term_i[0]
    term_i = term_i.replace("B-", "").replace("I-", "")
    if "GO" in term_i:
        sem_scores = []
        for j in range(n_tags):
            iob_j = None
            term_j = idx_to_tag.get(j)
            if "B-" in term_j or "I-" in term_j:
                iob_j = term_j[0]
            term_j = term_j.replace("B-", "").replace("I-", "")
            score = config.get('weight') * get_sim(term_i, term_j)
            if iob_i != iob_j:
                score = factor * score
            sem_scores.append(score)
        sem_scores = np.array(sem_scores)
        sem_scores[i] = 1
        sem_dist[i] = sem_scores

for i in range(n_tags):
    num_max = np.where(sem_dist[i] == 1)[0].size
    assert num_max == 1

for i in range(len(Y_tags)):
    for j in range(config.get("max_len")):
        k = np.where(Y_tags[i][j] == 1)[0][0]
        Y_tags[i][j] = sem_dist[k]

Creating output labels: one hot encodings
Creating semantic embedding from subsumers information


In [16]:
logger.info('Creating input dataset')
print('Creating input dataset')
X_word = [[word_to_idx.get(w) for w in s] for s in all_data['tokens']]
X_word = pad_sequences(
    maxlen=config.get("max_len"), sequences=X_word,
    value=word_to_idx["PAD"], padding='post', truncating='post',
    dtype='float16')

X_char_temp = []
for wds in all_data['tokens']:
    wds = wds[:config.get("max_len")] + ["PAD"]*(
        config.get("max_len") - len(wds))
    chrs = [list(word)[:config.get("max_char_len")] + ["PAD"]*(
        config.get("max_char_len")-len(word))
        if word != "PAD" else ["PAD"]*config.get("max_char_len")
        for word in wds]
    X_char_temp.append(np.array(chrs))
X_char_temp = np.array(X_char_temp)
X_char = np.vectorize(char_to_idx.get)(X_char_temp).astype('float16')
del X_char_temp

X_pos = [[pos_to_idx.get(w) for w in s] for s in all_data['pos']]
X_pos = pad_sequences(
    maxlen=config.get("max_len"), sequences=X_pos,
    value=pos_to_idx.get("PAD"), padding='post', truncating='post',
    dtype='float16')

max_idx = 0
for i in range(len(X_word), 0, -1):
    if (
        i * 0.7 % config.get("batch_size") == 0 and
        i * 0.3 % config.get("batch_size") == 0
    ):
        max_idx = i
        break

combined = [(X_word[i], X_char[i], X_pos[i]) for i in range(max_idx)]
Y_tags = Y_tags[:max_idx]

Creating input dataset


In [17]:
logger.info('Dividing dataset into 80-20 split')
print('Dividing dataset into 80-20 split')
X_tr, X_te, y_tr, y_te = train_test_split(
    combined, Y_tags, test_size=0.3, random_state=2022)

input_train = []
for i in range(len(combined[0])):
    temp = []
    for j in range(len(X_tr)):
        temp.append(X_tr[j][i])
    input_train.append(np.array(temp, dtype='float16'))
input_train = tuple(input_train)

input_test = []
for i in range(len(combined[0])):
    temp = []
    for j in range(len(X_te)):
        temp.append(X_te[j][i])
    input_test.append(np.array(temp, dtype='float16'))
input_test = tuple(input_test)

X_train_dataset = tf.data.Dataset.from_tensor_slices(input_train)
X_test_dataset = tf.data.Dataset.from_tensor_slices(input_test)
Y_train_dataset = tf.data.Dataset.from_tensor_slices(y_tr)
Y_test_dataset = tf.data.Dataset.from_tensor_slices(y_te)

train_dataset = tf.data.Dataset.zip((X_train_dataset, Y_train_dataset))
test_dataset = tf.data.Dataset.zip((X_test_dataset, Y_test_dataset))

train_dataset = train_dataset.shuffle(1000).batch(config['batch_size'])
test_dataset = test_dataset.batch(config['batch_size'])

Dividing dataset into 80-20 split


In [18]:
logger.info('Defining deep learning architecture')
print('Defining deep learning architecture')

# input and embedding for words
word_in = Input(
    shape=(config.get("max_len"),), name="WORD")
emb_word = Embedding(
    input_dim=n_words, output_dim=30,
    input_length=config.get("max_len"), mask_zero=True)(word_in)

# input and embeddings for characters
char_in = Input(
    shape=(config.get("max_len"), config.get("max_char_len"),),
    name="CHAR")
emb_char = TimeDistributed(
    Embedding(
        input_dim=n_chars, output_dim=100,
        input_length=config.get("max_char_len"),
        mask_zero=True))(char_in)
char_enc = TimeDistributed(
    GRU(
        units=150, return_sequences=False, recurrent_dropout=0.5)
        )(emb_char)

# input and embeddings for pos
pos_in = Input(shape=(config.get("max_len"),), name="POS")
emb_pos = Embedding(
    input_dim=n_pos, output_dim=100, input_length=config.get("max_len"),
    mask_zero=True, name="EMB_POS")(pos_in)

# main LSTM
x = concatenate([emb_word, char_enc, emb_pos])
x = SpatialDropout1D(config.get("dropout"))(x)
main_lstm = Bidirectional(
    GRU(
        units=150, return_sequences=True,
        recurrent_dropout=config['rdropout']))(x)
main_lstm = TimeDistributed(Dense(3200, activation='relu'))(main_lstm)
main_lstm = Dropout(config.get("dropout"))(main_lstm)
out = TimeDistributed(
    Dense(
        n_tags, activation=config['activation']
    ), name="OUT_TAGS")(main_lstm)

model = Model([word_in, char_in, pos_in], out)
model.compile(
    optimizer=get_optimizer(config['optimizer'], config['learning_rate']),
    loss=get_loss(config['loss']), metrics=["acc"])
print(model.summary())
logger.info(model.summary())

Defining deep learning architecture
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 CHAR (InputLayer)              [(None, 71, 15)]     0           []                               
                                                                                                  
 WORD (InputLayer)              [(None, 71)]         0           []                               
                                                                                                  
 time_distributed (TimeDistribu  (None, 71, 15, 100)  15800      ['CHAR[0][0]']                   
 ted)                                                                                             
                                                                                                  
 POS (InputLayer)               [(None, 71)]         0    

In [19]:
model_arch = tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
logger.info('Starting model training')
print('Starting model training')
history = model.fit(
    train_dataset,
    batch_size=config['batch_size'],
    epochs=config['epochs'],
    validation_data=test_dataset,
    verbose=1,
    callbacks=[v for i in config.get("callbacks") for v in i.values()]
)

print('Model training complete')
logger.info('Model training complete')

In [21]:
del X_word, X_char, X_pos, Y_tags, combined, input_train, X_train_dataset
del Y_train_dataset, train_dataset

In [23]:
logger.info('Making predictions')
pred = []
step = 100
pbar = tqdm(total=len(X_te), desc="Making predictions:")
for i in range(int(len(X_te)/step)+1):
    inp = [j[i*step:(i+1)*step] for j in input_test]
    if inp[0].shape[0] != 0:
        temp = model.predict(inp)
    pred.append(temp)
    pbar.update(temp.shape[0])
pred = np.concatenate(pred)
pbar.close()

Making predictions:: 100%|██████████| 8640/8640 [00:27<00:00, 316.98it/s]


In [24]:
logger.info('Calculating F1 score and semantic similarity scores')
word = np.vectorize(idx_to_word.get)(input_test[0]).flatten()
ground_truth = np.vectorize(idx_to_tag.get)(
    np.argmax(y_te, axis=-1)).flatten()
predictions = (np.vectorize(idx_to_tag.get)(
    np.argmax(pred, axis=-1))).flatten().tolist()
two_predictions = (np.vectorize(idx_to_tag.get)(
    np.argsort(-1*pred, axis=-1)[:, :, :2])).reshape(
        pred.shape[0]*pred.shape[1], 2).tolist()

In [25]:
pd_data = pd.DataFrame({
    "Word": word,
    "Ground_Truth": ground_truth,
    "Prediction": predictions,
    "Top_Two_Predictions": two_predictions,
})

In [27]:
pd_data.drop(pd_data[pd_data['Ground_Truth'] == "PAD"].index, inplace=True)
pd_data.drop(
    pd_data[
        (pd_data["Ground_Truth"] == "O") &
        (pd_data["Prediction"] == "O")
    ].index, inplace=True)
pd_data.drop(
    pd_data[
        (pd_data["Ground_Truth"] == "EOS") &
        (pd_data["Prediction"] == "EOS")
    ].index, inplace=True)

pd_data['Comparison'] = pd_data.apply(
    lambda x: x[1] if x[1] in x[-1] else x[-1][-1], axis=1)

logger.info('Creating classification report')
top_report = classification_report(
    pd_data['Ground_Truth'],
    pd_data['Prediction'],
    zero_division=False,
    digits=4,
)
# print(top_report)
score_iob = ({
    "IOB_F1": top_report.splitlines()[-1].split()[-2],
    "IOB_Sim": np.round(
        pd_data[['Prediction', 'Ground_Truth']].apply(
            lambda x: get_sim(x[0], x[1]), axis=1).mean(), 4)
})
print(score_iob)
logger.info(score_iob)
df1 = pd_data.copy().replace({"B-GO:": "GO:", "I-GO:": "GO:"}, regex=True)
report = classification_report(
    df1['Ground_Truth'],
    df1['Prediction'],
    zero_division=False,
    digits=4,
)
# print(report)
score_top_one = ({
    "F1": report.splitlines()[-1].split()[-2],
    "Sim": np.round(
        df1[['Prediction', 'Ground_Truth']].apply(
            lambda x: get_sim(x[0], x[1]), axis=1).mean(), 4)
})
print(score_top_one)
logger.info(score_top_one)
df2 = df1.copy()
df2.drop(
    df2[
        (df2["Ground_Truth"] == "O") &
        (df2["Comparison"] == "O")
    ].index, inplace=True)
df2.drop(
    df2[
        (df2["Ground_Truth"] == "EOS") &
        (df2["Comparison"] == "EOS")
    ].index, inplace=True)

top_two_report = classification_report(
    df2['Ground_Truth'],
    df2['Comparison'],
    zero_division=False,
    digits=4,
)
# print(top_two_report)
score_top_two = ({
    "F1_Top_2": top_two_report.splitlines()[-1].split()[-2],
    "Sim_Top_2": np.round(
        df2[['Comparison', 'Ground_Truth']].apply(
            lambda x: get_sim(x[0], x[1]), axis=1).mean(), 4)
})
print(score_top_two)
logger.info(score_top_two)

{'IOB_F1': '0.1119', 'IOB_Sim': 0.1384}
{'F1': '0.1221', 'Sim': 0.1384}
{'F1_Top_2': '0.0478', 'Sim_Top_2': 0.2032}
