##Mount Drive and Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My Drive/nlp_hw3/code

/content/drive/My Drive/nlp_hw3/code


In [0]:
import os
import numpy as np
from typing import Tuple, List, Dict

import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.callbacks import TensorBoard
from tensorflow.python.eager import context
import json
import pandas as pd
from nltk.corpus import stopwords

import my_utils as utils
import corpora

##Preprocessing

In [0]:
def load_train_dataset(input_path: str, y_path: str) -> Tuple[List[str], List[str]]:
    """
    :param input_path; Path to the input dataset
    :param label_path; Path to the file containing the corresponding labels for the input dataset
    :return sentences; List of sentences in input_file
    :return labels; List of corresponding word segment codes in label_path. Same len as sentences
    """
    sentences = []
    k = 0
    with open(input_path, "r", encoding="utf-8-sig") as file:
        for line in file:
            k += 1
            sentences.append(line.strip())
#             if (k >= 4000):
#                 break

    y = []
    k = 0
    with open(y_path, "r", encoding="utf-8-sig") as file:
        for line in file:
            k += 1
            y.append(line.strip())
#             if (k >= 4000):
#                 break

    return sentences, y


def make_X_vocab(sentences: List[str]) -> Dict[str, int]:
    '''
    :param sentences; List of input sentences from the dataset
    :return unigrams_vocab; Dictionary from unigram to int
    :return bigrams_vocab; Dictionary from bigram to int
    '''
    vocab = {"UNK": 0}

    for sentence in sentences:
        for word in sentence.split():
            if word not in vocab:
                vocab[word] = len(vocab)

    return vocab


def make_Y_vocab(y: List[str]) -> Dict[str, int]:
    """
    :param labels; List of label codes
    :return labels_vocab; Dictionary from label code to int 
    """
    y_vocab = {"UNK": 0}
    
    for y_line in y:
        for y_word in y_line.split():
            if y_word not in y_vocab:
                y_vocab[y_word] = len(y_vocab)
                
    return y_vocab

def make_Y(output: List[str], output_vocab: Dict[str, int]) -> np.ndarray:
    """
    :param labels; List of word segment codes, line by line
    :param labels_vocab; Label codes vocab
    :return y; Vector of label code indices
    """
    y = []
    for output_line in output:
        y_temp = []
        for single_output in output_line.split():
            if single_output in output_vocab:
                y_temp.append( output_vocab[single_output])
            else:
                y_temp.append( output_vocab["OTHERS"])
        y.append(np.array(y_temp))
    
    return np.array(y)

In [0]:
train_data_path = '../data/Training_Corpora/semcor'
test_data_path = '../data/Evaluation_Datasets/senseval3'

#Parse training data
corpora_xml_path = train_data_path + '/semcor.data.xml'
gold_mapping_path = train_data_path + '/semcor.gold.key.txt'
resources_path = '../resources/'

corpora.extract_training_data(corpora_xml_path, gold_mapping_path, resources_path, train_data_path)

In [0]:
#Parse validation data
corpora_xml_path = test_data_path + '/senseval3.data.xml'
gold_mapping_path = test_data_path + '/senseval3.gold.key.txt'
resources_path = '../resources/'

corpora.extract_training_data(corpora_xml_path, gold_mapping_path, resources_path, test_data_path)

In [0]:
sentences, y = load_train_dataset(train_data_path+"/trainX.txt", train_data_path+"/trainy.txt")
_, y_dom = load_train_dataset(train_data_path+"/trainX.txt", train_data_path+"/trainy_dom.txt")
_, y_lex = load_train_dataset(train_data_path+"/trainX.txt", train_data_path+"/trainy_lex.txt")

test_sentences, test_y = load_train_dataset(test_data_path+"/trainX.txt", test_data_path+"/trainy.txt")
_, test_y_dom = load_train_dataset(test_data_path+"/trainX.txt", test_data_path+"/trainy_dom.txt")
_, test_y_lex = load_train_dataset(test_data_path+"/trainX.txt", test_data_path+"/trainy_lex.txt")

In [16]:
print(len(sentences))
print(sentences[0])

37168
How long has it been since you reviewed the objectives of your benefit and service program


In [17]:
print(len(y))
print(y[0])

37168
how bn:00106124a have it bn:00083181v since you bn:00092618v the bn:00002179n of you bn:00009904n and bn:00070654n bn:00064646n


In [18]:
vocab = make_X_vocab(sentences+test_sentences)
print(len(vocab))

48989


In [19]:
y_vocab = make_Y_vocab(y + test_y)
y_dom_vocab = make_Y_vocab(y_dom + test_y_dom)
y_lex_vocab = make_Y_vocab(y_lex + test_y_lex)
print(len(y_vocab))
print(len(y_dom_vocab))
print(len(y_lex_vocab))

49224
23396
23290


In [0]:
X = utils.make_X(sentences, vocab)
y_array = make_Y(y, y_vocab)
y_dom_array = make_Y(y_dom, y_dom_vocab)
y_lex_array = make_Y(y_lex, y_lex_vocab)

X_test = utils.make_X(test_sentences, vocab)
y_test_array = make_Y(test_y, y_vocab)
y_dom_test_array = make_Y(test_y_dom, y_dom_vocab)
y_lex_test_array = make_Y(test_y_lex, y_lex_vocab)

In [21]:
X.shape

(37168,)

In [0]:
train_x = pad_sequences(X, truncating='pre', padding='post', maxlen=30)
train_y = pad_sequences(y_array, truncating='pre', padding='post', maxlen=30)
train_y_dom = pad_sequences(y_dom_array, truncating='pre', padding='post', maxlen=30)
train_y_lex = pad_sequences(y_lex_array, truncating='pre', padding='post', maxlen=30)

dev_x = pad_sequences(X_test, truncating='pre', padding='post', maxlen=30)
dev_y = pad_sequences(y_test_array, truncating='pre', padding='post', maxlen=30)
dev_y_dom = pad_sequences(y_dom_test_array, truncating='pre', padding='post', maxlen=30)
dev_y_lex = pad_sequences(y_lex_test_array, truncating='pre', padding='post', maxlen=30)

In [23]:
print(train_x.shape)
print(train_y.shape)
print(train_y_dom.shape)
print(train_y_lex.shape)

(37168, 30)
(37168, 30)
(37168, 30)
(37168, 30)


In [24]:
train_y = train_y.reshape((*train_y.shape, 1))
dev_y = dev_y.reshape((*dev_y.shape, 1))

train_y_dom = train_y_dom.reshape((*train_y_dom.shape, 1))
dev_y_dom = dev_y_dom.reshape((*dev_y_dom.shape, 1))

train_y_lex = train_y_lex.reshape((*train_y_lex.shape, 1))
dev_y_lex = dev_y_lex.reshape((*dev_y_lex.shape, 1))

print(train_y.shape)
print(dev_y.shape)

(37168, 30, 1)
(352, 30, 1)


In [25]:
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)

(37168, 30)
(37168, 30, 1)
(352, 30)
(352, 30, 1)


##Building the model

In [0]:
vocab_size = len(vocab)

In [0]:
#This class helps with logging
class TrainValTensorBoard(TensorBoard):
    def __init__(self, log_dir='./logs', **kwargs):
        self.val_log_dir = os.path.join(log_dir, 'multitask_wsd_3vocab/validation')
        training_log_dir = os.path.join(log_dir, 'multitask_wsd_3vocab/training')
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

    def set_model(self, model):
        if context.executing_eagerly():
            self.val_writer = tf.contrib.summary.create_file_writer(self.val_log_dir)
        else:
            self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def _write_custom_summaries(self, step, logs=None):
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if 'val_' in k}
        if context.executing_eagerly():
            with self.val_writer.as_default(), tf.contrib.summary.always_record_summaries():
                for name, value in val_logs.items():
                    tf.contrib.summary.scalar(name, value.item(), step=step)
        else:
            for name, value in val_logs.items():
                summary = tf.Summary()
                summary_value = summary.value.add()
                summary_value.simple_value = value.item()
                summary_value.tag = name
                self.val_writer.add_summary(summary, step)
        self.val_writer.flush()

        logs = {k: v for k, v in logs.items() if not 'val_' in k}
        super(TrainValTensorBoard, self)._write_custom_summaries(step, logs)

    def on_train_end(self, logs=None):
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()

In [0]:
#Please take note that most of this part was extracted from class exercises, with some additions

def create_keras_model(vocab_size, y_size, embedding_size=128, hidden_size=512):
    print("Creating KERAS model")


    model_input = Input(shape=(None,))
    embedding = Embedding(vocab_size, embedding_size, mask_zero=True)(model_input)
    lstm1 = Bidirectional(LSTM(hidden_size, dropout=0.3, recurrent_dropout=0.3, return_sequences=True), merge_mode='concat')(embedding)
    lstm2 = Bidirectional(LSTM(hidden_size, dropout=0.3, recurrent_dropout=0.3, return_sequences=True), merge_mode='concat')(lstm1)

    output1 = TimeDistributed(Dense(y_size[0], activation='softmax'))(lstm2)
    output2 = TimeDistributed(Dense(y_size[1], activation='softmax'))(lstm2)
    output3 = TimeDistributed(Dense(y_size[2], activation='softmax'))(lstm2)

    model = Model(inputs=model_input, outputs=[output1, output2, output3])


    #     model = K.models.Sequential()
    #     model.add(Embedding(vocab_size, embedding_size, mask_zero=True))

    #     model.add(Bidirectional(LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True), merge_mode='concat'))
    #     model.add(Bidirectional(LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True), merge_mode='concat'))
    #     #model.add(Bidirectional(LSTM(hidden_size, dropout=0.2, recurrent_dropout=0.2, return_sequences=True), merge_mode='concat'))

    #     #Multi-task learning outputs
    #     output1 = TimeDistributed(Dense(y_size, activation='softmax'))
    #     output2 = TimeDistributed(Dense(y_size, activation='softmax'))
    #     output3 = TimeDistributed(Dense(y_size, activation='softmax'))

    #     model.add([output1, output2, output3])
    optimizer = K.optimizers.Adam()

    model.compile(loss=['sparse_categorical_crossentropy', 'sparse_categorical_crossentropy', 'sparse_categorical_crossentropy'], optimizer=optimizer, metrics=['acc'])

    return model
    

In [0]:
resource_path = "../resources/"

In [30]:
batch_size = 64
epochs = 20
model_name = resource_path+"mult2_model.hdf5"

#checks if the FINAL model was saved and loads it instead of creating a new one
if os.path.exists(model_name):
    model = load_model(model_name)
    print("Using a pre-saved model")
    model.summary()
    
else:
    model = create_keras_model(vocab_size, [len(y_vocab), len(y_dom_vocab), len(y_lex_vocab)])
    print("Training a new model")
    model.summary()
    
    #filepath = resource_path+"models/model-{epoch:02d}.hdf5"
    filepath = resource_path+"models/model_multitask2.hdf5"
    checkpoint = K.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
    #callbacks_list = [checkpoint]
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir="logs", histogram_freq=0, write_graph=False, write_images=True)
    callbacks_list = [TrainValTensorBoard(write_graph=False), checkpoint]
    
    print("\nStarting training...")
    model.fit(train_x, [train_y, train_y_dom, train_y_lex], epochs=epochs, batch_size=batch_size,
              shuffle=True, validation_data=(dev_x, [dev_y, dev_y_dom, dev_y_lex]), callbacks=callbacks_list) 
    print("Training complete.\n")
    
    #Save the FINAL model for later reuse
    model.save(model_name)
    print("Trained model saved for later use")

    

Creating KERAS model
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Training a new model
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0       

In [0]:
#Writing the vocabularies to file

with open(resource_path+"mult2_x_vocab.txt", "w") as file:
    file.write(json.dumps(vocab))
    
with open(resource_path+"mult2_y_vocab.txt", "w") as file:
    file.write(json.dumps(y_vocab))
    
with open(resource_path+"mult2_y_lex_vocab.txt", "w") as file:
    file.write(json.dumps(y_lex_vocab))

with open(resource_path+"mult2_y_dom_vocab.txt", "w") as file:
    file.write(json.dumps(y_dom_vocab))

##Running Predictions

In [0]:
# #from predict import *
from score import *

In [5]:
import nltk
nltk.download("wordnet")


from nltk.corpus import wordnet as wn
from tensorflow.keras.models import *
import os
import json

from corpora import extract_eval_data
from my_utils import *

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
def predict_babelnet(input_path : str, output_path : str, resources_path : str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").
    
    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.
    
    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    if (not resources_path.endswith("/")):
        resources_path = resources_path+"/"

    input_folder_path = input_path
    corpora_xml_path = input_path
    
    if (input_path.endswith(".xml")):
        input_folder_path = "/".join(input_path.split("/")[0:-1])

    if (not input_folder_path.endswith("/")):
        input_folder_path = input_folder_path+"/"

    if (os.path.isfile(output_path)):
        pred_file = output_path
        output_folder_path = "/".join(output_path.split("/")[0:-1])+"/"
    elif (os.path.isdir(output_path)):
        if (not output_path.endswith("/")):
            output_folder_path = output_path+"/"
        pred_file = output_folder_path+"pred_babelnet.txt"

    model_name = resources_path+"mult2_model.hdf5"
    print("LOADING RESOURCES...")
    model = load_model(model_name)

    #load the saved vocabularies
    with open(resources_path+"mult2_x_vocab.txt", 'r') as file:
        x_vocab = file.read()
    x_vocab = json.loads(x_vocab)

    with open(resources_path+"mult2_y_vocab.txt", 'r') as file:
        y_vocab = file.read()
    y_vocab = json.loads(y_vocab)
    id_to_words = {v:k for k, v in y_vocab.items()}

    bn2wn_mapping = load_bn2wn_mapping(resources_path+"babelnet2wordnet.tsv", True)

    #preparing the input data for prediction
    print("PREPARING EVALUATION DATA FOR PREDICTION...")
    extract_eval_data(corpora_xml_path, resources_path)

    sentences = load_test_dataset(input_folder_path+"sentences.txt")
    X_ = make_X(sentences, x_vocab)

    sentences_instances = load_sentence_instances(input_folder_path+"inst_temp_file.txt")

    #predicting and writing to file
    print("Predicting (line by line) and writing to file... This may take a little while...")
    k = 0
    inst_index = 0
    x_len = X_.shape
    with open(pred_file, "w") as file:
        for x in X_:
            if x.size != 0:
                x__ = np.expand_dims(x, axis=0)
                y_pred = model.predict(x__)
                
                y_pred = y_pred[0]

                # This loop is meant to handle one instance at a time, saved temporarily in the inst_temp_file.txt file,
                # loaded into the sentences_instances variable, till there's no instance left
                while True:
                    assoc_bn_synsets_vocab_pos = []
                    if inst_index not in sentences_instances:
                        break

                    inst = sentences_instances[inst_index]
                    if (int(inst[2]) != k):
                        break
                    else:
                        inst_index += 1
                        inst_pos_in_sent = int(inst[3])
                        inst_id = inst[1]

                        # Getting associated senses to the lemma of the instance selected
                        inst_synsets = wn.synsets(inst[0])
                        for wn_synset in inst_synsets:
                            wn_synset_id = "wn:" + str(wn_synset.offset()).zfill(8) + wn_synset.pos()
                            if wn_synset_id in bn2wn_mapping and bn2wn_mapping[wn_synset_id] in y_vocab:
                                assoc_bn_synsets_vocab_pos.append(y_vocab[bn2wn_mapping[wn_synset_id]])
                        
                        # Finding argmax over all associated synsets, and defaulting to MFS (pre saved to the vocab) where there's none
                        if assoc_bn_synsets_vocab_pos:
                            pred_word = y_pred[0, inst_pos_in_sent]
                            synset_probs = []
                            for pos in assoc_bn_synsets_vocab_pos:
                                synset_probs.append(pred_word[pos])

                            pred_sense = id_to_words[assoc_bn_synsets_vocab_pos[np.argmax(synset_probs)]]
                        else:
                            #MFS word = inst[0]
                            pred_sense = bn2wn_mapping[wn_mfs(inst[0])]

                        file.write("{} {}\n".format(inst_id, pred_sense))
            
            k = k+1
            if k % 100 < 1:
                print ("%d/%d lines done... A little moment more and everything will be done! :)" % (k,x_len[0]));

    del model, x_vocab, y_vocab, id_to_words, bn2wn_mapping, sentences, sentences_instances, X_, y_pred
    print("Prediction complete!")


def predict_wordnet_domains(input_path : str, output_path : str, resources_path : str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <wordnetDomain>" format (e.g. "d000.s000.t000 sport").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    if (not resources_path.endswith("/")):
        resources_path = resources_path+"/"

    input_folder_path = input_path
    corpora_xml_path = input_path
    
    if (input_path.endswith(".xml")):
        input_folder_path = "/".join(input_path.split("/")[0:-1])

    if (not input_folder_path.endswith("/")):
        input_folder_path = input_folder_path+"/"

    if (os.path.isfile(output_path)):
        pred_file = output_path
        output_folder_path = "/".join(output_path.split("/")[0:-1])+"/"
    elif (os.path.isdir(output_path)):
        if (not output_path.endswith("/")):
            output_folder_path = output_path+"/"
        pred_file = output_folder_path+"pred_domains.txt"

    model_name = resources_path+"mult2_model.hdf5"
    print("LOADING RESOURCES...")
    model = load_model(model_name)

    #load the saved vocabularies
    with open(resources_path+"mult2_x_vocab.txt", 'r') as file:
        x_vocab = file.read()
    x_vocab = json.loads(x_vocab)

    with open(resources_path+"mult2_y_dom_vocab.txt", 'r') as file:
        y_vocab = file.read()
    y_vocab = json.loads(y_vocab)
    id_to_words = {v:k for k, v in y_vocab.items()}

    bn2wn_mapping = load_bn2wn_mapping(resources_path+"babelnet2wordnet.tsv", True)
    bn2dom_mapping = load_bn2wn_mapping(resources_path+"babelnet2wndomains.tsv")

    #preparing the input data for prediction
    print("PREPARING EVALUATION DATA FOR PREDICTION...")
    extract_eval_data(corpora_xml_path, resources_path)

    sentences = load_test_dataset(input_folder_path+"sentences.txt")
    X_ = make_X(sentences, x_vocab)

    sentences_instances = load_sentence_instances(input_folder_path+"inst_temp_file.txt")

    #predicting and writing to file
    print("Predicting (line by line) and writing to file... This may take a little while...")
    k = 0
    inst_index = 0
    x_len = X_.shape
    with open(pred_file, "w") as file:
        for x in X_:
            if x.size != 0:
                x__ = np.expand_dims(x, axis=0)
                y_pred = model.predict(x__)
                y_pred = y_pred[1]
                
                # This loop is meant to handle one instance at a time, saved temporarily in the inst_temp_file.txt file,
                # loaded into the sentences_instances variable, till there's no instance left
                while True:
                    assoc_bn_synsets_vocab_pos = []
                    if inst_index not in sentences_instances:
                        break

                    inst = sentences_instances[inst_index]
                    if (int(inst[2]) != k):
                        break
                    else:
                        inst_index += 1
                        inst_pos_in_sent = int(inst[3])
                        inst_id = inst[1]

                        # Getting associated senses to the lemma of the instance selected
                        inst_synsets = wn.synsets(inst[0])
                        for wn_synset in inst_synsets:
                            wn_synset_id = "wn:" + str(wn_synset.offset()).zfill(8) + wn_synset.pos()
                            
                            if wn_synset_id in bn2wn_mapping:
                                bn_id = bn2wn_mapping[wn_synset_id]
                                if bn_id in bn2dom_mapping:
                                    dom_name = "dom:"+bn2dom_mapping[bn_id]
                                
                                    if dom_name in y_vocab:
                                        assoc_bn_synsets_vocab_pos.append(y_vocab[dom_name])
                        
                        # Finding argmax over all associated synsets, and defaulting to MFS (pre saved to the vocab) where there's none
                        if assoc_bn_synsets_vocab_pos:
                            pred_word = y_pred[0, inst_pos_in_sent]
                            synset_probs = []
                            for pos in assoc_bn_synsets_vocab_pos:
                                synset_probs.append(pred_word[pos])

                            pred_dom = id_to_words[assoc_bn_synsets_vocab_pos[np.argmax(synset_probs)]]
                        else:
                            #MFS word = inst[0]
                            pred_sense = bn2wn_mapping[wn_mfs(inst[0])]
                            if pred_sense in bn2dom_mapping:
                                pred_dom = bn2dom_mapping[pred_sense]
                            else:
                                pred_dom = "factotum"

                        file.write("{} {}\n".format(inst_id, pred_dom[4:]))
            
            k = k+1
            if k % 100 < 1:
                print ("%d/%d lines done... A little moment more and everything will be done! :)" % (k,x_len[0]));

    del model, x_vocab, y_vocab, id_to_words, bn2wn_mapping, sentences, sentences_instances, X_, y_pred
    print("Prediction complete!")


def predict_lexicographer(input_path : str, output_path : str, resources_path : str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <lexicographerId>" format (e.g. "d000.s000.t000 noun.animal").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    if (not resources_path.endswith("/")):
        resources_path = resources_path+"/"

    input_folder_path = input_path
    corpora_xml_path = input_path
    
    if (input_path.endswith(".xml")):
        input_folder_path = "/".join(input_path.split("/")[0:-1])

    if (not input_folder_path.endswith("/")):
        input_folder_path = input_folder_path+"/"

    if (os.path.isfile(output_path)):
        pred_file = output_path
        output_folder_path = "/".join(output_path.split("/")[0:-1])+"/"
    elif (os.path.isdir(output_path)):
        if (not output_path.endswith("/")):
            output_folder_path = output_path+"/"
        pred_file = output_folder_path+"pred_lex.txt"

    model_name = resources_path+"mult2_model.hdf5"
    print("LOADING RESOURCES...")
    model = load_model(model_name)

    #load the saved vocabularies
    with open(resources_path+"mult2_x_vocab.txt", 'r') as file:
        x_vocab = file.read()
    x_vocab = json.loads(x_vocab)

    with open(resources_path+"mult2_y_lex_vocab.txt", 'r') as file:
        y_vocab = file.read()
    y_vocab = json.loads(y_vocab)
    id_to_words = {v:k for k, v in y_vocab.items()}

    bn2wn_mapping = load_bn2wn_mapping(resources_path+"babelnet2wordnet.tsv", True)
    bn2lex_mapping = load_bn2wn_mapping(resources_path+"babelnet2lexnames.tsv")

    #preparing the input data for prediction
    print("PREPARING EVALUATION DATA FOR PREDICTION...")
    extract_eval_data(corpora_xml_path, resources_path)

    sentences = load_test_dataset(input_folder_path+"sentences.txt")
    X_ = make_X(sentences, x_vocab)

    sentences_instances = load_sentence_instances(input_folder_path+"inst_temp_file.txt")

    #predicting and writing to file
    print("Predicting (line by line) and writing to file... This may take a little while...")
    k = 0
    inst_index = 0
    x_len = X_.shape
    with open(pred_file, "w") as file:
        for x in X_:
            if x.size != 0:
                x__ = np.expand_dims(x, axis=0)
                y_pred = model.predict(x__)
                y_pred = y_pred[2]

                # This loop is meant to handle one instance at a time, saved temporarily in the inst_temp_file.txt file,
                # loaded into the sentences_instances variable, till there's no instance left
                while True:
                    assoc_bn_synsets_vocab_pos = []
                    if inst_index not in sentences_instances:
                        break

                    inst = sentences_instances[inst_index]
                    if (int(inst[2]) != k):
                        break
                    else:
                        inst_index += 1
                        inst_pos_in_sent = int(inst[3])
                        inst_id = inst[1]

                        # Getting associated senses to the lemma of the instance selected
                        inst_synsets = wn.synsets(inst[0])
                        for wn_synset in inst_synsets:
                            wn_synset_id = "wn:" + str(wn_synset.offset()).zfill(8) + wn_synset.pos()
                            
                            if wn_synset_id in bn2wn_mapping:
                                bn_id = bn2wn_mapping[wn_synset_id]
                                if bn_id in bn2lex_mapping:
                                    lex_name = "lex:"+bn2lex_mapping[bn_id]
                                
                                    if lex_name in y_vocab:
                                        assoc_bn_synsets_vocab_pos.append(y_vocab[lex_name])
                                      
#                             if wn_synset_id in bn2wn_mapping and bn2wn_mapping[wn_synset_id] in y_vocab:
#                                 assoc_bn_synsets_vocab_pos.append(y_vocab[bn2wn_mapping[wn_synset_id]])
                        
                        # Finding argmax over all associated synsets, and defaulting to MFS (pre saved to the vocab) where there's none
                        if assoc_bn_synsets_vocab_pos:
                            pred_word = y_pred[0, inst_pos_in_sent]
                            synset_probs = []
                            for pos in assoc_bn_synsets_vocab_pos:
                                synset_probs.append(pred_word[pos])

                            pred_lex = id_to_words[assoc_bn_synsets_vocab_pos[np.argmax(synset_probs)]]
                            
                        else:
                            #MFS word = inst[0]
                            pred_sense = bn2wn_mapping[wn_mfs(inst[0])]
                            if pred_sense in bn2lex_mapping:
                                pred_lex = bn2lex_mapping[pred_sense]
                            else:
                                pred_lex = "adj.all"

                        file.write("{} {}\n".format(inst_id, pred_lex[4:]))
            
            k = k+1
            if k % 100 < 1:
                print ("%d/%d lines done... A little moment more and everything will be done! :)" % (k,x_len[0]));

    del model, x_vocab, y_vocab, id_to_words, bn2wn_mapping, sentences, sentences_instances, X_, y_pred
    print("Prediction complete!")

In [0]:
def run_tests1(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))
        
    print("PREDICTING FOR SE2...")
    se2_scores = ['SE2']

    input_path = '../data/Evaluation_Datasets/senseval2/senseval2.data.xml'
    output_path = '../data/Evaluation_Datasets/senseval2'
    gold_file =  '../data/Evaluation_Datasets/senseval2/senseval2.gold.key.txt'

    pred_babelnet = '../data/Evaluation_Datasets/senseval2/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/senseval2/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/senseval2/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se2_scores.append(score*100)

    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se2_scores.append(score*100)

    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se2_scores.append(score*100)

    all_scores.append(se2_scores)
    

        
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))



def run_tests2(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))
    

    print("\n\nPREDICTING FOR SE3...")
    se3_scores = ['SE3_(Dev)']

    input_path = '../data/Evaluation_Datasets/senseval3/senseval3.data.xml'
    output_path = '../data/Evaluation_Datasets/senseval3'
    gold_file =  '../data/Evaluation_Datasets/senseval3/senseval3.gold.key.txt'

    pred_babelnet = '../data/Evaluation_Datasets/senseval3/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/senseval3/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/senseval3/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se3_scores.append(score*100)

    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se3_scores.append(score*100)

    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se3_scores.append(score*100)

    all_scores.append(se3_scores)
    
    
    ###########################################################
        
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))



def run_tests3(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))
                
    
    print("\n\nPREDICTING FOR SE07...")
    se07_scores = ['SE07']
    
    input_path = '../data/Evaluation_Datasets/semeval2007/semeval2007.data.xml'
    output_path = '../data/Evaluation_Datasets/semeval2007'
    gold_file =  '../data/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt'
    
    pred_babelnet = '../data/Evaluation_Datasets/semeval2007/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/semeval2007/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/semeval2007/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se07_scores.append(score*100)
    
    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se07_scores.append(score*100)
    
    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se07_scores.append(score*100)
    
    all_scores.append(se07_scores)
    
    #########################################################
    
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))
            
            

            
def run_tests4(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))

    
    print("\n\nPREDICTING FOR SE13...")
    se13_scores = ['SE13']
    
    input_path = '../data/Evaluation_Datasets/semeval2013/semeval2013.data.xml'
    output_path = '../data/Evaluation_Datasets/semeval2013'
    gold_file =  '../data/Evaluation_Datasets/semeval2013/semeval2013.gold.key.txt'
    
    pred_babelnet = '../data/Evaluation_Datasets/semeval2013/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/semeval2013/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/semeval2013/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se13_scores.append(score*100)
    
    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se13_scores.append(score*100)
    
    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se13_scores.append(score*100)
    
    all_scores.append(se13_scores)
    
    #########################################################
    
    
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))



    
    

def run_tests5(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))
                
    
    print("\n\nPREDICTING FOR SE15...")
    se15_scores = ['SE15']
    
    input_path = '../data/Evaluation_Datasets/semeval2015/semeval2015.data.xml'
    output_path = '../data/Evaluation_Datasets/semeval2015'
    gold_file =  '../data/Evaluation_Datasets/semeval2015/semeval2015.gold.key.txt'
    
    pred_babelnet = '../data/Evaluation_Datasets/semeval2015/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/semeval2015/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/semeval2015/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se15_scores.append(score*100)
    
    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se15_scores.append(score*100)
    
    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se15_scores.append(score*100)
    
    all_scores.append(se15_scores)
    
    
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))





def run_tests6(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))
    
    
    print("\n\nPREDICTING FOR ALL...")
    se_ALL_scores = ['ALL']
    
    input_path = '../data/Evaluation_Datasets/ALL/ALL.data.xml'
    output_path = '../data/Evaluation_Datasets/ALL'
    gold_file =  '../data/Evaluation_Datasets/ALL/ALL.gold.key.txt'
    
    pred_babelnet = '../data/Evaluation_Datasets/ALL/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/ALL/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/ALL/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se_ALL_scores.append(score*100)
    
    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se_ALL_scores.append(score*100)
    
    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se_ALL_scores.append(score*100)
    
    all_scores.append(se_ALL_scores)
    
    
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))

In [11]:
#run_tests1("../resources/scores.csv")
#run_tests2("../resources/scores.csv")
#run_tests3("../resources/scores.csv")
#run_tests4("../resources/scores.csv")
#run_tests5("../resources/scores.csv")
run_tests6("../resources/scores.csv")



PREDICTING FOR ALL...
LOADING RESOURCES...
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
PREPARING EVALUATION DATA FOR PREDICTION...
1,173 sentences extracted...
Evaluation data extraction completed
Predicting (line by line) and writing to file... This may take a little while...
100/1173 lines done... A little moment more and everything will be done! :)
200/

##Results

In [12]:
all_scores_pd = pd.read_csv('../resources/scores.csv', names=['Babelnet', 'Lex', 'Domain'])
all_scores_pd

Unnamed: 0,Babelnet,Lex,Domain
SE2,60.04,78.79,63.94
SE3_(Dev),62.59,76.7,69.46
SE07,53.63,66.15,86.37
SE13,58.64,68.92,73.24
SE15,54.99,71.43,62.52
ALL,59.26,74.19,68.66
