## Mount Drive and Imports

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%cd /content/drive/My Drive/Colab Notebooks/nlp_hw3/code

/content/drive/My Drive/Colab Notebooks/nlp_hw3/code


In [0]:
import os
import numpy as np
from typing import Tuple, List, Dict

import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import load_model

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.callbacks import TensorBoard
from tensorflow.python.eager import context
import json
import pandas as pd
from nltk.corpus import stopwords

import my_utils as utils
import corpora

##Preprocessing

In [0]:
def load_train_dataset(input_path: str, y_path: str) -> Tuple[List[str], List[str]]:
    """
    :param input_path; Path to the input dataset
    :param label_path; Path to the file containing the corresponding labels for the input dataset
    :return sentences; List of sentences in input_file
    :return labels; List of corresponding word segment codes in label_path. Same len as sentences
    """
    sentences = []
    k = 0
    with open(input_path, "r", encoding="utf-8-sig") as file:
        for line in file:
            k += 1
            sentences.append(line.strip())
#             if (k >= 4000):
#                 break

    y = []
    k = 0
    with open(y_path, "r", encoding="utf-8-sig") as file:
        for line in file:
            k += 1
            y.append(line.strip())
#             if (k >= 4000):
#                 break

    return sentences, y


def make_X_vocab(sentences: List[str]) -> Dict[str, int]:
    '''
    :param sentences; List of input sentences from the dataset
    :return unigrams_vocab; Dictionary from unigram to int
    :return bigrams_vocab; Dictionary from bigram to int
    '''
    vocab = {"UNK": 0}

    for sentence in sentences:
        for word in sentence.split():
            if word not in vocab:
                vocab[word] = len(vocab)
    
    return vocab


def make_Y_vocab(y: List[str], min_count=3) -> Dict[str, int]:
    """
    :param labels; List of label codes
    :return labels_vocab; Dictionary from label code to int 
    """
    y_vocab = {"UNK": 0, "OTHERS": 1}
    words_freq = {}
    
    for y_line in y:
        for y_word in y_line.split():
            if "bn:" in y_word:
                if y_word not in y_vocab:
                    y_vocab[y_word] = len(y_vocab)
            else:
                if y_word not in words_freq:
                    words_freq[y_word] = 1
                else:
                    words_freq[y_word] += 1
    
    stpwrds = list(set(stopwords.words('english')))
    for k, v in words_freq.items():
        if v >= min_count and v not in stpwrds:
            y_vocab[k] = len(y_vocab)
    
    return y_vocab



def make_Y(output: List[str], output_vocab: Dict[str, int]) -> np.ndarray:
    """
    :param labels; List of word segment codes, line by line
    :param labels_vocab; Label codes vocab
    :return y; Vector of label code indices
    """
    y = []
    for output_line in output:
        y_temp = []
        for single_output in output_line.split():
            if single_output in output_vocab:
                y_temp.append( output_vocab[single_output])
            else:
                y_temp.append( output_vocab["OTHERS"])
        y.append(np.array(y_temp))
    
    return np.array(y)

In [0]:
train_data_path = '../data/Training_Corpora/semcor'
test_data_path = '../data/Evaluation_Datasets/senseval3'

#Parse training data
corpora_xml_path = train_data_path + '/semcor.data.xml'
gold_mapping_path = train_data_path + '/semcor.gold.key.txt'
resources_path = '../resources/'

corpora.extract_training_data(corpora_xml_path, gold_mapping_path, resources_path, train_data_path)

In [0]:
#Parse validation data
corpora_xml_path = test_data_path + '/senseval3.data.xml'
gold_mapping_path = test_data_path + '/senseval3.gold.key.txt'
resources_path = '../resources/'

corpora.extract_training_data(corpora_xml_path, gold_mapping_path, resources_path, test_data_path)

In [0]:
sentences, y = load_train_dataset(train_data_path+"/trainX.txt", train_data_path+"/trainy.txt")
test_sentences, test_y = load_train_dataset(test_data_path+"/trainX.txt", test_data_path+"/trainy.txt")

In [0]:
print(len(sentences))
print(sentences[0])

37168
How long has it been since you reviewed the objectives of your benefit and service program


In [0]:
print(len(y))
print(y[0])

37168
how bn:00106124a have it bn:00083181v since you bn:00092618v the bn:00002179n of you bn:00009904n and bn:00070654n bn:00064646n


In [0]:
vocab = make_X_vocab(sentences + test_sentences)
print(len(vocab))

48989


In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
y_vocab = make_Y_vocab(y + test_y)
print(len(y_vocab))

34505


In [0]:
X = utils.make_X(sentences, vocab)
y_array = make_Y(y, y_vocab)

X_test = utils.make_X(test_sentences, vocab)
y_test_array = make_Y(test_y, y_vocab)

In [0]:
print(X.shape)
print(y_array.shape)

(37168,)
(37168,)


In [0]:
train_x = pad_sequences(X, truncating='pre', padding='post', maxlen=30)
train_y = pad_sequences(y_array, truncating='pre', padding='post', maxlen=30)

dev_x = pad_sequences(X_test, truncating='pre', padding='post', maxlen=30)
dev_y = pad_sequences(y_test_array, truncating='pre', padding='post', maxlen=30)

In [0]:
print(train_x.shape)
print(train_y.shape)

(37168, 30)
(37168, 30)


In [0]:
train_y = train_y.reshape((*train_y.shape, 1))
dev_y = dev_y.reshape((*dev_y.shape, 1))
print(train_y.shape)
print(dev_y.shape)

(37168, 30, 1)
(352, 30, 1)


In [0]:
print(train_x.shape)
print(train_y.shape)
print(dev_x.shape)
print(dev_y.shape)

(37168, 30)
(37168, 30, 1)
(352, 30)
(352, 30, 1)


##Building the model

In [0]:
vocab_size = len(vocab)

In [0]:
#This class helps with logging
class TrainValTensorBoard(TensorBoard):
    def __init__(self, log_dir='./logs', **kwargs):
        self.val_log_dir = os.path.join(log_dir, 'model_C/validation')
        training_log_dir = os.path.join(log_dir, 'model_C/training')
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

    def set_model(self, model):
        if context.executing_eagerly():
            self.val_writer = tf.contrib.summary.create_file_writer(self.val_log_dir)
        else:
            self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def _write_custom_summaries(self, step, logs=None):
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if 'val_' in k}
        if context.executing_eagerly():
            with self.val_writer.as_default(), tf.contrib.summary.always_record_summaries():
                for name, value in val_logs.items():
                    tf.contrib.summary.scalar(name, value.item(), step=step)
        else:
            for name, value in val_logs.items():
                summary = tf.Summary()
                summary_value = summary.value.add()
                summary_value.simple_value = value.item()
                summary_value.tag = name
                self.val_writer.add_summary(summary, step)
        self.val_writer.flush()

        logs = {k: v for k, v in logs.items() if not 'val_' in k}
        super(TrainValTensorBoard, self)._write_custom_summaries(step, logs)

    def on_train_end(self, logs=None):
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()

In [0]:
#Please take note that most of this part was extracted from class exercises, with some additions

def create_keras_model(vocab_size, y_size, embedding_size=128, hidden_size=512):
    print("Creating KERAS model")
    
    model = K.models.Sequential()
    model.add(Embedding(vocab_size, embedding_size, mask_zero=True))
    
    model.add(Bidirectional(LSTM(hidden_size, dropout=0.3, recurrent_dropout=0.3, return_sequences=True), merge_mode='concat'))
    model.add(Bidirectional(LSTM(hidden_size, dropout=0.3, recurrent_dropout=0.3, return_sequences=True), merge_mode='concat'))
    
    model.add(TimeDistributed(Dense(y_size, activation='softmax')))
    optimizer = K.optimizers.Adam()
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

    return model
    

In [0]:
resource_path = "../resources/"

In [0]:
batch_size = 64
epochs = 20
model_name = resource_path+"model.hdf5"

#checks if the FINAL model was saved and loads it instead of creating a new one
if os.path.exists(model_name):
    model = load_model(model_name)
    print("Using a pre-saved model")
    model.summary()
    
else:
    model = create_keras_model(vocab_size, len(y_vocab))
    print("Training a new model")
    model.summary()
    
    filepath = resource_path+"models/model_C.hdf5"
    checkpoint = K.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', period=1)
    #callbacks_list = [checkpoint]
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir="logs", histogram_freq=0, write_graph=False, write_images=True)
    callbacks_list = [TrainValTensorBoard(write_graph=False), checkpoint]
    
    print("\nStarting training...")
    model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size,
              shuffle=True, validation_data=(dev_x, dev_y), callbacks=callbacks_list) 
    print("Training complete.\n")
    
    #Save the FINAL model for later reuse
    model.save(model_name)
    print("Trained model saved for later use")

    print("\nEvaluating test...")
    loss_acc = model.evaluate(dev_x, dev_y, verbose=0)
    print("Test data: loss = %0.6f  accuracy = %0.2f%% " % (loss_acc[0], loss_acc[1]*100))

Creating KERAS model
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Training a new model
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         6270592   
________________________________________________________

In [0]:
#Writing the vocabularies to file

with open(resource_path+"x_vocab.txt", "w") as file:
    file.write(json.dumps(vocab))
    
with open(resource_path+"y_vocab.txt", "w") as file:
    file.write(json.dumps(y_vocab))

##Running Predictions

In [0]:
from predict import *
from score import *

In [0]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
def run_tests1(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))
        
    print("PREDICTING FOR SE2...")
    se2_scores = ['SE2']

    input_path = '../data/Evaluation_Datasets/senseval2/senseval2.data.xml'
    output_path = '../data/Evaluation_Datasets/senseval2'
    gold_file =  '../data/Evaluation_Datasets/senseval2/senseval2.gold.key.txt'

    pred_babelnet = '../data/Evaluation_Datasets/senseval2/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/senseval2/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/senseval2/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se2_scores.append(score*100)

    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se2_scores.append(score*100)

    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se2_scores.append(score*100)

    all_scores.append(se2_scores)
    

    #########################################################

    print("\n\nPREDICTING FOR SE3...")
    se3_scores = ['SE3_(Dev)']

    input_path = '../data/Evaluation_Datasets/senseval3/senseval3.data.xml'
    output_path = '../data/Evaluation_Datasets/senseval3'
    gold_file =  '../data/Evaluation_Datasets/senseval3/senseval3.gold.key.txt'

    pred_babelnet = '../data/Evaluation_Datasets/senseval3/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/senseval3/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/senseval3/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se3_scores.append(score*100)

    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se3_scores.append(score*100)

    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se3_scores.append(score*100)

    all_scores.append(se3_scores)
    
    
    ###########################################################
        
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))



def run_tests2(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))
                
    
    print("\n\nPREDICTING FOR SE07...")
    se07_scores = ['SE07']
    
    input_path = '../data/Evaluation_Datasets/semeval2007/semeval2007.data.xml'
    output_path = '../data/Evaluation_Datasets/semeval2007'
    gold_file =  '../data/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt'
    
    pred_babelnet = '../data/Evaluation_Datasets/semeval2007/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/semeval2007/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/semeval2007/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se07_scores.append(score*100)
    
    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se07_scores.append(score*100)
    
    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se07_scores.append(score*100)
    
    all_scores.append(se07_scores)
    
    #########################################################
    
    print("\n\nPREDICTING FOR SE13...")
    se13_scores = ['SE13']
    
    input_path = '../data/Evaluation_Datasets/semeval2013/semeval2013.data.xml'
    output_path = '../data/Evaluation_Datasets/semeval2013'
    gold_file =  '../data/Evaluation_Datasets/semeval2013/semeval2013.gold.key.txt'
    
    pred_babelnet = '../data/Evaluation_Datasets/semeval2013/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/semeval2013/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/semeval2013/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se13_scores.append(score*100)
    
    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se13_scores.append(score*100)
    
    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se13_scores.append(score*100)
    
    all_scores.append(se13_scores)
    
    #########################################################
    
    
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))
            
            
            
    
    

def run_tests3(score_file):
    
    resources_path = '../resources'
    bn2wn_mapping_file = '../resources/babelnet2wordnet.tsv'
    
    all_scores = []
    if (os.path.isfile(score_file)):
        with open(score_file, 'r') as file:
            for line in file:
                all_scores.append(line.split(","))
                
    
    print("\n\nPREDICTING FOR SE15...")
    se15_scores = ['SE15']
    
    input_path = '../data/Evaluation_Datasets/semeval2015/semeval2015.data.xml'
    output_path = '../data/Evaluation_Datasets/semeval2015'
    gold_file =  '../data/Evaluation_Datasets/semeval2015/semeval2015.gold.key.txt'
    
    pred_babelnet = '../data/Evaluation_Datasets/semeval2015/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/semeval2015/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/semeval2015/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se15_scores.append(score*100)
    
    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se15_scores.append(score*100)
    
    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se15_scores.append(score*100)
    
    all_scores.append(se15_scores)
    
    
    #####################################################################
    
    
    print("\n\nPREDICTING FOR ALL...")
    se_ALL_scores = ['ALL']
    
    input_path = '../data/Evaluation_Datasets/ALL/ALL.data.xml'
    output_path = '../data/Evaluation_Datasets/ALL'
    gold_file =  '../data/Evaluation_Datasets/ALL/ALL.gold.key.txt'
    
    pred_babelnet = '../data/Evaluation_Datasets/ALL/pred_babelnet.txt'
    pred_lex = '../data/Evaluation_Datasets/ALL/pred_lex.txt'
    pred_domains = '../data/Evaluation_Datasets/ALL/pred_domains.txt'

    predict_babelnet(input_path, output_path, resources_path)
    score = score_predict1(pred_babelnet, gold_file, bn2wn_mapping_file)
    se_ALL_scores.append(score*100)
    
    predict_lexicographer(input_path, output_path, resources_path)
    score = score_predict_lex(pred_lex, gold_file, resources_path)
    se_ALL_scores.append(score*100)
    
    predict_wordnet_domains(input_path, output_path, resources_path)
    score = score_predict_dom(pred_domains, gold_file, resources_path)
    se_ALL_scores.append(score*100)
    
    all_scores.append(se_ALL_scores)
    
    
    with open(score_file, "w") as file:
        for scores in all_scores:
            file.write("{},{:.2f},{:.2f},{:.2f}\n".format(scores[0], float(scores[1]), float(scores[2]), float(scores[3])))

In [0]:
#all_scores_pd = run_tests1("../resources/scores.csv")
#all_scores_pd = run_tests2("../resources/scores.csv")
all_scores_pd = run_tests3("../resources/scores.csv")



PREDICTING FOR SE15...
LOADING RESOURCES...
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
PREPARING EVALUATION DATA FOR PREDICTION...
138 sentences extracted...
Evaluation data extraction completed
Predicting (line by line) and writing to file... This may take a little while...
100/138 lines done... A little moment more and everything will be done! :)
Predic

##Results

In [0]:
all_scores_pd = pd.read_csv('../resources/scores.csv', names=['Babelnet', 'Lex', 'Domain'])
all_scores_pd

Unnamed: 0,Babelnet,Lex,Domain
SE2,62.01,78.83,88.17
SE3_(Dev),63.95,77.3,86.7
SE07,54.51,70.77,87.47
SE13,58.64,70.8,73.97
SE15,56.95,72.21,80.33
ALL,60.55,75.18,83.43
