#### Word-In-Context Disambiguation


author: Simone Rossetti 499831

In [None]:
# dataset
import jsonlines
import pandas as pd
# general
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from typing import *
import numpy as np
np.random.seed(41296)
import collections
import itertools
import re
from copy import copy
from sklearn.decomposition import PCA
from random import seed
from random import random, randint
seed(41296)
# torch
import torch
from torch import nn
torch.manual_seed(41296)
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm.notebook import tqdm
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# nltk 
import nltk
from nltk.chunk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

The universal tagset consists of the following 12 coarse tags:


- VERB - verbs (all tenses and modes)
- NOUN - nouns (common and proper)
- PRON - pronouns
- ADJ - adjectives
- ADV - adverbs
- ADP - adpositions (prepositions and postpositions)
- CONJ - conjunctions
- DET - determiners
- NUM - cardinal numbers
- PRT - particles or other function words
- X - other: foreign words, typos, abbreviations
- . - punctuation


In [None]:
# Download and unzip glove embeddings
! wget https://nlp.stanford.edu/data/glove.6B.zip
! gzip -d glove.6B.zip

In [None]:
# Download the NLTK resources used for preprocessing
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger',
                 'universal_tagset', 'wordnet', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)
# load english stopwords
stop_words = set(nltk.corpus.stopwords.words('english'))
# load english lemmatizer
lemmatizer = nltk.wordnet.WordNetLemmatizer()
# dict mapping from universal POS tags to the WordNet POS constants expected
# by the lemmatizer
word_net_type_dict = {
    'ADJ': nltk.corpus.reader.wordnet.ADJ,
    'ADV': nltk.corpus.reader.wordnet.ADV,
    'NOUN': nltk.corpus.reader.wordnet.NOUN,
    'VERB': nltk.corpus.reader.wordnet.VERB,
}
# Unknown tags fall back to the WordNet noun constant. The fallback must be the
# constant itself: the plain string 'NOUN' is not a WordNet POS code.
word_net_type_dict = defaultdict(lambda: nltk.corpus.reader.wordnet.NOUN, word_net_type_dict)
# define a simple grammar (extra exercise)
grammar = r"""
        NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
        PP: {<IN><NP>}               # Chunk prepositions followed by NP
        VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
        CLAUSE: {<NP><VP>}           # Chunk NP, VP
        """
# RegexpParser parses a POS-tagged sentence and labels its parts with the
# chunk names defined in the grammar
chunker = RegexpParser(grammar)

In [None]:
# read the train split (JSON-lines file) and report its shape
train_dataframe = pd.read_json(
    '/home/fiorapirri/nlp2021-hw1/data/train.jsonl',
    lines=True,
)
print(f'train dataframe: {train_dataframe.shape}')

# read the dev split (JSON-lines file) and report its shape
dev_dataframe = pd.read_json(
    '/home/fiorapirri/nlp2021-hw1/data/dev.jsonl',
    lines=True,
)
print(f'dev dataframe: {dev_dataframe.shape}')

In [None]:
# special tokens: each one owns a fixed row at the top of the embedding tensor
#   <pad>   padding
#   <unk>   unknown word
#   <sep>   separator between the two sentences
#   <drop>  dropped word
#   <numb>  number
#   <punct> punctuation
RESERVED_TOKENS = {
    token: row
    for row, token in enumerate(
        ('<pad>', '<unk>', '<sep>', '<drop>', '<numb>', '<punct>')
    )
}

In [None]:
# named-entity labels kept as tags in some further experiments
NER_LABELS = [
    'ORGANIZATION',
    'PERSON',
    'GSP',
    'GPE',
    'LOCATION',
    'FACILITY',
]

In [None]:
# POS + named-entity tags (plus the special tags) used in some further
# experiments; the value of each tag is its position in this listing
POS_NE_TAGS = {
    tag: position
    for position, tag in enumerate((
        'NOUN', 'VERB', 'ADJ', 'ADV',
        'GPE', 'ORGANIZATION', 'PERSON', 'LOCATION', 'FACILITY', 'GSP',
        '<pad>', '<drop>', '<sep>',
    ))
}

In [None]:
def load_embeddings(file_path: str, words_limit: int = 100_000) -> Dict[str, torch.Tensor]:
    """
    Load word embeddings from a text file into a dict.

    Each line is expected to hold a word followed by its space-separated
    vector components. At most `words_limit` lines are read. The vector size is
    fixed by the first non-empty line; lines whose vector has a different
    size are skipped.

    Returns:
        - word_vectors: mapping word -> 1D float tensor
    """
    word_vectors = dict()
    dim = None
    # explicit encoding: embedding vocabularies contain non-ASCII tokens and
    # the platform default encoding is not guaranteed to be UTF-8
    with open(file_path, encoding='utf-8') as f:
        for i, line in tqdm(enumerate(f), total=words_limit):
            if i == words_limit:
                break
            # split word (first component) and subsequent values
            word, *values = line.strip().split(' ')
            if not values:
                # blank or malformed line: nothing to store
                continue
            vector = torch.tensor([float(c) for c in values])
            if dim is None:
                dim = vector.size(0)
            if vector.size(0) == dim:
                # fill the dictionary
                word_vectors[word] = vector
    return word_vectors

In [None]:
def load_index_and_weights(
    file_path: str,
    reserved_tokens: dict = RESERVED_TOKENS,
    words_limit: int = 100_000,
) -> Tuple[defaultdict, torch.Tensor]:
    """
    Build the word -> row index mapping and the matching embedding matrix.

    The first len(reserved_tokens) rows of the matrix belong to the reserved
    tokens (placed at the row given by their index); the loaded word vectors
    follow in file order.

    Args:
        - file_path: embeddings text file, read through load_embeddings
        - reserved_tokens: mapping special token -> row index; the indices
          must be exactly 0..len(reserved_tokens)-1 and contain '<unk>'
        - words_limit: maximum number of lines read from the file
    Returns:
        - word_index: defaultdict mapping word -> row, unknown words map to
          the '<unk>' row
        - vectors_store: tensor of shape (reserved + words, dim)
    Raises:
        - ValueError: no vector could be loaded, or the reserved indices are
          not contiguous from 0
    """
    word_vectors = load_embeddings(file_path, words_limit)
    if not word_vectors:
        raise ValueError(f'no embeddings could be loaded from {file_path!r}')

    n_reserved = len(reserved_tokens)
    if sorted(reserved_tokens.values()) != list(range(n_reserved)):
        raise ValueError('reserved token indices must be exactly 0..len(reserved_tokens)-1')

    word_index = dict()
    vectors = []
    for word, vector in word_vectors.items():
        # real words are stored right after the reserved rows
        word_index[word] = len(vectors) + n_reserved
        vectors.append(vector)

    # compute mean and std of the loaded vectors so that the random vectors
    # created for the reserved tokens follow the same statistics
    vectors_store = torch.stack(vectors)
    mean = vectors_store.mean()
    std = vectors_store.std()
    dim = vectors_store.shape[1]

    # reserved keys must not be in the vocabulary yet
    for key in reserved_tokens:
        assert key not in word_index, f'reserved token {key!r} already in vocabulary'

    reserved_vectors = []
    # rows are stacked by increasing index so that row i really belongs to the
    # token whose index is i, whatever the insertion order of the dict
    for key in sorted(reserved_tokens, key=reserved_tokens.get):
        if key == '<drop>':
            # a dropped word carries no information: all-zero vector
            reserved_vectors.append(torch.zeros((dim,)))
        else:
            # sample new tokens with normal distribution N(mean,std)
            reserved_vectors.append(torch.normal(mean, std, size=(dim,)))
        word_index[key] = reserved_tokens[key]

    reserved_vectors = torch.stack(reserved_vectors)
    vectors_store = torch.cat([reserved_vectors, vectors_store], dim=0)
    # default dict returns the <unk> index when the word is unknown
    unk_index = reserved_tokens['<unk>']
    word_index = defaultdict(lambda: unk_index, word_index)

    return word_index, vectors_store

In [None]:
# build the word index and the embedding matrix from the 300d GloVe vectors
# (trained on Wikipedia 2014), reading up to 400k lines of the file
word_index, vectors_store = load_index_and_weights(
    'glove.6B.300d.txt',
    words_limit=400_000,
)

In [None]:
# similar words have embeddings pointing in similar directions: cosine
# similarity measures this in the range [-1, 1] (1 = same direction)
def cosine_similarity(v1: torch.Tensor, v2: torch.Tensor) -> float:
    """
    Return the cosine of the angle between v1 and v2 as a python float.
    If either vector has zero norm (e.g. an all-zero embedding row) the
    similarity is undefined and 0.0 is returned instead of NaN.
    """
    num = torch.sum(v1 * v2)
    den = torch.linalg.norm(v1) * torch.linalg.norm(v2)
    if den.item() == 0:
        return 0.0
    return (num / den).item()

In [None]:
# the embeddings of 'united' and 'states' are expected to be close
united_vector = vectors_store[word_index['united']]
states_vector = vectors_store[word_index['states']]
cosine_similarity(united_vector, states_vector)

In [None]:
# let's see where some lemmas are projected into the plane by PCA
# the pretrained embedding matrix
embeddings = vectors_store

# pick some words to visualise; sorting makes the selection reproducible
# (iteration order of a set of strings changes from run to run)
words = sorted(set(train_dataframe['lemma']))

# perform PCA to reduce our 300d embeddings to 2d points that can be plotted
pca = PCA(n_components=2)
pca_result = pca.fit_transform(embeddings.detach().cpu().numpy())

# .get() is used instead of indexing so that unknown lemmas are not inserted
# into the defaultdict as a side effect of the lookup
indexes = [word_index.get(x, RESERVED_TOKENS['<unk>']) for x in words[:100]]
points = [pca_result[idx] for idx in indexes]
for word, (x, y) in zip(words, points):
    plt.plot(x, y, 'ro')
    plt.text(x, y, word, fontsize=12) # label the point with its lemma
plt.title('2D PCA decomposition of embeddings')
plt.show()

In [None]:
def parse_chunks(n):
    '''
    Collect the leaves of every NP, VP and CLAUSE chunk found in a chunked
    tree. A matching chunk contributes all of its leaves; any other subtree
    is searched recursively. Anything that is not a tree yields an empty list.
    '''
    if not isinstance(n, nltk.tree.Tree):
        return []
    if n.label() in ('NP', 'VP', 'CLAUSE'):
        # flatten() gives a one-level tree whose children are the leaves
        return list(n.flatten())
    collected = []
    for child in n:
        if isinstance(child, nltk.tree.Tree):
            collected += parse_chunks(child)
    return collected

def get_ne(n):
    '''
    Turn a named-entity chunked tree into a flat list of (token, tag) pairs.

    Tokens inside a subtree whose label is in NER_LABELS get that label as
    their tag; every other token keeps the tag it has in the tree. Anything
    that is not a tree yields an empty list.

    Examples of entity labels:
        ORGANIZATION    Georgia-Pacific Corp., WHO
        PERSON          Eddy Bonte, President Obama
        LOCATION        Murray River, Mount Everest
        DATE            June, 2008-06-29
        TIME            two fifty a m, 1:30 p.m.
        MONEY           175 million Canadian Dollars, GBP 10.40
        PERCENT         twenty pct, 18.75 %
        FACILITY        Washington Monument, Stonehenge
        GPE             South East Asia, Midlothian
        GSP
    '''
    ne = []
    if isinstance(n, nltk.tree.Tree):
        for child in n:
            if not isinstance(child, nltk.tree.Tree):
                # plain (token, tag) leaf
                ne.append(child)
            elif child.label() in NER_LABELS:
                # relabel every token of the entity with the entity type
                ne.extend((leaf[0], child.label()) for leaf in child.leaves())
            else:
                # subtree we do not track: keep its tokens as (token, tag)
                # pairs instead of appending the Tree object itself, so the
                # result stays a flat list of pairs
                ne.extend(child.leaves())
    return ne


In [None]:
# Ordered (compiled pattern, replacement) rules applied by get_words before
# tokenization. Order matters: "won’t"/"can’t" must be expanded before the
# generic "n’t" rule, and contractions before the apostrophe is stripped.
# Every rule expects the typographic apostrophe (’): get_words converts
# straight apostrophes to it before applying the rules.
_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # NOTE: inside this class "+-=" is a character RANGE ('+' to '='),
        # which is what keeps the digits, '-', ':', ';' and '<'; "0–9" is
        # written with an en dash and is therefore three literal characters.
        (r"[^A-Za-z0–9^,!.\/’+-=]", " "),
        (r"(\d+)\,(\d+)", r"\g<1>\g<2>"),
        (r"(\d+)\.(\d+)", r"\g<1>\g<2>"),
        (r"(\d+)\–(\d+)", r"\g<1> \g<2>"),
        (r"what’s", " what is "),
        (r"What’s", " What is "),
        (r"\’s", " "),
        (r"\’ve", " have "),
        (r"can’t", " can not "),
        (r"Can’t", " Can not "),
        (r"[Ww]on’t", " will not "),
        (r"n’t", " not "),
        (r"i’m", " i am "),
        (r"I’m", " I am "),
        (r"\’re", " are "),
        (r"\’d", " would "),
        (r"\’ll", " will "),
        # remaining punctuation (including the en dash) becomes a space
        (r"[;,.!?/°^+—–\-=’:]", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r" e g ", " eg "),
        (r" E g ", " eg "),
        (r" U S ", " american "),
        (r" u s ", " american "),
        # decades: "1990s" -> "1990" (the former pattern "\0s" matched a NUL
        # character followed by "s" and therefore never applied)
        (r"0s\b", "0"),
        (r"e mail", "email"),
        (r"\s{2,}", " "),
        (r"(?<=\d)(st|nd|rd|th)\b", ""),
        (r"(\d+)([A-Za-z]+)", r"\g<1> \g<2>"),
        (r"([A-Za-z]+)(\d+)", r"\g<1> \g<2>"),
    )
)


def get_words(text: str) -> List[Tuple[str, str]]:
    '''
    Perform the canonical parsing of a sentence:
    0. TEXT CLEANING (contractions expanded, punctuation removed)
    1. TOKENIZATION (- STOPWORDS)
    2. POS TAGGING (UNIVERSAL POS)
    3. LEMMATIZATION
    Only words longer than one character whose tag is in POS_NE_TAGS are kept.
    Returns:
        - parsed: list of (lowercased lemmatized word, tag) pairs
    '''
    # the cleaning rules are written for the typographic apostrophe only:
    # normalise straight apostrophes so that "can't" and "can’t" behave alike
    text = str(text).replace("'", "’")
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)

    parsed = []
    for sentence in sent_tokenize(text, language='english'):
        # word tokenizer splits the string, the POS tagger labels each token
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        # tagged = parse_chunks(chunker.parse(tagged)) # extra
        # label NE entities which do not contribute much to the context
        # tagged = get_ne(nltk.chunk.ne_chunk(tagged)) # extra
        for token, tag in tagged:
            word = token.lower()
            # drop stop words
            if word in stop_words:
                continue
            # NE labels are kept as they are, PTB tags become universal tags
            if tag not in NER_LABELS:
                tag = nltk.tag.mapping.map_tag('en-ptb', 'universal', tag)
            # lemmatize words: was -> be, the lemmatizer needs the POS
            if tag in word_net_type_dict:
                word = lemmatizer.lemmatize(word, pos=word_net_type_dict[tag])
            if tag in POS_NE_TAGS and len(word) > 1:
                parsed.append((word, tag))
    return parsed
    

In [None]:
# let's look at one example of the parsing (row 100 of the train split)
i = 100
sentence, lemma, pos, start, end = (
    train_dataframe[column]
    for column in ('sentence1', 'lemma', 'pos', 'start1', 'end1')
)

words_tags_list = get_words(sentence[i])

# raw sample first, parsed (word, tag) pairs last
print(f'SENTENCE:\n{sentence[i]}\n')
print(f'LEMMA:\n{lemma[i]}\n')
print(f'POS:\n{pos[i]}\n')
print(f'PARSING:\n{words_tags_list}\n')


In [None]:
def count_frequencies(sentences: List[str], embedding_dictionary: Optional[defaultdict] = None, vocab_size: int = 100_000, reserved_tokens: dict = RESERVED_TOKENS):
    """
    Defines the vocabulary to be used. Builds a mapping (word, index) for
    each word in the vocabulary and prints some statistics about it.
    Args:
        - sentences: raw sentences, each one is parsed with get_words
        - embedding_dictionary: optional word -> index mapping of the
          embedding model, used only to report how many vocabulary words it
          does not know (i.e. it maps to the '<unk>' index)
        - vocab_size: the (vocab_size - 1) most common words are considered
        - reserved_tokens: special tokens and their fixed indices
    Returns:
        - word2index: dictionary with mapping words-index
        - word2freq: dictionary with frequencies for each word
    """
    counter = Counter()
    for s in sentences:
        counter.update(w for (w, t) in get_words(s))
    print("Number of distinct words: {}".format(len(counter)))
    # consider only the (vocab size -1) most common words to build the vocab;
    # word indices start after the reserved ones
    n_reserved = len(reserved_tokens)
    dictionary = {key: index + n_reserved for index, (key, _)
                  in enumerate(counter.most_common(vocab_size - 1)) if key not in reserved_tokens}
    dictionary.update(reserved_tokens)
    unk_index = reserved_tokens['<unk>']
    word2index = defaultdict(lambda: unk_index, dictionary)
    # dictionary with (word, frequency) pairs -- including only words that are in the vocabulary
    frequency = {x: counter[x] for x in dictionary if x not in reserved_tokens}
    word2freq = defaultdict(lambda: 0, frequency)
    tot_occurrences = sum(frequency.values())
    print("Total occurrences of words in dictionary: {}".format(tot_occurrences))
    # max/min of an empty vocabulary would raise: skip these two statistics
    if frequency:
        most_freq_word = max(frequency, key=frequency.get)
        print("Most frequent word in dictionary appears {} times ({})".format(frequency[most_freq_word],
                                                                                most_freq_word))
        less_freq_word = min(frequency, key=frequency.get)
        print("Less frequent word in dictionary appears {} times ({})".format(frequency[less_freq_word],
                                                                                less_freq_word))
    if embedding_dictionary is not None:
        # .get() instead of indexing: indexing a defaultdict inserts every
        # missing word into the caller's embedding index. Only real words are
        # counted (the reserved '<unk>' token itself is not an unknown word).
        unk = sum(1 for w in frequency if embedding_dictionary.get(w, unk_index) == unk_index)
        print("Unknown words in embedding model are {}".format(unk))
    return word2index, word2freq

In [None]:
# let's see what is inside the parsed dataset
s1 = train_dataframe['sentence1'].copy()
s2 = train_dataframe['sentence2'].copy()
# Series.append returned a NEW series (and no longer exists in pandas 2):
# its result has to be kept, otherwise only sentence1 is counted
s1 = pd.concat([s1, s2], ignore_index=True)
word2index, word2freq = count_frequencies(s1, word_index)
print('MOST FREQUENT WORDS:\n')
Counter(word2freq).most_common(20)

In [None]:
def preprocess_sentence(sentence: str, lemma: str, pos: str, start: int, end:int) -> Tuple[List[Tuple[str, str]], int]:
    """
    Map a WiC format sample to the get_words function.

    The text before and after the target word (sentence[start:end]) is parsed
    separately with get_words and the target itself is replaced by the pair
    (lemma, pos), so that its position in the result is known exactly.
    Returns:
        - words_tags_list: list of (word, tag) pairs of the processed sentence
        - index: position of the query lemma in words_tags_list
    """
    before = get_words(sentence[:start])  # context preceding the target word
    after = get_words(sentence[end:])     # context following the target word
    words_tags_list = before + [(lemma, pos)] + after
    # the lemma sits right after the preceding context; searching it by value
    # would be wrong whenever the same word also occurs elsewhere
    index = len(before)
    return words_tags_list, index

In [None]:
# what does preprocess_sentence do? run it on the example row
words_tags_list, index = preprocess_sentence(
    sentence[i], lemma[i], pos[i], start[i], end[i]
)

# raw sample
print(f'SENTENCE:\n{sentence[i]}\n')
print(f'LEMMA:\n{lemma[i]}\n')
print(f'POS:\n{pos[i]}\n')
# processed sample
print(f'PARSING:\n{words_tags_list}\n')
print(f'QUERY LEMMA INDEX:\n{index}\n')

In [None]:
def keep_word(word: str, frequency: defaultdict, max_frequency: int) -> bool:
    '''
    Frequency based sub-sampling: randomly decide whether an occurrence of
    `word` is kept as training instance. The keep probability decreases
    linearly with the frequency of the word relative to `max_frequency`,
    from 1.0 (frequency 0) down to about 0.1 (most frequent word).
    '''
    # the small constant avoids a division by zero
    relative_frequency = frequency[word] / (max_frequency+10e-6)
    p_keep = (1 - relative_frequency)*0.9+0.1
    # toss a coin and compare it to p_keep to keep the word
    coin = np.random.rand()
    return coin < p_keep


In [None]:
def prune_most_frequent(words_tags, index, frequency):
    '''
    Randomly drop words according to keep_word (frequent words are dropped
    more often) and update the position of the query lemma.
    Args:
        - words_tags: list of (word, tag) pairs
        - index: position of the query lemma, which is always kept
        - frequency: mapping word -> frequency
    Returns:
        - pruned_words_tags: the surviving (word, tag) pairs, in order
        - index: position of the query lemma inside pruned_words_tags
    '''
    max_frequency = max(frequency.values(), default=0)
    # positions that survive the pruning; the lemma is never tossed
    saved = [i for i, (word, _) in enumerate(words_tags)
             if i == index or keep_word(word, frequency, max_frequency)]
    pruned_words_tags = [words_tags[i] for i in saved]
    # locate the lemma through its original position: searching it by value
    # returns a wrong index when the same word also survives earlier on
    index = saved.index(index)
    return pruned_words_tags, index

In [None]:
# what does prune_most_frequent do? compare the sample before and after
words_tags_list, index = preprocess_sentence(
    sentence[i], lemma[i], pos[i], start[i], end[i]
)

# before pruning
print(f'LEMMA:\n{lemma[i]}\n')
print(f'SENTENCE:\n{sentence[i]}\n')
print(f'PARSING:\n{words_tags_list}\n')
print(f'INDEX:\n{index}\n')

words_tags_list, index = prune_most_frequent(
    words_tags_list, index, word2freq
)

# after pruning
print(f'MOST FREQUENT PRUNING:\n{words_tags_list}\n')
print(f'INDEX:\n{index}\n')


In [None]:
def window_sentence(words_tags: List[Tuple[str,str]], index: int, window: int)-> Tuple[List[Tuple[str,str]], int]:
    '''
    Keep at most `window` consecutive pairs around the query lemma.
    The window is anchored window//2 positions before the lemma; when the
    lemma is within window//2 positions of the end of the sentence the window
    is anchored on the right side instead.
    Returns the sliced list and the position of the lemma inside it.
    '''
    half = window//2
    total = len(words_tags)
    if total - index <= half:
        # lemma close to the end: fix the right border first
        last = min(total, index + half)
        first = max(0, last - window)
    else:
        # enough right context: fix the left border first
        first = max(0, index - half)
        last = min(total, first + window)
    return words_tags[first:last], index - first

In [None]:
# what does window_sentence do? show the sample before and after a window of 3
print(f'LEMMA:\n{lemma[i]}')
print(f'SENTENCE:\n{sentence[i]}')

# full parsed sentence
words_tags_list, index = preprocess_sentence(
    sentence[i], lemma[i], pos[i], start[i], end[i]
)
print(f'WORDS/TAGS:\n{words_tags_list}')
print(f'INDEX:\n{index}')

# same sentence cut around the query lemma
words_tags_list, index = window_sentence(words_tags_list, index, window=3)
print(f'WORDS/TAGS:\n{words_tags_list}')
print(f'INDEX:\n{index}')


In [None]:
def sentence2indices(words_tags: List[Tuple[str,str]], dictionary: defaultdict) -> torch.Tensor:
    ''' Look up every word of the (word, tag) pairs in `dictionary` and return the indices as a long tensor '''
    word_ids = []
    for word, _tag in words_tags:
        word_ids.append(dictionary[word])
    return torch.tensor(word_ids, dtype=torch.long)
    
def sentence2tagindices(words_tags: List[Tuple[str,str]]) -> torch.Tensor:
    ''' Look up every tag of the (word, tag) pairs in POS_NE_TAGS and return the indices as a long tensor '''
    tag_ids = []
    for _word, tag in words_tags:
        tag_ids.append(POS_NE_TAGS[tag])
    return torch.tensor(tag_ids, dtype=torch.long)

def preprocess_sample(sentence1: str, sentence2: str, lemma: str, pos: str, label: str, \
    start1: int, end1: int, start2: int, end2: int, frequency: defaultdict, \
    dictionary: defaultdict, augment: bool = True, window: int = 20) -> List[Tuple[torch.Tensor, ...]]:
    '''
    Preprocess a WiC sample (two sentences sharing a lemma) into tensors.
    Args:
        - sentence1, sentence2: raw sentences
        - lemma, pos: query lemma and its POS tag
        - label: 'True'/'False' (case insensitive; a bool is accepted too)
        - start*/end*: character span of the target word in each sentence
        - frequency: word -> frequency mapping used for the random pruning
        - dictionary: word -> index mapping
        - augment: randomly prune frequent words (needs `frequency`)
        - window: maximum number of words kept around the lemma
    Returns:
        - a list holding one tuple (words1, words2, tags1, tags2, indices,
          label), or an empty list when one of the processed sentences does
          not contain more than one word
    '''
    # str() makes the comparison work for boolean labels as well
    label = torch.tensor([int(str(label).lower() == 'true')])
    words_tags1, index1 = preprocess_sentence(sentence1, lemma, pos, start1, end1)
    words_tags2, index2 = preprocess_sentence(sentence2, lemma, pos, start2, end2)

    samples = []
    if len(words_tags1)>1 and len(words_tags2)>1:
        # pruning needs the frequency table: skip it when none is available
        if augment and frequency is not None:
            words_tags1, index1 = prune_most_frequent(words_tags1, index1, frequency)
            words_tags2, index2 = prune_most_frequent(words_tags2, index2, frequency)

        words_tags1, index1  = window_sentence(words_tags1, index1, window)
        words_tags2, index2  = window_sentence(words_tags2, index2, window)
        words1 = sentence2indices(words_tags1, dictionary)
        words2 = sentence2indices(words_tags2, dictionary)
        indices = torch.tensor([index1, index2])
        tags1 = sentence2tagindices(words_tags1)
        tags2 = sentence2tagindices(words_tags2)
        samples.append((words1, words2, tags1, tags2, indices, label))
    return samples


In [None]:
class WiCDataset(Dataset):
    '''
    This class implements the Word in Context Dataset.

    Every row of `data` is preprocessed once, at construction time, with
    preprocess_sample; rows for which it returns nothing are not part of the
    dataset. When `train_set` is true and `vectors_store` is given, word
    frequencies are computed over both sentence columns and used to randomly
    prune frequent words (data augmentation).
    '''

    def __init__(
        self,
        data: pd.DataFrame, # table (dataset)
        word_index: defaultdict,
        vectors_store: Optional[torch.Tensor] = None,
        train_set: Optional[bool] = True,
        sentence1_column: str = 'sentence1',
        sentence2_column: str = 'sentence2',
        label_column: str = 'label',
        lemma_column: str = 'lemma',
        start1_column: str = 'start1',
        end1_column: str = 'end1',
        start2_column: str = 'start2',
        end2_column: str = 'end2',
        pos_column: str = 'pos'
    ):
        self.train_set = train_set
        self.sentence1 = data[sentence1_column]
        self.sentence2 = data[sentence2_column]
        self.label = data[label_column]
        self.lemma = data[lemma_column]
        self.start1 = data[start1_column]
        self.end1 = data[end1_column]
        self.start2 = data[start2_column]
        self.end2 = data[end2_column]
        self.pos = data[pos_column]

        # `is not None`: comparing a tensor with != is not an identity test
        if self.train_set and vectors_store is not None:
            # frequencies are computed over BOTH sentence columns; the result
            # of the concatenation must be kept (Series.append returned a new
            # object and no longer exists in pandas 2)
            sentences = pd.concat(
                [data[sentence1_column], data[sentence2_column]], ignore_index=True)
            _, self.frequency = count_frequencies(sentences, word_index)
            self.dictionary = copy(word_index)
            self.vectors = vectors_store.clone().detach()
        else:
            self.frequency = None
            self.dictionary = copy(word_index)

        self.samples = self._preprocess_samples()

    def _preprocess_samples(self) -> List[Tuple[torch.Tensor,torch.Tensor,torch.Tensor,torch.Tensor]]:
        # augmentation prunes words by frequency, so it is only possible when
        # the frequency table has been computed
        augment = bool(self.train_set) and self.frequency is not None
        samples = []
        for sentence1, sentence2, lemma, pst, label, start1, end1, start2, end2 in \
             zip(self.sentence1, self.sentence2, self.lemma, self.pos, self.label, \
                 self.start1, self.end1, self.start2, self.end2):
            sample = preprocess_sample(sentence1, sentence2, lemma, pst, \
                label, start1, end1, start2, end2, self.frequency, self.dictionary, augment = augment)
            samples.extend(sample)
        return samples

    def __len__(self):
        # returns the number of samples in our dataset
        return len(self.samples)

    def __getitem__(self, idx):
        # returns the idx-th sample
        return self.samples[idx]


In [None]:
# build the train set: passing the embeddings together with train_set=True
# enables the frequency based augmentation
train_dataset = WiCDataset(
    train_dataframe,
    word_index,
    vectors_store,
    train_set=True,
)

In [None]:
# build the dev set: no augmentation here
dev_dataset = WiCDataset(
    dev_dataframe,
    word_index,
    train_set=False,
)

In [None]:
# size of the train set
train_size = len(train_dataset)
print(train_size)
# content of one sample
second_sample = train_dataset[1]
print(second_sample)

In [None]:
# collate function helps us in batching the data
def rnn_collate_fn(
    data_elements: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    '''
    Batch a list of samples (words1, words2, tags1, tags2, indices, label).
    The first sentences are padded to a common length, and so are the second
    ones; each pair is then joined as [sentence1, <sep>, sentence2].
    Returns:
        - X: word indices, (batch, len1 + 1 + len2)
        - T: tag indices, same shape as X
        - X_index: position of the query lemma in each sentence, (batch, 2)
        - X_length: unpadded length of each sentence, (batch, 2)
        - y: stacked labels
    '''
    # data_elements is provided in this form (words1, words2, tags1, tags2, indices, label)
    words1, words2, tags1, tags2, indices, labels = (list(column) for column in zip(*data_elements))

    # true (unpadded) lengths of the two sentences of every sample
    X_length = torch.tensor([[len(w1), len(w2)] for w1, w2 in zip(words1, words2)])

    # pad words and tags with their own <pad> index
    X1 = torch.nn.utils.rnn.pad_sequence(words1, batch_first=True, padding_value=RESERVED_TOKENS['<pad>'])
    X2 = torch.nn.utils.rnn.pad_sequence(words2, batch_first=True, padding_value=RESERVED_TOKENS['<pad>'])
    T1 = torch.nn.utils.rnn.pad_sequence(tags1, batch_first=True, padding_value=POS_NE_TAGS['<pad>'])
    T2 = torch.nn.utils.rnn.pad_sequence(tags2, batch_first=True, padding_value=POS_NE_TAGS['<pad>'])

    batch_size = X1.shape[0]

    # one <sep> column separates the two sentences
    sep_token = torch.full((batch_size, 1), RESERVED_TOKENS['<sep>'], dtype=torch.long)
    sep_tag_token = torch.full((batch_size, 1), POS_NE_TAGS['<sep>'], dtype=torch.long)

    X_index = torch.stack(indices)
    y = torch.stack(labels)

    # let's stack everything
    X = torch.cat([X1, sep_token, X2], dim=-1)
    T = torch.cat([T1, sep_tag_token, T2], dim=-1)

    return X, T, X_index, X_length, y

In [None]:
# create the two dataloaders (only the train one is shuffled) ...
train_dataloader = DataLoader(
    train_dataset, batch_size=128, shuffle=True, collate_fn=rnn_collate_fn
)
dev_dataloader = DataLoader(
    dev_dataset, batch_size=128, shuffle=False, collate_fn=rnn_collate_fn
)

# ... and look at the content of the first dev batch only
for batch in itertools.islice(dev_dataloader, 1):
    batch_X, batch_T, batch_X_indices, batch_X_len, batch_y = batch
    print(batch_X)
    print(batch_T)
    print(batch_X_indices.shape)
    print(batch_X_len)
    print(batch_X.shape)
    print(batch_y.shape)

In [None]:
def l1_penalty(params, l1_lambda=0.001):
    """Sum the absolute values of all the entries of `params` and scale the sum by `l1_lambda`."""
    l1_norm = 0
    for tensor in params:
        l1_norm = l1_norm + tensor.abs().sum()
    return l1_lambda*l1_norm

In [None]:

class Net(nn.Module):
    '''
    Here we define our model: the two sentences of a sample are encoded by the
    same GRU on top of frozen pretrained embeddings; the outputs at the last
    token of each sentence are concatenated and classified by a small MLP.
    '''
    def __init__(self) -> None:
        super().__init__()
        n_hidden = 256
        # NOTE(review): with 6 reserved tokens and words_limit=400_000 the
        # embedding matrix can have 400_006 rows, so this slice may cut the
        # last word off and looking that word up would then fail. The slice is
        # left untouched because changing the number of rows changes the shape
        # of the stored embedding weight -- confirm before fixing.
        self.embedding = torch.nn.Embedding.from_pretrained(vectors_store[:400005])
        # sequence encoder of size 301: 300 word embedding size + 1 one hot
        # encoding of the position of query lemma
        self.rnn = torch.nn.GRU(input_size=vectors_store.size(1)+1, hidden_size=n_hidden, \
             num_layers=2, batch_first=True, dropout=0.2, bidirectional=False)

        self.norm1 = torch.nn.BatchNorm1d(n_hidden*2)
        self.fc1 = torch.nn.Linear(n_hidden*2, n_hidden)
        self.drop1 = nn.Dropout(p=0.2)
        self.norm2 = torch.nn.BatchNorm1d(n_hidden)
        self.fc2 = torch.nn.Linear(n_hidden, 1)
        self.drop2 = nn.Dropout(p=0.2)
        self.loss = nn.BCEWithLogitsLoss(reduction = 'mean')
        self.noise_mean = 0.0
        self.noise_std = vectors_store.std()

    def forward(
        self,
        X: torch.Tensor,
        X_indices: torch.Tensor,
        X_length: torch.Tensor,
        T: Optional[torch.Tensor] = None,
        y: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        '''
        Args:
            - X: word indices laid out as [sentence1, <sep>, sentence2]
            - X_indices: position of the query lemma in each sentence, (batch, 2)
            - X_length: unpadded length of each sentence, (batch, 2)
            - T: tag indices (optional); the tags are not fed to the network,
              together with `y` they only switch the input augmentation on
            - y: labels (optional); when given the loss is computed
        Returns a dict with 'pred' (sigmoid of the logits) and 'logits', plus
        'loss' and 'loss_l1' (loss + L1 penalty of fc1 and fc2) when `y` is given.
        '''
        # every tensor created here follows the input: no dependency on a
        # `device` attribute attached to the module from outside
        device = X.device
        batch_size, seq_len = X.shape
        # let's find the sep token and divide the sentences
        sep_token = torch.where(X[0]==RESERVED_TOKENS['<sep>'])[0]

        # input augmentation is active only when both tags and labels are given
        augment = y is not None and T is not None

        # here I implemented dropwords: each token is replaced by <drop> with
        # probability 0.5 (the mask is sampled on CPU, as before, so that the
        # random stream does not depend on the device)
        if augment:
            bool_mask = torch.empty(batch_size, seq_len).uniform_(0, 1).to(device) > 0.5
            X = torch.where(bool_mask, X, torch.full_like(X, RESERVED_TOKENS['<drop>']))

        # here we can separate the sentences
        X1 = X[:,:sep_token]
        X2 = X[:,sep_token+1:]

        # embedding words from indices
        embedding_out1 = self.embedding(X1)
        embedding_out2 = self.embedding(X2)

        # here I added noise to the input to improve generalization
        if augment:
            embedding_out1 +=  torch.normal(self.noise_mean, \
                1.5*self.noise_std, size=embedding_out1.shape).to(device)
            embedding_out2 +=  torch.normal(self.noise_mean, \
                1.5*self.noise_std, size=embedding_out2.shape).to(device)

        batch_size, seq_len1, _ = embedding_out1.shape
        _, seq_len2, _ = embedding_out2.shape

        # here I encode the indices of the query lemma
        target1 = torch.nn.functional.one_hot(X_indices[...,0], num_classes=seq_len1)\
            .to(torch.float32).unsqueeze(-1)
        target2 = torch.nn.functional.one_hot(X_indices[...,1], num_classes=seq_len2)\
            .to(torch.float32).unsqueeze(-1)

        embedding_out1 = torch.cat([embedding_out1, target1],dim=-1)
        embedding_out2 = torch.cat([embedding_out2, target2],dim=-1)

        # remove padding
        embedding_out1 = torch.nn.utils.rnn.pack_padded_sequence(embedding_out1, \
            lengths=X_length[...,0].cpu(), batch_first=True, enforce_sorted=False)

        embedding_out2 = torch.nn.utils.rnn.pack_padded_sequence(embedding_out2, \
            lengths=X_length[...,1].cpu(), batch_first=True, enforce_sorted=False)
        # encode the sequence
        recurrent_out1, _ = self.rnn(embedding_out1)
        recurrent_out2, _ = self.rnn(embedding_out2)
        # add padding (the padded positions are never read below, so the
        # padding value itself does not matter)
        recurrent_out1, _ = torch.nn.utils.rnn.pad_packed_sequence(recurrent_out1, \
                batch_first=True, padding_value=POS_NE_TAGS['<pad>'])

        recurrent_out2, _ = torch.nn.utils.rnn.pad_packed_sequence(recurrent_out2, \
                batch_first=True, padding_value=POS_NE_TAGS['<pad>'])

        # here we utilize the sequences length to retrieve the last token
        # output for each sequence
        batch_size, seq_len1, hidden_size = recurrent_out1.shape
        _, seq_len2, _ = recurrent_out2.shape
        # we flatten the recurrent output
        # now I have a long sequence of batch x seq_len vectors
        flattened_out1 = recurrent_out1.reshape(batch_size * seq_len1, hidden_size)
        flattened_out2 = recurrent_out2.reshape(batch_size * seq_len2, hidden_size)
        # tensor of the start offsets of each element in the batch
        sequences_offsets1 = torch.arange(batch_size, device=device) * seq_len1
        sequences_offsets2 = torch.arange(batch_size, device=device) * seq_len2
        # and we use a simple trick to compute a tensor of the indices
        # of the last token in each batch element
        vect1 = sequences_offsets1 + X_length[...,0]-1
        vect2 = sequences_offsets2 + X_length[...,1]-1

        # we retrieve the output of the last token
        out1 = flattened_out1[vect1]
        out2 = flattened_out2[vect2]

        # we concatenate the encoded sequences
        # and send them to the classifier
        out = torch.cat([out1, out2],dim=-1)
        out = self.norm1(out)
        out = self.drop1(out)
        out = self.fc1(out)
        out = torch.relu(out)
        out = self.norm2(out)
        out = self.drop2(out)
        logits = self.fc2(out)
        pred = torch.sigmoid(logits)

        result = {}
        result['pred'] = pred
        result['logits'] = logits

        # compute loss
        if y is not None:
            loss = self.loss(logits, y.to(torch.float32))
            result['loss'] = loss
            result['loss_l1'] = loss + l1_penalty(self.fc1.parameters()) \
                 + l1_penalty(self.fc2.parameters())

        return result


In [None]:
def evaluate(net: nn.Module, epoch: int) -> float:
    '''
    Evaluate the model on the dev set, printing and logging the metrics.

    Runs `net` in eval mode over every batch of `dev_dataloader`, computes
    the loss on the collected logits and the F1, precision, recall and
    accuracy scores on the rounded predictions. Each value is printed and
    sent to mlflow through `log_metric`.

    Args:
        net: model to evaluate; it must expose a `loss` callable and its
            forward must return a dict with 'logits' and 'pred' entries.
        epoch: current epoch number; accepted for the callers' convenience
            but not used in the computation.

    Returns:
        The accuracy score measured on the dev set.
    '''
    net.eval()
    pred = []
    logits = []
    label = []
    # inference only: disabling autograd avoids building a graph per batch
    with torch.no_grad():
        for X, T, X_indices, X_lengths, y in dev_dataloader:
            output = net(X.to(device), X_indices.to(device), X_lengths.to(device), T.to(device))
            logits.extend([float(o) for o in output['logits'].detach().cpu().numpy()])
            pred.extend([int(o) for o in output['pred'].detach().round().cpu().numpy()])
            label.extend([int(o) for o in y.cpu().numpy()])
    # the loss is computed once on the whole dev set, from the raw logits
    loss = net.loss(torch.tensor(logits), torch.tensor(label).to(torch.float32))
    print("= val loss: %.3f" % (loss))
    log_metric("dev loss", float(loss.numpy()))
    score = f1_score(label, pred)
    print("= f1 score: %.3f" % (score))
    log_metric("f1 score", score)
    score = precision_score(label, pred)
    print("= precision score: %.3f" % (score))
    log_metric("dev prec", score)
    score = recall_score(label, pred)
    print("= recall score: %.3f" % (score))
    log_metric("dev rec", score)
    score = accuracy_score(label, pred)
    print("= accuracy score: %.3f" % (score))
    log_metric("dev acc", score)
    # accuracy is the metric the training loop uses for model selection
    return score


In [None]:
from mlflow import log_metric, log_param, log_artifacts
import mlflow
# end the currently active mlflow run (if any) before starting a new one
mlflow.end_run()
mlflow.start_run()
from datetime import datetime

# model, optimizer and learning rate schedule used by the training cell
lr = 0.3
net = Net()
optimizer = torch.optim.Adadelta(net.parameters(), lr=lr)
# the learning rate is multiplied by 0.1 at scheduler steps 200 and 300
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[200,300], gamma=0.1)
# NOTE(review): the device is hard-coded, so this cell requires a CUDA-capable machine
device = torch.device('cuda')
net.to(device)
# forward() reads `self.device` to build its index tensors, so keep it in sync with the model
net.device = device
print("Device name: ", torch.cuda.get_device_name(torch.cuda.current_device()))

def train(net: nn.Module, optimizer: torch.optim.Optimizer, device: torch.device, epochs: int = 100, \
    accuracy_tolerance: int = 30):
    '''
    Train the model with early stopping, keeping only the best checkpoint.

    A directory named after the current timestamp is created. Whenever the
    dev accuracy returned by `evaluate` improves, a checkpoint is saved in
    that directory and the previous best checkpoint is deleted. Training
    losses and the best accuracy are printed and logged through
    `log_metric`. The module-level `scheduler` is stepped once per epoch.

    Args:
        net: model to train; its forward must return a dict holding 'loss'
            (the value reported) and 'loss_l1' (the value back-propagated).
        optimizer: optimizer updating the parameters of `net`.
        device: device on which the model and the batches are placed.
        epochs: maximum number of passes over `train_dataloader`.
        accuracy_tolerance: training stops once the number of consecutive
            epochs without accuracy improvement exceeds this value.
    '''
    import os  # local import keeps the function self-contained

    PATH = datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
    os.makedirs(PATH)
    best_accuracy = 0.0
    tol = 0
    last_path = None
    for epoch in range(epochs):  # loop over the dataset multiple times
        # evaluate() leaves the model in eval mode: switch back every epoch
        net.train().to(device)
        running_loss = 0.0
        running_tot_loss = 0.0
        for i, data in enumerate(train_dataloader):
            # each batch is [inputs, tags, indices, lengths, labels]
            X, T, X_indices, X_lengths, y = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(X.to(device), X_indices.to(device), X_lengths.to(device), T.to(device), y.to(device))
            # the L1-regularized loss drives the update
            outputs['loss_l1'].backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 5)
            optimizer.step()

            # statistics are reported on the plain (unregularized) loss
            batch_loss = outputs['loss'].item()
            running_loss += batch_loss
            running_tot_loss += batch_loss
            if i % 250 == 249:
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 250))
                running_loss = 0.0
        scheduler.step()
        log = ('='*15 + ' epoch: %d '+'='*15) % (epoch + 1)
        print(log)
        print('= train loss: %.3f' % (running_tot_loss / len(train_dataloader)))
        log_metric("train loss", running_tot_loss / len(train_dataloader))
        accuracy = evaluate(net, epoch+1)
        # keep only best models
        if accuracy > best_accuracy:
            tol = 0
            best_accuracy = accuracy
            path = os.path.join(PATH,'best-%.3f.pt'%(round(best_accuracy,3)))
            torch.save({'epoch': epoch,
                        'model_state_dict': net.state_dict(),
                        'word_index': dict(word_index),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'accuracy': accuracy},
                        path)
            # two accuracies may round to the same file name: the file has
            # then just been overwritten and must not be removed
            if last_path is not None and last_path != path:
                os.remove(last_path)
            last_path = path
        else:
            tol += 1
        print('= best accuracy: %.3f' % (best_accuracy))
        print('= tolerance: %d' % (tol))
        print('='*len(log))
        log_metric("dev best acc", best_accuracy)
        # if tolerance is exceeded stop
        if tol > accuracy_tolerance:
            print('Early stop!')
            break

    print('Finished Training')

# train for at most 400 epochs (the scheduler milestones are 200 and 300);
# the early stopping inside `train` may end the run sooner
train(net, optimizer, device, 400)

In [None]:
# clean up GPU memory after a CUDA out-of-memory error
import gc
# drop the reference to the model first: while `net` still points to it,
# the garbage collector cannot reclaim the model
net = None
gc.collect()
# release the cached blocks that are no longer occupied
torch.cuda.empty_cache()

In [None]:
# rebuild the model and restore the weights stored in a saved checkpoint
net = Net()
device = torch.device('cuda')     # Default CUDA device
net.device=device
# only the 'model_state_dict' entry of the checkpoint is used here;
# note that the model itself is not moved to `device` in this cell
net.load_state_dict(torch.load('./model/best-gru2-parameters-.697.pt')['model_state_dict'])
# switch the model to evaluation mode
net.eval()

In [None]:
def evaluate(net: nn.Module, epoch: int) -> float:
    '''
    Evaluate the model on the dev set and print the metrics.

    Runs `net` in eval mode over every batch of `dev_dataloader`, computes
    the loss on the collected logits and the F1, precision, recall and
    accuracy scores on the rounded predictions, printing each of them.

    Args:
        net: model to evaluate; it must expose a `loss` callable and its
            forward must return a dict with 'logits' and 'pred' entries.
        epoch: accepted for the callers' convenience but not used in the
            computation.

    Returns:
        The accuracy score measured on the dev set.
    '''
    net.eval()
    pred = []
    logits = []
    label = []
    # inference only: disabling autograd avoids building a graph per batch
    with torch.no_grad():
        for X, T, X_indices, X_lengths, y in dev_dataloader:
            output = net(X.to(device), X_indices.to(device), X_lengths.to(device), T.to(device))
            logits.extend([float(o) for o in output['logits'].detach().cpu().numpy()])
            pred.extend([int(o) for o in output['pred'].detach().round().cpu().numpy()])
            label.extend([int(o) for o in y.cpu().numpy()])
    # the loss is computed once on the whole dev set, from the raw logits
    loss = net.loss(torch.tensor(logits), torch.tensor(label).to(torch.float32))
    print("= val loss: %.3f" % (loss))
    score = f1_score(label, pred)
    print("= f1 score: %.3f" % (score))
    score = precision_score(label, pred)
    print("= precision score: %.3f" % (score))
    score = recall_score(label, pred)
    print("= recall score: %.3f" % (score))
    score = accuracy_score(label, pred)
    print("= accuracy score: %.3f" % (score))
    return score


In [None]:
# move the restored model to `device` and score it on the dev set
# (the epoch argument is not used by `evaluate`)
accuracy = evaluate(net.to(device), 0)

In [None]:
def confusion_matrix(net: nn.Module, epoch: int) -> np.ndarray:
    '''
    Compute the confusion matrix of the model on the dev set.

    Runs `net` in eval mode over every batch of `dev_dataloader`, using the
    same batch layout and forward call as `evaluate`, and compares the
    rounded predictions with the gold labels.

    Args:
        net: model to evaluate; its forward must return a dict with a
            'pred' entry and it must expose the `device` attribute read
            by forward.
        epoch: accepted for symmetry with `evaluate`; not used.

    Returns:
        The confusion matrix normalized over the true labels (each row
        sums to 1), as returned by sklearn.
    '''
    # aliased because this function shadows the sklearn name
    from sklearn.metrics import confusion_matrix as sk_confusion_matrix

    # forward() builds its index tensors on `net.device`, so the model and
    # the inputs must live on that same device
    run_device = net.device
    net.to(run_device).eval()
    pred = []
    label = []
    with torch.no_grad():
        # the dataloader yields five tensors per batch, as in `evaluate`
        for X, T, X_indices, X_lengths, y in dev_dataloader:
            output = net(X.to(run_device), X_indices.to(run_device),
                         X_lengths.to(run_device), T.to(run_device))
            pred.extend([int(o) for o in output['pred'].detach().round().cpu().numpy()])
            label.extend([int(o) for o in y.cpu().numpy()])
    return sk_confusion_matrix(label, pred, normalize='true')


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# confusion matrix of the model on the dev set (the second argument is unused)
cm = confusion_matrix(net,0.0)
# NOTE(review): sklearn orders the classes by sorted label value (0 first, then 1);
# confirm that label 0 really stands for 'True', otherwise these labels are swapped
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=['True','False'])
disp.plot()