In [8]:
#Import of all required libraries

from urllib.request import urlretrieve
import tarfile
from nltk.corpus import wordnet 
from nltk.tokenize import WordPunctTokenizer
import sys
from bs4 import BeautifulSoup
import os
from nltk import word_tokenize
import numpy as np
import random
import gensim
from scipy.spatial import distance


## Downloading the datasets - SemCor, Senseval-2 and Senseval-3

In [9]:
# Same methods are followed to download all 3 datasets

#file_url = 'http://web.eecs.umich.edu/~mihalcea/downloads/senseval.semcor/senseval2.semcor.tar.gz'
#fname = 'senseval2.semcor.tar.gz'

#file_url = 'http://web.eecs.umich.edu/~mihalcea/downloads/senseval.semcor/senseval3.semcor.tar.gz'
#fname = 'senseval3.semcor.tar.gz'

file_url = 'http://web.eecs.umich.edu/~mihalcea/downloads/semcor/semcor1.7.tar.gz'
fname = 'semcor1.7.tar.gz'

#Extracting the files in the datasets , needed only once for each dataset

local_filename, _ = urlretrieve(file_url, fname)
tar = tarfile.open(fname, "r:gz")
tar.extractall()
tar.close()


### Mapping the datasets, to get sentences from the dataset

### For each word in sentence, we get its lemma, pos tag and wnsn (ground truth)


In [10]:
#Initializing dictionaries to store sentences in each dataset
data_dict_semcor = {}
data_dict_senseval2 = {}
data_dict_senseval3 = {}


In [11]:
#Dictionary for mapping postags in dataset to postags in AutoExtend

dict_postags = {"NN":"n", "RB":"r", "JJ":"a", "VB":"v", "NNP":"n","NNPS":"n","NP":"n","NPS":"n", "NNS":"n","JJR":"a","JJS":"a", "RBR":"r", "RBS":"r", "VBD":"v", "VBG":"v","VBN":"v","VBZ":"v","VBP":"v","WRB":"r","MD":"v" }


In [12]:
#Reading Semcor dataset

rootdir = 'semcor1.7'
arr_sem_cor=[]
count = 0

#Retrieving the file paths
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if 'br' in file:
            var1=str(os.path.join(subdir, file))
            arr_sem_cor.append(var1)

#Reading the sentences in each file
for url in arr_sem_cor:

    
    # Make a GET request to fetch the raw HTML content
    with open(url) as fp:
        #Reading using Beautiful Soup
        soup = BeautifulSoup(fp, "html.parser")
   
    #Finding all s tags -> denotes start of a sentence
    for i in soup.findAll('s'):
        
        sentence = ""
        value = []
        sub_dict = {}
        
        #Required words are only in wf tags having cmd attribute as "done"
        for j in i.findAll('wf', {"cmd":"done"}):
            
            #Checking whether the word has a lemma attribute
            if (type(j.get("lemma")) == str):
                flag = True
                
                #Checking whether the word has id
                if (not(j.get("ot"))):
                    
                    #Forming sentence by joining the words
                    sentence = sentence + j.text
                    sentence += " "
                    
                    #If the word has a postag which can be mapped
                    if j.get("pos") in dict_postags:

                        #We form the phrase using the attribute values in the word tag (lemma-postag-wnsn )
                        phrase= j.get("lemma")+","+dict_postags[j.get("pos")]+","+j.get("wnsn")

                    else:
                        pass
                    
                    #For each word in a sentence, store the phrase as its value in a dictionary
                    sub_dict[j.text] = phrase
            else:
                flag = False
        
        
        if flag:
            
            #store each sentence in the dataset as index: [sentence, sub_dict] form
            #sub dictionary contains each word in the sentence and its phrase
            value.append(sentence)
            value.append(sub_dict)
            key = count
            data_dict_semcor[key]=value 
            count = count+1

#key - index, value - array with first value as sentence and second value the sub dictionary
#phrase is needed for finding the lexeme embedding and for checking the ground truth (wnsn value)

In [13]:
print("Length of Semcor dataset i.e, number of sentences -",len(data_dict_semcor))

Length of Semcor dataset i.e, number of sentences - 34374


In [14]:
#Taking 5000 random sentences from semcor

index_list = []
for i in range(0,5000):
    x = random.randint(1,34374)
    index_list.append(x)


In [15]:
#Reading senseval2 dataset

#The reading of the dataset and storing in dictionary is in 
#same format as the semcor dataset 

#Setting the required root
rootdir = 'senseval2.semcor'

arr_senseval2_file=[]

#Finding the file paths required
for subdir, dirs, files in os.walk(rootdir):
    
    if subdir=="senseval2.semcor/wordnet1.7.1":
        for file in files:
            var1=str(os.path.join(subdir, file))
            arr_senseval2_file.append(var1)
count=0

#for each file
for url in arr_senseval2_file:

    
    # Make a GET request to fetch the raw HTML content
    html_content = open(url).read()

    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")
   
    #Form the sentences and words with its corresponding phrases
    #Steps followed are same as done for Semcor
    
    for i in soup.findAll('s'):
        
        sentence = " "
        value = []
        sub_dict = {}
        for j in i.findAll('wf', {"cmd":"done"}):
            if (type(j.get("lemma")) == str):
                flag = True
                phrase = " "
                if (not(j.get("ot"))):
                    
                    sentence = sentence + j.text
                    sentence += " "
                    if j.get("pos") in dict_postags:

                        phrase= j.get("lemma")+","+dict_postags[j.get("pos")]+","+j.get("wnsn")

                    else:
                        pass
                    temp = j.text
                    sub_dict[j.text] = phrase
            else:
                flag = False
        
        if flag:
            value.append(sentence)
            value.append(sub_dict)
            key = count
            data_dict_senseval2[key]=value 
            count = count+1
#key - index, value - array with first value as sentence and second value the sub dictionary
#phrase - (lemma-postag-wnsn) is stored for each word in sentence as the sub dictionary


In [16]:
print("Number of sentences in senseval2- ",len(data_dict_senseval2))

Number of sentences in senseval2-  238


In [17]:
#Reading senseval3 dataset

#The steps followed for reading senseval3 dataset is similar to the ones 
#followed for the other two datasets. Hence I have not added descriptive comments here.
#please check the above cells for detailed explanation

rootdir = 'senseval3.semcor'

#finding file paths
arr_senseval3_file=[]
for subdir, dirs, files in os.walk(rootdir):
    if subdir=="senseval3.semcor/wordnet1.7.1":
        for file in files:
            var1=str(os.path.join(subdir, file))
            arr_senseval3_file.append(var1)
count=0

#Reading sentences in each file
for url in arr_senseval3_file:

    
    # Make a GET request to fetch the raw HTML content
    html_content = open(url).read()

    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")
   
    # s-tag denotes the start of a sentence
    for i in soup.findAll('s'):
        
        sentence = " "
        value = []
        sub_dict = {}
        for j in i.findAll('wf', {"cmd":"done"}):
            if (type(j.get("lemma")) == str):
                flag = True
                phrase = " "
                if (not(j.get("ot"))):
                   
                    sentence = sentence + j.text
                    sentence += " "

                    if j.get("pos") in dict_postags:

                        #phrase-(lemma-postag-wnsn) -> required for lexeme embeddings and ground truth
                        phrase= j.get("lemma")+","+dict_postags[j.get("pos")]+","+j.get("wnsn")

                    else:
                        pass

                    sub_dict[j.text] = phrase
            else:
                flag = False
        
        if flag:
            value.append(sentence)
            value.append(sub_dict)
            key = count
            data_dict_senseval3[key]=value 
            count = count+1
    #key - index, value - array with first value as sentence and second value the sub dictionary



In [18]:
print("Number of sentences in senseval3 -",len(data_dict_senseval3))

Number of sentences in senseval3 - 300


## Read the Lexeme embeddings from AutoExtend

### Pretrained embeddings are downloaded from the AutoExtend


In [19]:
#To get lexeme embeddings, we need to format the word and the corresponding synset taken in a particular way
#format :  lemma-wn-2.1-offset-postag

#We get the format of the word from the dataset  
#we have to do mapping and form the format for the particular synset using offset function

#We know the postag and lemma of the word from the dictionary containing the datasets

#Now from lexems.txt, we need to extract the 300 size vector embedding.


In [20]:
#Reading the lines in the lexemes file.
#This file contains the sense embeddings for each lemma, postag and synset

with open('embeddings 2/lexemes.txt') as f:
    lines = f.read().splitlines()


#We store the tags and embeddings from lexeme.txt to a dictionary dict
lines= lines[1:]
dict = {}
for line in lines:
    #We split and get the embedding part
    key,val = line.split(maxsplit=1)
    numbers_str = val
    numbers_list = []
   
    #Store as vector
    for num_str in numbers_str.split():
        num_int = float(num_str)
        numbers_list.append(num_int)
    
    
    #Sense embedding is stored in a dictionary with key as the tag and 
    #value as the vector embedding
    dict[key]=numbers_list


In [21]:
print("Number of lexeme embeddings-",len(dict))

Number of lexeme embeddings- 103055


### Load pre-trained word embeddings from WordNet

In [22]:
# Load Google's pre-trained Word2Vec model as given in the question
# The word embeddings are taken from this model
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)  


In [23]:
#Defining the function words to be removed. These words do not contain word embeddings and
# do not have more than one synset.

functionwords = ['about', 'across', 'against', 'along', 'around', 'at',
                 'behind', 'beside', 'besides', 'by', 'despite', 'down',
                 'during', 'for', 'from', 'in', 'inside', 'into', 'near', 'of',
                 'off', 'on', 'onto', 'over', 'through', 'to', 'toward',
                 'with', 'within', 'without', 'anything', 'everything',
                 'anyone', 'everyone', 'ones', 'such', 'it', 'itself',
                 'something', 'nothing', 'someone', 'the', 'some', 'this',
                 'that', 'every', 'all', 'both', 'one', 'first', 'other',
                 'next', 'many', 'much', 'more', 'most', 'several', 'no', 'a',
                 'an', 'any', 'each', 'no', 'half', 'twice', 'two', 'second',
                 'another', 'last', 'few', 'little', 'less', 'least', 'own',
                 'and', 'but', 'after', 'when', 'as', 'because', 'if', 'what',
                 'where', 'which', 'how', 'than', 'or', 'so', 'before', 'since',
                 'while', 'although', 'though', 'who', 'whose', 'can', 'may',
                 'will', 'shall', 'could', 'be', 'do', 'have', 'might', 'would',
                 'should', 'must', 'here', 'there', 'now', 'then', 'always',
                 'never', 'sometimes', 'usually', 'often', 'therefore',
                 'however', 'besides', 'moreover', 'though', 'otherwise',
                 'else', 'instead', 'anyway', 'incidentally', 'meanwhile',')','(','{','}','.',';',',',"'",
                '-',');',"'s",'.);','.,']

## Implement two baseline methods: most common sense and the plain Lesk algorithm.

### Most common sense algorithm

In [18]:
#Initializing
predictions = [] 
targets = []

#Function to return the most frequent sense for a lemma
def baseline(lemma):
    synset = wordnet.synsets(lemma)
    if len(synset) > 0:
        return wordnet.synsets(lemma)[0] 
    # First synset is the most common one
    else:
        return None
    


In [19]:
#Function to return the total number of words taken and the total number of correct senses matched
#Input to function - a sentence and its sub dictionary containing all the words and phrases

def most_common_sense(sentence,data_dict):
    correct = 0
    count = 0
    
    #Tokenize the sentence to get words
    tokens = set(word_tokenize(sentence))
    
    #Iterating through each word
    for tok in tokens:
        
        #Getting the most common synset
        pred = baseline(tok)
        
        #While some synset is returned
        if pred is not None:
           
            #Checking the accuracy
            
            #Retrieving ground truth for the token
            if tok not in data_dict and  tok+'.' in data_dict:
                #Adding to solve tokenization error
                tok=tok+'.'
            
            if(tok in data_dict):
                #getting the lemma, pos and wnsn
                lemma = data_dict[tok].split(',')[0]
                pos = data_dict[tok].split(',')[1]
                wnsn = data_dict[tok].split(',')[2]
                
                #Forming the true synset value
                true_synset = lemma + '.'+pos+'.'+'0'+wnsn

                   
                true_synset_arr=true_synset.split(".")
                predicted_synset_arr=pred.name().split(".")
            
            #Calculating accuracy
            
                #Some case more than one synset may be the ground truth
                if ';' in true_synset_arr[2]:
                    true_synset_arr_1=true_synset_arr[2].split(";")
                    #Form an array of all the true synsets
                    true_synset_arr_1=[i.zfill(2) for i in true_synset_arr_1]
                else:
                    true_synset_arr_1=true_synset_arr[2]
                
                #Checking whether the predicted and true synset is same 
                if (predicted_synset_arr[1]==true_synset_arr[1] and predicted_synset_arr[2] in true_synset_arr_1):
                    
                    #Incrementing the count and correct value
                    correct+=1
                    count+=1
                    
                else:
                    count+=1
                
            
        
        else:
            
            pass
        
    
    #Returning the number of true matches and total count for a sentence
    return correct, count


In [20]:
#Senseval2 computation

correct_total = 0
count_total = 0

#for each sentence
for i in data_dict_senseval2:
    
    #Find the total true matches and counts for all
    correct,count = most_common_sense(data_dict_senseval2[i][0],data_dict_senseval2[i][1])
    correct_total +=correct
    count_total +=count
    
#Accuracy calculation
accuracy = correct_total/count_total
print("Accuracy of most common sense for senseval2-", accuracy * 100)

Accuracy of most common sense for senseval2- 47.11578947368421


In [21]:
#Senseval3 computation

#Same steps as done for senseval2

correct_total = 0
count_total = 0
for i in data_dict_senseval2:
    correct,count = most_common_sense(data_dict_senseval3[i][0],data_dict_senseval3[i][1])
    correct_total +=correct
    count_total +=count
accuracy = correct_total/count_total
print("Accuracy of most common sense for senseval3-", accuracy * 100)

Accuracy of most common sense for senseval3- 45.96923076923077


In [22]:
#Semcor computation for 5000 random sentences

#Same steps as done for senseval2
correct =0
count = 0

#Take 5000 sentences
final_arr = []

for i in index_list:
    
    correct,count = most_common_sense(data_dict_semcor[i][0],data_dict_semcor[i][1])
    correct_total +=correct
    count_total +=count
accuracy = correct_total/count_total
print("Accuracy of most common sense for 5000 sentences in semcor-", accuracy * 100)
    
    


Accuracy of most common sense for 5000 sentences in semcor- 48.235331592023954


## Plain Lesk Algorithm

In [23]:
#Function to find the number of overlapping words
#Input- synset and the sentence being considered
def overlap_match( synset, sentence ):
    
    #Finding the words in the synset definition
    gloss = set(WordPunctTokenizer().tokenize(synset.definition()))
    
    #Finding the words in the synset examples and adding them to gloss
    for i in synset.examples():
         gloss.union(i)
            
    #Removing the function words from gloss
    gloss = gloss.difference( functionwords )
    
    #Removing the function words from the sentence
    if isinstance(sentence, str):
        sentence = set(sentence.split(" "))
    elif isinstance(sentence, list):
        sentence = set(sentence)
    elif isinstance(sentence, set):
        pass
    else:
        return
    sentence = sentence.difference( functionwords )
    
    #Returning the number of common words between sentence and gloss
    return len( gloss.intersection(sentence) )

#Function to return the best synset for each word in a sentence
#Input- word and its sentence
def lesk( word, sentence ):
    bestsense = None
    maxoverlap = 0
    
    #get the word
    word=wordnet.morphy(word) if wordnet.morphy(word) is not None else word
    
    #For each sense of the word in wordnet
    for sense in wordnet.synsets(word):
        
        #Find its overlap number
        overlap = overlap_match(sense,sentence)
        
        #for each hyponyms in sense, find overlap number and add it
        for h in sense.hyponyms():
            overlap += overlap_match( h, sentence )
        
        #finding the maximum overlapping sense
        if overlap > maxoverlap:
                maxoverlap = overlap
                bestsense = sense
    
    #sense having maximum overlap is returned
    return bestsense

In [24]:
#Implementing plain lesk algorithm 
#Input - each sentence and its sub dictionary containing its words and phrases
def plain_lesk(sentence, data_dict):
    
    correct = 0
    count = 0
    
    #tokenize the sentence
    tokens = set(word_tokenize(sentence))
    
    #For each word in sentence
    for tok in tokens:
        
        #Find best synset 
        best_synset = lesk(tok,sentence)
        
        #if some best synset is returned
        if best_synset is not None:
            
            #Finding the true synset from the sub dictionary
            lemma = data_dict[tok].split(',')[0]
            pos = data_dict[tok].split(',')[1]
            wnsn = data_dict[tok].split(',')[2]
            true_synset = lemma + '.'+pos+'.'+'0'+wnsn
            
            
            true_synset_arr=true_synset.split(".")
            predicted_synset_arr=best_synset.name().split(".")
            
    #Calculating accuracy
            
            
           #If there are more than one true synset, then list is made
            if ';' in true_synset_arr[2]:
                true_synset_arr_1=true_synset_arr[2].split(";")
                true_synset_arr_1=[i.zfill(2) for i in true_synset_arr_1]
            else:
                true_synset_arr_1=true_synset_arr[2]
                
            #checking if predicted and true synsets are same or not
            if (predicted_synset_arr[1]==true_synset_arr[1] and predicted_synset_arr[2] in true_synset_arr_1):
                
                #increment counters
                correct+=1
                count+=1
                
                
            else:
                count+=1
                
            pass
        
        else:
            pass
        
    
    #Return the number of correct matches and the total number  
    return correct, count

In [25]:
#Senseval2 computation for plain lesk

correct_total = 0
count_total = 0

#For each sentence in senseval2
for i in data_dict_senseval2:
    
    #Find the total number of correct matches and the total number of words 
    correct,count = plain_lesk(data_dict_senseval2[i][0],data_dict_senseval2[i][1])
    correct_total +=correct
    count_total +=count

#Final accuracy
accuracy = correct_total/count_total


print("Accuracy of plain lesk for senseval2 -", accuracy * 100)

Accuracy of plain lesk for senseval2 - 43.444365698086465


In [26]:
#Senseval3 computation for plain lesk

correct_total = 0
count_total = 0

#For each sentence in senseval3
for i in data_dict_senseval3:
    
     #Find the total number of correct matches and the total number of words 
    correct,count = plain_lesk(data_dict_senseval3[i][0],data_dict_senseval3[i][1])
    correct_total +=correct
    count_total +=count

#Final accuracy
accuracy = correct_total/count_total


print("Accuracy of plain lesk for senseval3 -", accuracy * 100)

Accuracy of plain lesk for senseval3 - 37.911646586345384


In [27]:
#Semcor computation for plain lesk

correct_total = 0
count_total = 0

#For each of the selected random 5000 sentences in semcor
for i in index_list:
    
     #Find the total number of correct matches and the total number of words 
    correct,count = plain_lesk(data_dict_semcor[i][0],data_dict_semcor[i][1])
    correct_total +=correct
    count_total +=count

#Final accuracy    
accuracy = correct_total/count_total


print("Accuracy of plain lesk for semcor -", accuracy * 100)

Accuracy of plain lesk for semcor - 38.91576799766832


## Implement the method proposed by Oele and van Noord (2017) using the pre-trained word embeddings.

In [24]:
#Function to implement the baseline paper
#Input - sentence and its sub dictionary

def fun_lesk(sentence,sub_dict):
    
    #Initialization
    correct = 0
    count = 0
    score_list = []
    disambiguated = {}
    
    #get all words in a sentence
    tokens = set(word_tokenize(sentence))
    
    #Sort the words in increasing order of number of synsets
    sorted_words = []
    for w in tokens:
        if len(wordnet.synsets(w))>0:
            sorted_words.append([w,len(wordnet.synsets(w))])
    
    #Sorted_words contain the words in the sentence in increasing order 
    #of their number of synsets available as described in the paper
    sorted_words.sort(key = lambda x: x[1])
    
    
    #for each word in the sorted list
    for word in sorted_words:
        word = word[0]
        
        #for each sense of the word
        for sense in wordnet.synsets(word):
            
            
        #Finding gloss vector for each synset of a word
            #Adding the words in the definition of synset
            gloss = set(WordPunctTokenizer().tokenize(sense.definition()))
            
            #adding the words in the examples of the synset
            for ex in sense.examples():
                tok = set(word_tokenize(ex))
                gloss = gloss.union(tok)
            
            #Now gloss contains all the words in the synset definition and example
            #removing the function words
            gloss = gloss.difference(functionwords)    
            
           
            embedd = []
            cont_emb = []
            context_tokens = []
            
            #for each word in gloss, we find its word embedding and append it
            for i in gloss:
                if i in model.key_to_index:
                    embedd.append(model[i])
                else:
                    pass
            
            #Gloss vector - average of all the word embeddings in the gloss
            #Gloss vector is a 300 vector size embedding
            gloss_vec = [sum(vals)/len(embedd) for vals in zip(*embedd)]  

        
        #To get context embedding for the corresponding word
            
            #tokenize the given sentence and remove the word which is being considered
            #we remove the function words also
            #context will contain all the other words in the sentence
            sen_tokens = set(word_tokenize(sentence))
            sen_tokens = sen_tokens.difference(functionwords)
            
            
            #append all the context words to an array
            for i in sen_tokens:
                if(i!= word):
                    context_tokens.append(i)
           
            #for each word in context, find its embedding
            for i in context_tokens :
                
                #if word already has a predicted synset,
                #if the word in the context was considered previously, then we take
                #the predicted synset's gloss embedding - given in paper
                
                if i in disambiguated:
                    #check the predicted synset value of the word
                    pred = disambiguated[i]
                    
                #find gloss embedding
                    #definition words
                    new_gloss = set(WordPunctTokenizer().tokenize(pred.definition()))
                    
                    #example words
                    for ex in pred.examples():
                        tok = set(word_tokenize(ex))
                        new_gloss = new_gloss.union(tok)
            
            
                    new_gloss = new_gloss.difference(functionwords)    
                    
                    #for each word in gloss, find the word embedding if it is present in the 
                    #word2vec model and append all embeddings to an array
                    for k in new_gloss:
                    
                        if k in model.key_to_index:
    
                            cont_emb.append(model[k])

                
            #if word has not been disambiguated yet, then just take its word embedding from word2vec model 
                else:
                    if i in model.key_to_index:
                        cont_emb.append(model[i])

            
            #context vector for the given word
            #find average of the array storing context embeddings
            context_vec = [sum(vals)/len(cont_emb) for vals in zip(*cont_emb)]   

            
            #Score1
            #cosine of gloss and context vectors - first part
            if(len(gloss)== 0 or len(cont_emb)== 0 or len(embedd)==0): #check for null
            
                sim1 = 0
                
            else:
                sim1 = distance.cosine(gloss_vec,context_vec)
    
    
        #Find Lexeme embedding of word
            if(word in sub_dict):
                
                #get lexeme format from sub dict
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]
                format = lemma+"-wn-2.1-"+str(sense.offset()).zfill(8)+"-"+pos
                

                #extract corresponding lexeme embedding from lexeme.txt file
                # which is stored as dictionary dict with key and value
                if((format in dict) and (len(cont_emb)!=0)): #check null
                    lexeme_embed = dict[format]

                    #Score2
                    #cosine of lexeme embeddings and context vector
                    sim2 = distance.cosine(lexeme_embed,context_vec)
                else:
                    sim2 = 0
            else:
                    sim2 = 0
            
            
            # score value - from paper
            score = sim1 + sim2 
            
            #append the scores of all senses of the word to a list
            score_list.append(score)
         
        #If more than one synset is there for a word, find the best score
        if(len(wordnet.synsets(word))!=0):
            
            #Finding index of max score and finding synset of word in that index
            max_score_index = np.argmax(score_list)
            pred_synset = wordnet.synsets(word)[max_score_index]
            
            #storing the predicted synset for future purposes (context embedding)
            disambiguated[word] = pred_synset
            
           
            
            if word not in sub_dict and  word+'.' in sub_dict: #to solve error in tokenization
                word=word+'.'
            
            #if the corresponding word occurs in sub dictionary
            if word in sub_dict:
                
                #Finding the true synset value from the phrase (wnsn)
                wnsn = sub_dict[word].split(',')[2]
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]

                true_syn = lemma+"."+pos+".0"+wnsn
               
                true_synset_arr=true_syn.split(".")
                predicted_synset_arr=pred_synset.name().split(".")


            # Sometimes more than one true synset occurs, then store in list
                if ';' in true_synset_arr[2]:
                    true_synset_arr_1=true_synset_arr[2].split(";")
                    true_synset_arr_1=[i.zfill(2) for i in true_synset_arr_1]
                else:
                    true_synset_arr_1=true_synset_arr[2]

            #Check whether predicted and true synset values are same or not
            #update counters accordingly
                if (predicted_synset_arr[1]==true_synset_arr[1] and predicted_synset_arr[2] in true_synset_arr_1):
                    correct+=1
                    count+=1
                    
                else:
                    count+=1
                    
                score_list = []
            else:
                score_list = []
        else:
            score_list = []
        
    
#return the number of correct matches and the total count of words considered in the sentence
    return correct, count

## Evaluate the method on the given three datasets

#### Evaluation is similar to what was done for plain lesk and most common sense, only difference is the function being called

In [29]:
#Senseval2 computation
correct =0
count = 0
correct_total = 0
count_total = 0

#for each sentence in dataset
for i in data_dict_senseval2:
    
    correct,count = fun_lesk(data_dict_senseval2[i][0],data_dict_senseval2[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy of Distributional Lesk for senseval2-",accuracy*100)

Accuracy of Distributional Lesk for senseval2- 36.46315789473684


In [30]:
#Senseval3 computation
correct =0
count = 0
correct_total = 0
count_total = 0

#for each sentence in dataset
for i in data_dict_senseval3:
    
    correct,count = fun_lesk(data_dict_senseval3[i][0],data_dict_senseval3[i][1])
    count_total += count
    correct_total += correct

accuracy = correct_total/count_total
print("Accuracy of Distributional Lesk for senseval3-",accuracy*100)

Accuracy of Distributional Lesk for senseval3- 37.1973209685729


In [31]:
#Semcor computation

correct_total = 0
count_total = 0

#for each of the 5000 sentences selected
for i in index_list:
    correct,count = fun_lesk(data_dict_semcor[i][0],data_dict_semcor[i][1])
    correct_total +=correct
    count_total +=count
accuracy = correct_total/count_total


print("Accuracy of Distributional Lesk for semcor -", accuracy * 100)

Accuracy of Distributional Lesk for semcor - 36.907521246934735


# Extensions Implementation


## 1. Experiment with removing stopwords and punctuation from the dictionary glosses, sense descriptions and contexts in the occurrences of the words before measuring the distance

In [32]:
#Import nltk stop words
from nltk.corpus import stopwords

#store stopwords in a list
stopwords_list = stopwords.words('english')
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [50]:
#Function to remove stopwords and punctuation from gloss, sense and context embeddings.

#This function is similar to the baseline function, only difference is that
# stop words are being removed before finding each of the embeddings
#Input and output of the function remains the same
def fun_lesk_stopwords(sentence,sub_dict):
    
    #get all words in a sentence
    correct = 0
    count = 0
    score_list = []
    disambiguated = {}
    sorted_words = []
    
    #tokenize the words and remove the stopwords
    tokens = set(word_tokenize(sentence))
    
    tokens = tokens.difference(stopwords_list)
    
    #tokens contain no stopwords
    #Sort the words in increasing order of number of synsets    
    for w in tokens:
        if len(wordnet.synsets(w))>0:
            sorted_words.append([w,len(wordnet.synsets(w))])

    sorted_words.sort(key = lambda x: x[1])
    
    
    #for each word
    for word in sorted_words:
        word = word[0]
        
        #for each sense of the word
        for sense in wordnet.synsets(word):
           
        #Finding gloss vector for each synset of a word
            #gloss definition
            gloss = set(WordPunctTokenizer().tokenize(sense.definition()))
            #gloss examples
            for ex in sense.examples():
                tok = set(word_tokenize(ex))
                gloss = gloss.union(tok)
            
            #remove stopwords and function words from gloss
            gloss = gloss.difference(functionwords)  
            gloss = gloss.difference(stopwords_list)
            
            embedd = []
            cont_emb = []
            context_tokens = []
            
            #append word embeddings for all words in gloss
            for i in gloss:
                if i in model.key_to_index:
                    embedd.append(model[i])
                else:
                       pass
             
            #Gloss vec- avg of all word embeddings in gloss
            gloss_vec = [sum(vals)/len(embedd) for vals in zip(*embedd)]  

        #To get context embedding for the corresponding word
            
            #Tokenize sentence and take all the words except the current word
            sen_tokens = set(word_tokenize(sentence))
            sen_tokens = sen_tokens.difference(functionwords)
            
            #remove stop words from context
            sen_tokens = sen_tokens.difference(stopwords_list)
            for i in sen_tokens:
                if(i!= word):
                    context_tokens.append(i)
            
            #for all context words
            for i in context_tokens :
                
                #if word already has a predicted synset
                if i in disambiguated:
                    
                    #find predicted sense 
                    pred = disambiguated[i]
                    
                    #Find gloss for predicted synset
                    new_gloss = set(WordPunctTokenizer().tokenize(pred.definition()))
                    for ex in pred.examples():
                        tok = set(word_tokenize(ex))
                        new_gloss = new_gloss.union(tok)
            
            
                    new_gloss = new_gloss.difference(functionwords) 
                #removing stopwords here also
                    new_gloss = new_gloss.difference(stopwords_list)
                    
                    #finding word embeddings for all words in gloss and appending
                    for k in new_gloss:
                        if k in model.key_to_index:
                            cont_emb.append(model[k])

                    
                #word is not disambiguated, find its word embedding 
                else:
                    if i in model.key_to_index:
                        cont_emb.append(model[i])

            #context vector for the given word - average of context embeddings
            context_vec = [sum(vals)/len(cont_emb) for vals in zip(*cont_emb)]   

            #Score 1
            #cosine of gloss and context vectors - first part
            if(len(gloss)== 0 or len(cont_emb)== 0 or len(embedd)==0):#nullcheck
            
                sim1 = 0
                
            else:
                sim1 = distance.cosine(gloss_vec,context_vec)
    
            if(word in sub_dict):
            #Getting lexeme embeddings - similar to baseline (no change)
                #lexeme format from sub dict
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]
                format = lemma+"-wn-2.1-"+str(sense.offset()).zfill(8)+"-"+pos
                

                #extract corresponding lexeme embedding from lexeme.txt file
                # which isstored as dictionary dict with key and value
                if((format in dict) and (len(cont_emb)!=0)): #null check
                    lexeme_embed = dict[format]

                    #Score 2
                    #cosine of lexeme embeddings and context vector
                    sim2 = distance.cosine(lexeme_embed,context_vec)
                else:
                    sim2 = 0
            else:
                    sim2 = 0
            
            # score value
            score = sim1 + sim2 
           
            score_list.append(score)
           
        #Finding maximum score among all synsets
        if(len(wordnet.synsets(word))!=0):
            
            #Finding predicted synset and storing it
            max_score_index = np.argmax(score_list) 
            pred_synset = wordnet.synsets(word)[max_score_index]

            disambiguated[word] = pred_synset
            
                
            if word not in sub_dict and  word+'.' in sub_dict:
                word=word+'.'
            
            #True synset calculation
            if word in sub_dict:
                wnsn = sub_dict[word].split(',')[2]
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]

                true_syn = lemma+"."+pos+".0"+wnsn


                true_synset_arr=true_syn.split(".")
                predicted_synset_arr=pred_synset.name().split(".")


                #When more than one true synset is present
                if ';' in true_synset_arr[2]:
                    true_synset_arr_1=true_synset_arr[2].split(";")
                    true_synset_arr_1=[i.zfill(2) for i in true_synset_arr_1]
                else:
                    true_synset_arr_1=true_synset_arr[2]

                #comparison of true and predicted synset
                if (predicted_synset_arr[1]==true_synset_arr[1] and predicted_synset_arr[2] in true_synset_arr_1):
                    correct+=1
                    count+=1
                    
                else:
                    count+=1
                   
                score_list = []
            else:
                score_list = []
        else:
            score_list = []
        
    
    #return the matching and total count    
    return correct, count

### Evaluation on 3 datasets

#### Evaluation is the same method for all datasets as seen in baseline

In [51]:
#Senseval2 computation - stopwords

correct =0
count = 0
correct_total = 0
count_total = 0
for i in data_dict_senseval2:
    correct,count = fun_lesk_stopwords(data_dict_senseval2[i][0],data_dict_senseval2[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy after removing stopwords on senseval2 -",accuracy*100)

Accuracy after removing stopwords on senseval2 - 35.69804456571169


In [52]:
#Senseval3 computation - stopwords

correct =0
count = 0
correct_total = 0
count_total = 0
for i in data_dict_senseval3:
    correct,count = fun_lesk_stopwords(data_dict_senseval3[i][0],data_dict_senseval3[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy after removing stopwords on senseval3 -",accuracy*100)

Accuracy after removing stopwords on senseval3 - 36.348219332956475


In [53]:
#Semcor computation - stopwords

correct =0
count = 0
correct_total = 0
count_total = 0
#Take 5000 sentences
for i in index_list:
    correct,count = fun_lesk_stopwords(data_dict_semcor[i][0],data_dict_semcor[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy after removing stopwords on semcor -",accuracy*100)

Accuracy after removing stopwords on semcor - 36.70728015740881


### 2. SemCor data come from the Brown corpus. The Brown corpus consists of texts from different text categories (see e.g. https://www1.essex.ac.uk/ linguistics/external/clmt/w3c/corpus_ling/content/corpora/list/private/brown/brown.html). Evaluate the results for individual categories.

In [61]:
#Import the Brown corpus from nltk
from nltk.corpus import brown

#To get all the brown categories
brown.categories()

#After trial and error, categories having most number of sentences-
print(brown.fileids(['hobbies']))
print(brown.fileids(['news']))
print(brown.fileids(['adventure']))

rootdir = 'semcor1.7'

['ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', 'ce08', 'ce09', 'ce10', 'ce11', 'ce12', 'ce13', 'ce14', 'ce15', 'ce16', 'ce17', 'ce18', 'ce19', 'ce20', 'ce21', 'ce22', 'ce23', 'ce24', 'ce25', 'ce26', 'ce27', 'ce28', 'ce29', 'ce30', 'ce31', 'ce32', 'ce33', 'ce34', 'ce35', 'ce36']
['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', 'ca08', 'ca09', 'ca10', 'ca11', 'ca12', 'ca13', 'ca14', 'ca15', 'ca16', 'ca17', 'ca18', 'ca19', 'ca20', 'ca21', 'ca22', 'ca23', 'ca24', 'ca25', 'ca26', 'ca27', 'ca28', 'ca29', 'ca30', 'ca31', 'ca32', 'ca33', 'ca34', 'ca35', 'ca36', 'ca37', 'ca38', 'ca39', 'ca40', 'ca41', 'ca42', 'ca43', 'ca44']
['cn01', 'cn02', 'cn03', 'cn04', 'cn05', 'cn06', 'cn07', 'cn08', 'cn09', 'cn10', 'cn11', 'cn12', 'cn13', 'cn14', 'cn15', 'cn16', 'cn17', 'cn18', 'cn19', 'cn20', 'cn21', 'cn22', 'cn23', 'cn24', 'cn25', 'cn26', 'cn27', 'cn28', 'cn29']


In [70]:
print(brown.categories())
print(len(brown.fileids(['belles_lettres'])))
print(brown.fileids(['belles_lettres']))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
75
['cg01', 'cg02', 'cg03', 'cg04', 'cg05', 'cg06', 'cg07', 'cg08', 'cg09', 'cg10', 'cg11', 'cg12', 'cg13', 'cg14', 'cg15', 'cg16', 'cg17', 'cg18', 'cg19', 'cg20', 'cg21', 'cg22', 'cg23', 'cg24', 'cg25', 'cg26', 'cg27', 'cg28', 'cg29', 'cg30', 'cg31', 'cg32', 'cg33', 'cg34', 'cg35', 'cg36', 'cg37', 'cg38', 'cg39', 'cg40', 'cg41', 'cg42', 'cg43', 'cg44', 'cg45', 'cg46', 'cg47', 'cg48', 'cg49', 'cg50', 'cg51', 'cg52', 'cg53', 'cg54', 'cg55', 'cg56', 'cg57', 'cg58', 'cg59', 'cg60', 'cg61', 'cg62', 'cg63', 'cg64', 'cg65', 'cg66', 'cg67', 'cg68', 'cg69', 'cg70', 'cg71', 'cg72', 'cg73', 'cg74', 'cg75']


#### Reading files in category belles_lettres

In [71]:
#Files in category belles_lettres

#storing required file paths
arr_sem_cor_file_belle=[]
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #found from documentation
        if 'br-g' in file:
            var1=str(os.path.join(subdir, file))
            arr_sem_cor_file_belle.append(var1)


In [72]:
counter=0
data_dict_semcor_belle = {}

#Reading and storing sentences in belle category files

#Steps to read and store sentences and subdictionary containing words in sentence and its phrase
#is same as done in earlier cases - for baseline
#only the file paths change - corresponding category only
for file in arr_sem_cor_file_belle:
    
    with open(file) as fp:
        soup = BeautifulSoup(fp, "html.parser")
    
    tags = soup.find_all("s")
    for i in tags:
        tag=i.find_all("wf", {"cmd":"done"})
        
        sen1=""
        dict3={}
        for j in tag:
            if type(j.get("lemma"))==str:
                flag=True
                if (not (j.get("ot"))):
                    sen1+=j.text
                    sen1+=" "
                    key=j.text
                    if j.get("pos") in dict_postags.keys():
                        pos=dict_postags[j.get("pos")]
                    else:
                        print("Pos not available")
                    value=j.get("lemma")+","+pos+","+j.get("wnsn")
                    dict3[key]=value
            else:
                flag=False
        
        if flag:
            key=counter
            value=sen1
            data_dict_semcor_belle[key]=[value,dict3]
            counter+=1
          

In [73]:
print("Number of sentences in Belles_lettres semcor category-",len(data_dict_semcor_belle))

Number of sentences in Belles_lettres semcor category- 2340


#### Reading files in category news

In [38]:
#Files in category News

#storing required file paths
arr_sem_cor_file_news=[]
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #found from documentation
        if 'br-a' in file:
            var1=str(os.path.join(subdir, file))
            arr_sem_cor_file_news.append(var1)


In [40]:
counter=0
data_dict_semcor_news = {}

#Reading and storing sentences in news category files

#Steps to read and store sentences and subdictionary containing words in sentence and its phrase
#is same as done in earlier cases - for baseline
#only the file paths change - corresponding category only
for file in arr_sem_cor_file_news:
    
    with open(file) as fp:
        soup = BeautifulSoup(fp, "html.parser")
    
    tags = soup.find_all("s")
    for i in tags:
        tag=i.find_all("wf", {"cmd":"done"})
        
        sen1=""
        dict3={}
        for j in tag:
            if type(j.get("lemma"))==str:
                flag=True
                if (not (j.get("ot"))):
                    sen1+=j.text
                    sen1+=" "
                    key=j.text
                    if j.get("pos") in dict_postags.keys():
                        pos=dict_postags[j.get("pos")]
                    else:
                        print("Pos not available")
                    value=j.get("lemma")+","+pos+","+j.get("wnsn")
                    dict3[key]=value
            else:
                flag=False
        
        if flag:
            key=counter
            value=sen1
            data_dict_semcor_news[key]=[value,dict3]
            counter+=1
          

In [41]:
print("Number of sentences in News semcor category-",len(data_dict_semcor_news))

Number of sentences in News semcor category- 3819


#### Reading files in category hobbies

In [42]:
#Files in category Hobbies

arr_sem_cor_file_hobbies=[]
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if 'br-e' in file:
            var1=str(os.path.join(subdir, file))
            arr_sem_cor_file_hobbies.append(var1)


In [43]:
counter=0
data_dict_semcor_hobbies = {}

#Reading and storing sentences in hobbies category files

#Steps to read and store sentences and subdictionary containing words in sentence and its phrase
#is same as done in earlier cases - for baseline
#only the file paths change - corresponding category only

for file in arr_sem_cor_file_hobbies:
    with open(file) as fp:
        soup = BeautifulSoup(fp, "html.parser")
    
    tags = soup.find_all("s")
    
    for i in tags:
        tag=i.find_all("wf", {"cmd":"done"})
        
        sen1=""
        dict3={}
        for j in tag:
            if type(j.get("lemma"))==str:
                flag=True
                if (not (j.get("ot"))):
                    sen1+=j.text
                    sen1+=" "
                    key=j.text
                    if j.get("pos") in dict_postags.keys():
                        pos=dict_postags[j.get("pos")]
                    else:
                        print("Pos not available")
                    value=j.get("lemma")+","+pos+","+j.get("wnsn")
                    dict3[key]=value
            else:
                flag=False
        
        if flag:
            key=counter
            value=sen1
            data_dict_semcor_hobbies[key]=[value,dict3]
            counter+=1
          

In [44]:
print("Number of sentences in Hobbies semcor category-",len(data_dict_semcor_hobbies))

Number of sentences in Hobbies semcor category- 2963


#### Reading files in category adventure

In [45]:
#Files in category adventure

arr_sem_cor_file_adv=[]
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if 'br-n' in file:
            var1=str(os.path.join(subdir, file))
            arr_sem_cor_file_adv.append(var1)


In [46]:
counter=0
data_dict_semcor_adv = {}

#Reading and storing sentences in adventure category files

#Steps to read and store sentences and subdictionary containing words in sentence and its phrase
#is same as done in earlier cases - for baseline
#only the file paths change - corresponding category only

for file in arr_sem_cor_file_adv:
    with open(file) as fp:
        soup = BeautifulSoup(fp, "html.parser")
    
    tags = soup.find_all("s")
    
    for i in tags:
        tag=i.find_all("wf", {"cmd":"done"})
        
        sen1=""
        dict3={}
        for j in tag:
            if type(j.get("lemma"))==str:
                flag=True
                if (not (j.get("ot"))):
                    sen1+=j.text
                    sen1+=" "
                    key=j.text
                    if j.get("pos") in dict_postags.keys():
                        pos=dict_postags[j.get("pos")]
                    else:
                        print("Pos not available")
                    value=j.get("lemma")+","+pos+","+j.get("wnsn")
                    dict3[key]=value
            else:
                flag=False
        
        if flag:
            key=counter
            value=sen1
            data_dict_semcor_adv[key]=[value,dict3]
            counter+=1
          

In [47]:
print("Number of sentences in Adventure semcor category-",len(data_dict_semcor_adv))

Number of sentences in Adventure semcor category- 2592


### Evaluation on News category

#### Steps for evaluation are same as done in baseline, the same baseline function is called here
#### The dataset taken is restricted to news category


In [48]:
#Semcor news computation
correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_semcor_news:
    
    correct,count,acc_list = fun_lesk(data_dict_semcor_news[i][0],data_dict_semcor_news[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy of Distributational lesk on News category -",accuracy*100)


Accuracy of Distributational lesk on News category - 34.03571695434506


### Evaluation on Hobbies category

#### Steps for evaluation are same as done in baseline, the same baseline function is called here
#### The dataset taken is restricted to hobbies category

In [49]:
#Semcor hobbies computation
correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_semcor_hobbies:
    
    correct,count,acc_list = fun_lesk(data_dict_semcor_hobbies[i][0],data_dict_semcor_hobbies[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy of Distributational lesk on Hobbies category -",accuracy*100)


Accuracy of Distributational lesk on Hobbies category - 35.78947368421053


### Evaluation on Adventure category


#### Steps for evaluation are same as done in baseline, the same baseline function is called here
#### The dataset taken is restricted to adventure category

In [54]:
#Semcor Adventure computation
correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_semcor_adv:
    
    correct,count,acc_list = fun_lesk(data_dict_semcor_adv[i][0],data_dict_semcor_adv[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy of Distributational lesk on Adventure category -",accuracy*100)


Accuracy of Distributational lesk on Adventure category - 37.45493871665465


### Evaluation on Belles_Lettres category


#### Steps for evaluation are same as done in baseline, the same baseline function is called here
#### The dataset taken is restricted to belles_lettres category

In [75]:
#Semcor Belles_lettres computation
correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_semcor_belle:
    
    correct,count = fun_lesk(data_dict_semcor_belle[i][0],data_dict_semcor_belle[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy of Distributational lesk on Belle_lettres category -",accuracy*100)


Accuracy of Distributational lesk on Belle_lettres category - 37.42560328968727


## 3. Train your own word embeddings for this task, possibly initializing the embeddings with pre-trained embeddings.

In [40]:
#import required libraries
import multiprocessing
cores = multiprocessing.cpu_count()

#Word2vec model of vector_size 300, window size 3 and minimum count 1 is taken

model1 = gensim.models.Word2Vec(min_count=1,
                                   window=3,
                                   vector_size=300, 
                                   workers=cores,
                                   sg=1
                                   )

In [51]:


##Building Vocab for training our own model by combining all the datasets
dict_vocab_list=[data_dict_semcor,data_dict_senseval2,data_dict_senseval3]
sentences_full=[]

for dict_item in dict_vocab_list:
    for j in dict_item.keys():
        sentences_full.append(word_tokenize(dict_item[j][0]))       


In [55]:
# defining the vocabulary based on our data

#data for vocab is an array of tokenized sentences
model1.build_vocab(sentences_full, progress_per=10000)



# training the model
model1.train(sentences_full, total_examples=model1.corpus_count, epochs=10, report_delay=1)
model1.init_sims(replace=True)


  model1.init_sims(replace=True)


In [43]:
#Extending the baseline function to handle our trained model. Here word embeddings are accesed by
# model1.wv

#Rest all steps remain the same
def fun_lesk_own_embeddings(sentence,sub_dict):
    
    #get all words in a sentence
    correct = 0
    count = 0
    score_list = []
    sorted_words = []
    disambiguated = {}
    
    #tokenize sentence
    tokens = set(word_tokenize(sentence))
    
    #Sort the words in increasing order of number of synsets  
    for w in tokens:
        if len(wordnet.synsets(w))>0:
            sorted_words.append([w,len(wordnet.synsets(w))])

    sorted_words.sort(key = lambda x: x[1])
    
    #for each word
    for word in sorted_words:
        word = word[0]
        
        #for each sense of the word       
        for sense in wordnet.synsets(word):
            
            
            #Finding gloss vector for each synset of a word
            #gloss - definition and example words
            gloss = set(WordPunctTokenizer().tokenize(sense.definition()))
            
            for ex in sense.examples():
                tok = set(word_tokenize(ex))
                gloss = gloss.union(tok)
            
            gloss = gloss.difference(functionwords)    
            

            embedd = []
            cont_emb = []
            context_tokens = []
            
            #Find newly trained model embeddings for the words in gloss
            for i in gloss:
                if i in model1.wv.key_to_index:
                    embedd.append(model1.wv[i])
                else:
                    pass
            
            #gloss vector calculation
            gloss_vec = [sum(vals)/len(embedd) for vals in zip(*embedd)]  

            #To get context embedding for the corresponding word

            sen_tokens = set(word_tokenize(sentence))
            sen_tokens = sen_tokens.difference(functionwords)
            
            #Get context words
            for i in sen_tokens:
                if(i!= word):
                    context_tokens.append(i)
            

            for i in context_tokens :
                
                
                #if word already has a predicted synset
                if i in disambiguated:
                    pred = disambiguated[i]
                    #find gloss embedding
                    new_gloss = set(WordPunctTokenizer().tokenize(pred.definition()))
                    for ex in pred.examples():
                        tok = set(word_tokenize(ex))
                        new_gloss = new_gloss.union(tok)
            
            
                    new_gloss = new_gloss.difference(functionwords)    
                    for k in new_gloss:
                    #Find embedding from new model
                        if k in model1.wv.key_to_index:

                            cont_emb.append(model1.wv[k])  
                
                #if word is not disambiguated, then use new model to get context embedding
                else:
                    if i in model1.wv.key_to_index:

                        cont_emb.append(model1.wv[i])

            #context vector for the given word
            context_vec = [sum(vals)/len(cont_emb) for vals in zip(*cont_emb)]   

            #score1
            #cosine of gloss and context vectors - first part
            if(len(gloss)== 0 or len(cont_emb)== 0 or len(embedd)==0):#null check
            
                sim1 = 0
                
            else:
                sim1 = distance.cosine(gloss_vec,context_vec)
    
    #Get lexeme embeddings - same as baseline
            if(word in sub_dict):
                #lexeme format from sub dict
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]
                format = lemma+"-wn-2.1-"+str(sense.offset()).zfill(8)+"-"+pos
                

                #extract corresponding lexeme embedding from lexeme.txt file
                # and stored as dictionary dict with key and value
                if((format in dict) and (len(cont_emb)!=0)):
                    lexeme_embed = dict[format]

                    #score2
                    #cosine of lexeme embeddings and context vector
                    sim2 = distance.cosine(lexeme_embed,context_vec)
                else:
                    sim2 = 0
            else:
                    sim2 = 0
            # score value
            score = sim1 + sim2 
            
            score_list.append(score)
        
        #Finding predicted synset
        if(len(wordnet.synsets(word))!=0):
           
            max_score_index = np.argmax(score_list)
            pred_synset = wordnet.synsets(word)[max_score_index]
            disambiguated[word] = pred_synset
            
           
            if word not in sub_dict and  word+'.' in sub_dict:
                word=word+'.'
            
            #Finding true synset
            if word in sub_dict:
                
                wnsn = sub_dict[word].split(',')[2]
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]

                true_syn = lemma+"."+pos+".0"+wnsn
              
                true_synset_arr=true_syn.split(".")
                predicted_synset_arr=pred_synset.name().split(".")

            #If true synset has more than 1 value, then use list
                if ';' in true_synset_arr[2]:
                    true_synset_arr_1=true_synset_arr[2].split(";")
                    true_synset_arr_1=[i.zfill(2) for i in true_synset_arr_1]
                else:
                    true_synset_arr_1=true_synset_arr[2]

            #Compare predicted and true synset values
                if (predicted_synset_arr[1]==true_synset_arr[1] and predicted_synset_arr[2] in true_synset_arr_1):
                    correct+=1
                    count+=1
                    
                else:
                    count+=1
                    
                score_list = []
            else:
                score_list = []
        else:
            score_list = []
        
    #return the number of correct matches and the total count       
    return correct, count

### Evaluation of datasets

#### Evaluation step is same as that for baseline

In [60]:
#Senseval2 computation - own embeddings
correct =0
count = 0
correct_total = 0
count_total = 0
for i in data_dict_senseval2:
    correct,count = fun_lesk_own_embeddings(data_dict_senseval2[i][0],data_dict_senseval2[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using our own embeddings on senseval2 -",accuracy*100)

Accuracy using our own embeddings on senseval2 - 37.68421052631579


In [61]:
#Senseval3 computation - own embeddings
correct =0
count = 0
correct_total = 0
count_total = 0
for i in data_dict_senseval3:
    correct,count = fun_lesk_own_embeddings(data_dict_senseval3[i][0],data_dict_senseval3[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using our own embeddings on senseval3 -",accuracy*100)

Accuracy using our own embeddings on senseval3 - 32.045337454920144


In [62]:
#Semcor computation - own embeddings
correct =0
count = 0
correct_total = 0
count_total = 0
for i in index_list:
    correct,count = fun_lesk_own_embeddings(data_dict_semcor[i][0],data_dict_semcor[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using our own embeddings on semcor -",accuracy*100)

Accuracy using our own embeddings on semcor - 36.242399811884844


## 4. Use several pre-trained embeddings or train embeddings with various parameter settings (you probably need to make big changes so you actually get significantly different results for WSD) and study the influence of the used embeddings on the disambiguation task

In [57]:
#Here I experimented mainly with window size. When I decrease window size, accuracy decreased
#On increasing window size, accuracy also increases but only upto a certain point.

#Min count and vector size cannot be change as it needs to be compatable with the lexeme embeddings
model1 = gensim.models.Word2Vec(min_count=1,
                                   window=6,
                                   vector_size=300, 
                                   workers=cores,
                                   sg=1
                                   )

### Evaluation on 3 datasets

#### Only the model used changes, everything else remains the same

In [58]:
#Senseval2 computation - own embeddings

correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_senseval2:
    correct,count = fun_lesk_own_embeddings(data_dict_senseval2[i][0],data_dict_senseval2[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using our own embeddings on senseval2 with parameter change -",accuracy*100)

Accuracy using our own embeddings on senseval2 with parameter change - 47.11578947368421


In [46]:
#Senseval3 computation - own embeddings
correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_senseval3:
    correct,count = fun_lesk_own_embeddings(data_dict_senseval3[i][0],data_dict_senseval3[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using our own embeddings on senseval3 with parameter change -",accuracy*100)

Accuracy using our own embeddings on senseval3 with parameter change - 45.69809376609995


In [67]:
#Semcor computation - own embeddings
correct =0
count = 0
correct_total = 0
count_total = 0

for i in index_list:
    correct,count = fun_lesk_own_embeddings(data_dict_semcor[i][0],data_dict_semcor[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using our own embeddings on semcor with parameter change-",accuracy*100)

Accuracy using our own embeddings on semcor with parameter change- 48.35903120696026


## 5. Extend the word embedding model to also use character-based representations, e.g. fastText or flair embeddings

### Implementing FastText embeddings

In [68]:
#Import fastText
from gensim.models.fasttext import FastText

#Specifying the required parameters
embedding_size = 300
window_size = 6
min_word = 1
down_sampling = 1e-2
 
#FastText model, with parameters. Train the model on all the sentences in all datasets
model_fast = FastText(sentences_full,
                      vector_size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      workers = cores,
                      sg=1)

#Call baseline func using this model 

# Evaluate on all datasets

In [69]:
#The function used is same as baseline function
#Only the model used differs for word embeddings

def fun_lesk_fastText(sentence,sub_dict):
    
    correct = 0
    count = 0
    score_list = []
    sorted_words = []
    disambiguated = {}
    
    #get all words in a sentence
    tokens = set(word_tokenize(sentence))
    
    #Sort the words in increasing order of number of synsets 
    for w in tokens:
        if len(wordnet.synsets(w))>0:
            sorted_words.append([w,len(wordnet.synsets(w))])

    sorted_words.sort(key = lambda x: x[1])
    

    #for each word
    for word in sorted_words:
        word = word[0]
        
        #for each sense of the word
        for sense in wordnet.synsets(word):
            
            #Finding gloss vector for each synset of a word
            gloss = set(WordPunctTokenizer().tokenize(sense.definition()))
            for ex in sense.examples():
                tok = set(word_tokenize(ex))
                gloss = gloss.union(tok)
             
            gloss = gloss.difference(functionwords)    
            
            context_tokens = []
            embedd = []
            cont_emb = []
            
            #Find fastText embeddings for all words in gloss
            for i in gloss:
                if i in model_fast.wv.key_to_index:
                    embedd.append(model_fast.wv[i])
                else:
                    pass
            
            #gloss vector calculation
            gloss_vec = [sum(vals)/len(embedd) for vals in zip(*embedd)]  

            #To get context embedding for the corresponding word

            sen_tokens = set(word_tokenize(sentence))
            sen_tokens = sen_tokens.difference(functionwords)
            
            #find context words
            for i in sen_tokens:
                if(i!= word):
                    context_tokens.append(i)
           
            #for all context words
            for i in context_tokens :
                
                #if word already has a predicted synset
                if i in disambiguated:
                    pred = disambiguated[i]
                    #find gloss embedding
                    new_gloss = set(WordPunctTokenizer().tokenize(pred.definition()))
                    for ex in pred.examples():
                        tok = set(word_tokenize(ex))
                        new_gloss = new_gloss.union(tok)
            
                    new_gloss = new_gloss.difference(functionwords)    
                    #find fastText embedding for gloss
                    for k in new_gloss: 
                        if k in model_fast.wv.key_to_index:
                            cont_emb.append(model_fast.wv[k])

                #if word is not disambiguated, find fastText embedding of word   
                else:
                    if i in model_fast.wv.key_to_index:

                        cont_emb.append(model_fast.wv[i])

            #context vector for the given word
            context_vec = [sum(vals)/len(cont_emb) for vals in zip(*cont_emb)]   

            #cosine of gloss and context vectors - first part
            if(len(gloss)== 0 or len(cont_emb)== 0 or len(embedd)==0):#null check
                sim1 = 0
                
            else:
                sim1 = distance.cosine(gloss_vec,context_vec)
    
    #getting lexeme embedding - same as baseline
            if(word in sub_dict):
                #lexeme format from sub dict
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]
                format = lemma+"-wn-2.1-"+str(sense.offset()).zfill(8)+"-"+pos
                

                #extract corresponding lexeme embedding from lexeme.txt file
                # and stored as dictionary dict with key and value
                if((format in dict) and (len(cont_emb)!=0)):
                    lexeme_embed = dict[format]

                    #cosine of lexeme embeddings and context vector
                    sim2 = distance.cosine(lexeme_embed,context_vec)
                else:
                    sim2 = 0
            else:
                    sim2 = 0
            #score value
            score = sim1 + sim2 
           
            score_list.append(score)
           
        
        if(len(wordnet.synsets(word))!=0):
            #Find predicted synset
            max_score_index = np.argmax(score_list)
            pred_synset = wordnet.synsets(word)[max_score_index]
            disambiguated[word] = pred_synset
            
           
            if word not in sub_dict and  word+'.' in sub_dict:
                word=word+'.'
            
            if word in sub_dict:
                #Find true synset
                wnsn = sub_dict[word].split(',')[2]
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]

                true_syn = lemma+"."+pos+".0"+wnsn
               

                true_synset_arr=true_syn.split(".")
                predicted_synset_arr=pred_synset.name().split(".")

            #If true synset has more than one value - list
                if ';' in true_synset_arr[2]:
                    true_synset_arr_1=true_synset_arr[2].split(";")
                    true_synset_arr_1=[i.zfill(2) for i in true_synset_arr_1]
                else:
                    true_synset_arr_1=true_synset_arr[2]
            
            #comparison of predicted and true synsets for accuracy
                if (predicted_synset_arr[1]==true_synset_arr[1] and predicted_synset_arr[2] in true_synset_arr_1):
                    correct+=1
                    count+=1
                    
                else:
                    count+=1
                    
                score_list = []
            else:
                score_list = []
        else:
            score_list = []
        
    
    #return the number of matching words and total words considered   
    return correct, count

### Evaluation on 3 datasets

#### Steps for evaluation remain the same as baseline, only the function called differs

In [70]:
#Senseval2 computation - fastText embeddings

correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_senseval2:
    correct,count = fun_lesk_fastText(data_dict_senseval2[i][0],data_dict_senseval2[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using fastText embeddings on senseval2 -",accuracy*100)

Accuracy using fastText embeddings on senseval2 - 39.78947368421053


In [71]:
#Senseval3 computation - fastText embeddings

correct =0
count = 0
correct_total = 0
count_total = 0
for i in data_dict_senseval3:
    correct,count = fun_lesk_fastText(data_dict_senseval3[i][0],data_dict_senseval3[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using fastText embeddings on senseval3 -",accuracy*100)

Accuracy using fastText embeddings on senseval3 - 33.69397217928903


In [72]:
#Semcor computation - fastText embeddings

correct =0
count = 0
correct_total = 0
count_total = 0
for i in index_list:
    correct,count = fun_lesk_fastText(data_dict_semcor[i][0],data_dict_semcor[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using fastText embeddings on semcor -",accuracy*100)

Accuracy using fastText embeddings on semcor - 37.159461184453626


## 6. Use transformers and sentence embeddings to compare a sentence and a gloss. E.g. you could use the SBERT pre-trained models. Use a part of the data to tune the transformer and classification model

In [5]:
#Import required libraries
from sentence_transformers import SentenceTransformer, models
from torch import nn


#word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
#pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
#dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=300, activation_function=nn.Tanh())


#Sentence embedding pretrained model
#model_sbert = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])


#Pretrained SBERT embedding model
model_sbert = SentenceTransformer('all-MiniLM-L6-v2')


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [32]:
#Function to extend baseline to incorporate sentence embeddings in gloss and context using sbert model
#Input and output of function remains the same as baseline

def fun_lesk_sbert(sentence,sub_dict):
    
    
    correct = 0
    count = 0
    score_list = []
    sorted_words = []
    disambiguated = {}
    
    #get all words in a sentence
    tokens = set(word_tokenize(sentence))
    
    #Sort the words in increasing order of number of synsets
    for w in tokens:
        if len(wordnet.synsets(w))>0:
            sorted_words.append([w,len(wordnet.synsets(w))])

    sorted_words.sort(key = lambda x: x[1])
    
 
    #for each word
    for word in sorted_words:
        word = word[0]
        
        #for each sense of the word
        for sense in wordnet.synsets(word):
            gloss_arr = []
            cont_emb = []
            context_tokens = []
            
            #Finding gloss vector for each synset of a word
            
            #Sentence embedding for sense definition
            gloss_arr.append(model_sbert.encode(sense.definition()))
            
            #Sentence embedding for sense examples
            for ex in sense.examples():
                gloss_arr.append(model_sbert.encode(ex))
            
            #gloss vector calculation - average of the sentence embeddings
            gloss_vec = [sum(vals)/len(gloss_arr) for vals in zip(*gloss_arr)]  
            
            
            #To get context embedding for the corresponding word
            #Same steps as that of baseline
            sen_tokens = set(word_tokenize(sentence))
            sen_tokens = sen_tokens.difference(functionwords)
            
            
            #get context tokens
            for i in sen_tokens:
                if(i!= word):
                    context_tokens.append(i)
           
            
            #context embedding for the sentence(only context tokens)
            cont_sentence = ""
            for t in context_tokens:
                cont_sentence += t + " "
            cont_emb_sbert = model_sbert.encode(cont_sentence)

              

            #cosine of gloss and context vectors of sentences - first part
            if(len(gloss_arr)== 0 or len(cont_emb_sbert)== 0): #null check
                sim1 = 0
               
            else:
                sim1 = distance.cosine(gloss_vec,cont_emb_sbert)
    
            
        #context vector for the given word same as baseline - for second score
            for i in context_tokens :
               
                #if word already has a predicted synset
                if i in disambiguated:
                    pred = disambiguated[i]
                    
                    #find gloss embedding
                    new_gloss = set(WordPunctTokenizer().tokenize(pred.definition()))
                    for ex in pred.examples():
                        tok = set(word_tokenize(ex))
                        new_gloss = new_gloss.union(tok)
            
                    new_gloss = new_gloss.difference(functionwords)    
                    #find word embedding for gloss
                    for k in new_gloss: 
                        if k in model.key_to_index:
                            cont_emb.append(model[k])
                
                #if word is not disambiguated, find word embedding from word2vec model   
                else:
                    if i in model.key_to_index:
                        cont_emb.append(model[i])
            context_vec = [sum(vals)/len(cont_emb) for vals in zip(*cont_emb)] 
   
        #Get lexeme embedding for word - same as baseline
            
            if(word in sub_dict):
                #lexeme format from sub dict
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]
                format = lemma+"-wn-2.1-"+str(sense.offset()).zfill(8)+"-"+pos
                

                #extract corresponding lexeme embedding from lexeme.txt file
                # and stored as dictionary dict with key and value
                if((format in dict) and (len(cont_emb)!=0)):
                    lexeme_embed = dict[format]

                    #cosine of lexeme embeddings and context vector - second score
                    sim2 = distance.cosine(lexeme_embed,context_vec)
                else:
                    sim2 = 0
            else:
                    sim2 = 0
            # score value
            score = sim1 + sim2 
          
            score_list.append(score)
            
        if(len(wordnet.synsets(word))!=0):
            #Get predicted synset
            
            max_score_index = np.argmax(score_list)
            pred_synset = wordnet.synsets(word)[max_score_index]
            disambiguated[word] = pred_synset
            
           
            if word not in sub_dict and  word+'.' in sub_dict:              
                word=word+'.'
            
            if word in sub_dict:
               #Get true synset
            
                wnsn = sub_dict[word].split(',')[2]
                lemma = sub_dict[word].split(',')[0]
                pos = sub_dict[word].split(',')[1]

                true_syn = lemma+"."+pos+".0"+wnsn
             
                true_synset_arr=true_syn.split(".")
                predicted_synset_arr=pred_synset.name().split(".")

            #If more than 1 true synset exists -> list
                if ';' in true_synset_arr[2]:
                    true_synset_arr_1=true_synset_arr[2].split(";")
                    true_synset_arr_1=[i.zfill(2) for i in true_synset_arr_1]
                else:
                    true_synset_arr_1=true_synset_arr[2]
            
            #Comparison of predicted and true synsets for accuracy calculation
                if (predicted_synset_arr[1]==true_synset_arr[1] and predicted_synset_arr[2] in true_synset_arr_1):
                    correct+=1
                    count+=1
                    
                else:
                    count+=1
                    
                score_list = []
            else:
                score_list = []
        else:
            score_list = []
            
    #Returns the number of matching words and the total number of words considered    
    return correct, count

### Evaluation on datasets

#### Evaluation steps are same as done for baseline. Only the function called differs

In [59]:
#Senseval2 computation - sbert embeddings

correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_senseval2:

    correct,count = fun_lesk_sbert(data_dict_senseval2[i][0],data_dict_senseval2[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using sbert embeddings on senseval2 -",accuracy*100)

Accuracy using sbert embeddings on senseval2 - 35.45263157894737


In [34]:
#Senseval3 computation - sbert embeddings

correct =0
count = 0
correct_total = 0
count_total = 0

for i in data_dict_senseval3:
    correct,count = fun_lesk_sbert(data_dict_senseval3[i][0],data_dict_senseval3[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using sbert embeddings on senseval3 -",accuracy*100)

Accuracy using sbert embeddings on senseval3 - 34.82740855229263


In [38]:
#Semcor computation - sbert embeddings

correct =0
count = 0
correct_total = 0
count_total = 0

#Running for 5000 sentences was taking more than 24 hours, hence I am selecting only 500 sentences
#Evaluation is done only on 500 sentences of Semcor
new_index_list = index_list[0:500]
for i in new_index_list:
    correct,count = fun_lesk_sbert(data_dict_semcor[i][0],data_dict_semcor[i][1])
    count_total += count
    correct_total += correct
    
accuracy = correct_total/count_total
print("Accuracy using sbert embeddings on semcor -",accuracy*100)

Accuracy using sbert embeddings on semcor - 36.19737750172533
