In [1]:
!pip3 install nltk==3.6.2

Collecting nltk==3.6.2
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 1.7 MB/s 
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.6.2


In [2]:
import numpy as np
import nltk
from nltk.tree import Tree
from nltk.corpus import semcor
from gensim.models import KeyedVectors
from nltk.corpus import wordnet as wn

# Fetch the NLTK data used by this notebook. Note that 'all' downloads every
# NLTK data package, not only the Brown corpus; the cells below read the
# SemCor corpus (`semcor`) and WordNet (`wn`) imported above.
# quiet=True suppresses the downloader's own progress output, so the two
# print() calls are the only feedback while the download runs.
print("Downloading files from NLTK please wait...")
nltk.download('all', quiet=True)
print("NLTK files downloaded!")


Downloading files from NLTK please wait...
NLTK files downloaded!


In [3]:
# Collect every distinct POS label that occurs in the POS-tagged SemCor
# sentences. The labels are stored as dictionary keys (each mapped to 1), so
# duplicates collapse and first-seen order is kept.
# NOTE: the name shadows the built-in `dict`; it is kept unchanged because the
# next cell displays `dict.keys()`.
dict = {
    chunk.label(): 1
    for tagged_sentence in semcor.tagged_sents(tag='pos')
    for chunk in tagged_sentence
}

In [22]:
# Display the distinct POS labels gathered in the previous cell. Here `dict`
# is the notebook's own dictionary of labels, not the built-in type.
dict.keys()

dict_keys(['DT', 'NNP', 'VB', 'NN', 'IN', 'POS', 'JJ', None, 'RB', 'WDT', 'CC', 'VBD', 'VBN', 'TO', 'PRP', 'MD', 'VBZ', 'PRP$', 'WRB', 'CD', 'EX', 'VBP', 'WP', 'NNS', 'VBG', 'MD|VB', 'NNPS', 'PDT', 'UH', 'WP$', 'LS', 'FW', 'NPS', 'JJR', 'RBR', 'PP', 'RBS', 'NP', 'RP', 'PR', 'JJS', 'NNP|NP', 'NNP|VBN', 'NN|SYM'])

In [4]:
def _sense_triple(sem_chunk, pos_chunk):
    """Turn one SemCor chunk into a (lemma name, synset name, POS label) triple.

    Parameters
    ----------
    sem_chunk : element of a sentence from ``semcor.tagged_sents(tag='sem')``.
    pos_chunk : the element at the same position in the matching sentence
        from ``semcor.tagged_sents(tag='pos')``.

    Returns
    -------
    tuple or None
        ``(lemma_name, synset_name, pos_label)``, or ``None`` when the chunk
        carries no usable sense annotation (cases 1 and 3 below).
    """
    # Case 1: an untagged chunk is a plain list. The exact-type test is
    # deliberate: nltk's Tree subclasses list, so isinstance() would also match
    # (and wrongly skip) every tagged chunk.
    if type(sem_chunk) is list:
        return None
    try:
        # Case 2: the label exposes .name() and .synset() (a WordNet lemma).
        lemma = sem_chunk.label()
        return (lemma.name(), lemma.synset().name(), pos_chunk.label())
    except AttributeError:
        # Case 3: the chunk is tagged but its label does not provide
        # .name()/.synset(), so no synset id can be read from it.
        return None


# synCorpus[i] holds the sense triples of the i-th SemCor sentence, in order.
# The 'sem' and 'pos' views are walked in lockstep so each sense-tagged chunk
# is paired with the POS label found at the same position.
synCorpus = []
for taggedSents, postaggedSents in zip(semcor.tagged_sents(tag='sem'),
                                       semcor.tagged_sents(tag='pos')):
    triples = (_sense_triple(phrase, posphrase)
               for phrase, posphrase in zip(taggedSents, postaggedSents))
    synCorpus.append([triple for triple in triples if triple is not None])

no_sents = len(synCorpus)  # total number of sentences

In [5]:
print("Loading word2vec")
!wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
try:
	model_w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',binary=True)
except:
	print("Download pretrained word2vec from https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz and save in the same directory of the file\nExiting...")
	exit()


Loading word2vec
--2021-11-04 09:56:43--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.11.70
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.11.70|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-11-04 09:57:02 (84.4 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [12]:
import numpy as np
# Silence NumPy's divide-by-zero / invalid-value warnings for the whole
# session: the cosine similarities computed later in this cell may involve
# all-zero vectors, whose result is NaN.
np.seterr(divide='ignore', invalid='ignore')
print("Testing and getting accuracy estimate")

# Split the (lemma, synset name, POS label) triples of every sentence into two
# parallel corpora with identical shape: one of lemmas, one of POS labels.
synCorpus_words = [[lemma for (lemma, _sense, _postag) in sent] for sent in synCorpus]
synCorpus_pos = [[postag for (_lemma, _sense, postag) in sent] for sent in synCorpus]

# Build one context vector per sentence: the average word2vec embedding of
# all tokens in the sentence. The word being disambiguated is NOT removed
# from this bag; it is only skipped later, when the sense (gloss) bag is built.
length_of_wordvec = len(model_w2v["the"])  # embedding dimensionality
context_bag_vector = []
for sent in synCorpus_words:
    context_sum = np.zeros((length_of_wordvec,))
    context_count = 0
    # Multi-word lemmas use '_' and some contain '-'; break both apart so each
    # component word can be looked up on its own.
    for token in '_'.join(sent).replace('-', '_').split('_'):
        try:
            context_sum += model_w2v[token]
        except KeyError:
            # Token is not in the word2vec vocabulary: leave it out of the bag.
            continue
        context_count += 1
    # A sentence with no known token keeps the all-zero vector.
    context_bag_vector.append(context_sum / context_count if context_count else context_sum)


# Penn-style tag fragments and the WordNet POS they select. Order matters: the
# first fragment contained in the tag wins (so e.g. 'NNP|VBN' is a noun).
_TAG_FRAGMENT_TO_WN_POS = (
    ('NN', wn.NOUN),
    ('JJ', wn.ADJ),
    ('RB', wn.ADV),
    ('VB', wn.VERB),
)


def _candidate_synsets(word, pos):
    """Return the WordNet synsets to choose from for `word`.

    All synsets of `word` are the default. If `pos` contains one of the tag
    fragments above, the synsets restricted to that part of speech are used
    instead, unless that restriction leaves nothing, in which case the
    unrestricted list is kept. An empty list means the word is not in WordNet.
    `pos` may be None (the corpus contains chunks without a POS label); that
    is treated as "no POS information" rather than raising a TypeError.
    """
    all_senses = wn.synsets(word)
    if not all_senses:
        return all_senses
    tag = pos or ''
    for fragment, wn_pos in _TAG_FRAGMENT_TO_WN_POS:
        if fragment in tag:
            return wn.synsets(word, pos=wn_pos) or all_senses
    return all_senses


def _sense_bag_vector(synset, target_word):
    """Average word2vec vector of a synset's definition and example tokens.

    Tokens are obtained by whitespace splitting. Tokens equal to `target_word`
    and tokens missing from the word2vec vocabulary are left out. Returns the
    all-zero vector when no token contributes.
    """
    gloss_tokens = synset.definition().split()
    for example in synset.examples():
        # Kept from the original implementation: an example that is exactly
        # one space ends the scan of the remaining examples.
        if example == ' ':
            break
        gloss_tokens.extend(example.split())

    sense_sum = np.zeros((length_of_wordvec,))
    sense_count = 0
    for token in gloss_tokens:
        if token == target_word:
            continue
        try:
            sense_sum += model_w2v[token]
        except KeyError:
            continue  # not in the word2vec vocabulary
        sense_count += 1
    return sense_sum / sense_count if sense_count else sense_sum


# For every word, pick the sense whose gloss bag is most similar (cosine) to
# the context vector of the sentence it occurs in. overlapCorpus[i][j] is the
# (word, predicted synset name) pair for word j of sentence i, so it stays
# index-aligned with synCorpus.
overlapCorpus = []
for sent, posSent, context_vec in zip(synCorpus_words, synCorpus_pos, context_bag_vector):
    sentence_senses = []
    for word, pos in zip(sent, posSent):
        candidates = _candidate_synsets(word, pos)
        if not candidates:
            sentence_senses.append((word, 'NOT IN WORDNET'))
            continue
        # Start from the first listed sense; it is kept when no candidate
        # scores above -1 (including when every similarity is NaN, because
        # comparisons with NaN are False).
        best_synset = candidates[0]
        best_sim = -1
        for candidate in candidates:
            sense_vec = _sense_bag_vector(candidate, word)
            similarity = model_w2v.cosine_similarities(context_vec, [sense_vec])[0]
            if best_sim < similarity:  # strict: the earliest best sense wins ties
                best_sim = similarity
                best_synset = candidate
        sentence_senses.append((word, best_synset.name()))
    overlapCorpus.append(sentence_senses)



Testing and getting accuracy estimate


In [13]:
# Score the predicted senses in overlapCorpus against the gold synset names
# stored in synCorpus. Both structures must be aligned sentence by sentence
# and word by word; the assertions make a misalignment fail loudly instead of
# being silently truncated by zip().
assert len(synCorpus) == len(overlapCorpus), "gold and predicted corpora differ in length"
total_count = 0
correct_count = 0
for gold_sent, predicted_sent in zip(synCorpus, overlapCorpus):
    assert len(gold_sent) == len(predicted_sent), "gold and predicted sentence differ in length"
    for (word, tag, pos), (_predicted_word, predicted_tag) in zip(gold_sent, predicted_sent):
        total_count += 1
        if tag == predicted_tag:
            correct_count += 1

# With nothing to score the accuracy is undefined: report NaN rather than
# raising ZeroDivisionError.
accuracy = correct_count*100/total_count if total_count else float('nan')
print("Accuracy of WSD with overlapping: ",accuracy,'%')

Accuracy of WSD with overlapping:  48.386407732426704 %
