<a href="https://colab.research.google.com/github/prayas99/Word-Sense-Disambiguation-Python/blob/main/Word_Sense_Disambiguation_OverlapBased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download pre-trained word2vec model and enter path of downloaded file

Download the pre-trained word2vec model with:<br>
`!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"`


In [None]:
# Location of the downloaded GoogleNews-vectors-negative300.bin.gz archive.
# Replace the placeholder directory with the real one before loading the model.
your_path_to_saved_model = 'your_parent_directory/GoogleNews-vectors-negative300.bin.gz'

# Pre-trained word2vec model

In [None]:
# Mount Google Drive when running on Colab so that a model stored there can be
# loaded. Outside Colab the `google.colab` package does not exist, so the mount
# is skipped and `your_path_to_saved_model` must point at a local file instead.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

from gensim.models import KeyedVectors

# `your_path_to_saved_model` is defined in the previous cell and must point to
# GoogleNews-vectors-negative300.bin.gz (word2vec binary format).
model_w2v = KeyedVectors.load_word2vec_format(your_path_to_saved_model,
                                              binary=True)

Mounted at /content/drive


# Importing libraries

In [None]:
import time
import numpy as np
from sklearn.metrics import confusion_matrix ,classification_report,accuracy_score,f1_score,precision_score,recall_score
from sklearn.metrics.pairwise import cosine_similarity

# NLTK libraries

In [None]:
import nltk
# WordNet provides the candidate senses and their glosses; SemCor is the
# sense-tagged corpus used for evaluation further down.
nltk.download('wordnet')
nltk.download('semcor')
# Extract the SemCor archive next to the zip (-o overwrites without prompting).
# NOTE(review): the path assumes nltk_data lives under /root, as on Colab.
! unzip -o /root/nltk_data/corpora/semcor.zip -d /root/nltk_data/corpora

# Resources for nltk.word_tokenize / nltk.pos_tag and the English stop-word list.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package semcor to /root/nltk_data...
Archive:  /root/nltk_data/corpora/semcor.zip
   creating: /root/nltk_data/corpora/semcor/
   creating: /root/nltk_data/corpora/semcor/brown1/
   creating: /root/nltk_data/corpora/semcor/brown1/tagfiles/
  inflating: /root/nltk_data/corpora/semcor/brown1/tagfiles/br-a01.xml  
  inflating: /root/nltk_data/corpora/semcor/brown1/tagfiles/br-a02.xml  
  inflating: /root/nltk_data/corpora/semcor/brown1/tagfiles/br-a11.xml  
  inflating: /root/nltk_data/corpora/semcor/brown1/tagfiles/br-a12.xml  
  inflating: /root/nltk_data/corpora/semcor/brown1/tagfiles/br-a13.xml  
  inflating: /root/nltk_data/corpora/semcor/brown1/tagfiles/br-a14.xml  
  inflating: /root/nltk_data/corpora/semcor/brown1/tagfiles/br-a15.xml  
  inflating: /root/nltk_data/corpora/semcor/brown1/tagfiles/br-b13.xml  
  inflating: /root/nltk_data/corpora/semcor/b

True

In [None]:
from nltk.corpus import stopwords
from nltk.corpus import semcor
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet as wn

In [None]:
# Sense-tagged SemCor sentences; both names are bound to the same object.
dataset = tagged_sentences = semcor.tagged_sents(tag='sem')

# Required functions

In [None]:
class Word2VecTransformer(object):
    """Turn token sequences into averaged word-embedding vectors.

    Parameters
    ----------
    word2vec : mapping-like embedding model
        Must support ``token in word2vec``, ``word2vec[token]`` and expose
        ``vector_size`` (e.g. a gensim ``KeyedVectors`` instance).
    stop_words : iterable of str, optional
        Lower-cased tokens to ignore. Defaults to NLTK's English stop-word
        list, which is loaded once and shared by all instances.
    """

    # English stop-word set, filled from NLTK on first use. Reading the list
    # for every token (as the membership test used to do) is very slow.
    _default_stop_words = None

    def __init__(self, word2vec, stop_words=None):
        self.word2vec = word2vec
        if stop_words is None:
            if Word2VecTransformer._default_stop_words is None:
                Word2VecTransformer._default_stop_words = frozenset(
                    stopwords.words('english'))
            stop_words = Word2VecTransformer._default_stop_words
        self.stop_words = frozenset(stop_words)

    def transform(self, X):
        """Return one mean embedding per sentence in ``X``.

        Each sentence is an iterable of string tokens. Tokens are lower-cased;
        stop words and tokens missing from the embedding model are skipped.
        A sentence with no usable token maps to the zero vector.

        Returns
        -------
        numpy.ndarray
            Array with one row per sentence and ``vector_size`` columns.
        """
        zero_vector = np.zeros(self.word2vec.vector_size)
        rows = []
        for sent in X:
            vectors = []
            for token in sent:
                key = token.lower()
                if key in self.stop_words:
                    continue
                # Membership on the model itself is a hash lookup and works on
                # both old and new gensim; scanning `index2word` was a linear
                # search over the whole vocabulary and is gone in gensim 4.
                if key in self.word2vec:
                    vectors.append(self.word2vec[key])
            rows.append(np.mean(vectors, axis=0) if vectors else zero_vector)
        return np.array(rows)
        
def POS_tags(original_tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Tags starting with 'J' map to adjective, 'V' to verb and 'R' to adverb;
    every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wn.ADJ),
        ('V', wn.VERB),
        ('R', wn.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if original_tag.startswith(prefix):
            return wordnet_pos
    return wn.NOUN

def ContextBag(sentence,i,c):
    """Collect the items surrounding position ``i`` of ``sentence``.

    ``c`` is the total context window size: ``int(c/2)`` items are taken on
    each side of the target, fewer when the sentence boundary is reached.
    ``c == -1`` selects every other item of the sentence. The target item
    itself is never included. Items are returned in sentence order.
    """
    take_everything = (c == -1)
    per_side = int(c/2)

    # Upper bounds imposed by the sentence boundaries.
    n_left = max(i, 0)
    n_right = max(len(sentence) - 1 - i, 0)

    # A finite window additionally caps both sides at ``per_side`` items.
    if not take_everything:
        cap = max(per_side, 0)
        n_left = min(n_left, cap)
        n_right = min(n_right, cap)

    left_context = [sentence[pos] for pos in range(i - n_left, i)]
    right_context = [sentence[pos] for pos in range(i + 1, i + 1 + n_right)]
    return left_context + right_context

def obtain_lemma(lemmatizer,text):
    """Tokenize and POS-tag ``text``, returning the lower-cased lemma of each token.

    The POS tag of every token is converted with ``POS_tags`` and passed to
    ``lemmatizer.lemmatize``; a token without a tag is lemmatized with the
    lemmatizer's default part of speech.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return [
        (
            lemmatizer.lemmatize(token, pos=POS_tags(tag))
            if tag is not None
            else lemmatizer.lemmatize(token)
        ).lower()
        for token, tag in tagged_tokens
    ]

def _chunk_text_and_gold(chunk):
    """Return ``(text, gold)`` for one element of an input sentence.

    ``text`` is the string to tokenize for this element and ``gold`` is the
    gold sense label, or ``None`` when the element carries no annotation.
    """
    if isinstance(chunk, nltk.tree.Tree):
        label = chunk.label()
        if isinstance(label, str):
            # The label is already a sense string; the first child is the word.
            # A nested tree as first child is flattened to its leaves.
            first = chunk[0]
            text = first if isinstance(first, str) else ' '.join(first.leaves())
            return text, label
        # Otherwise the label is expected to expose name() and synset().
        return label.name(), label.synset().name()
    if isinstance(chunk, str):
        return chunk, None
    # Untagged chunk given as a sequence of string tokens.
    return ' '.join(chunk), None


def overlap_model(model,dataset,context_size=-1,n_best=1):
    """Disambiguate every token of ``dataset`` by gloss/context embedding similarity.

    For each token, the mean embedding of its context words is compared (cosine
    similarity) with the mean embedding of each candidate WordNet gloss; the
    best-scoring synset is the prediction.

    Parameters
    ----------
    model : embedding model accepted by ``Word2VecTransformer``.
    dataset : sized iterable of sentences
        Each sentence is an iterable whose elements are plain string tokens,
        sequences of string tokens, or ``nltk.tree.Tree`` chunks whose label is
        a sense string or a WordNet lemma object. Each element is tokenized on
        its own so that gold labels stay aligned with the resulting tokens.
    context_size : int
        Total context window passed to ``ContextBag``; ``-1`` uses the whole
        sentence.
    n_best : int
        Accepted for interface compatibility; only the single best sense is
        reported.

    Returns
    -------
    ``(true_labels, pred_labels, total_words)`` when the dataset contains gold
    annotations: both label lists hold synset names, are aligned index by
    index, and cover the annotated tokens that have at least one candidate.
    ``(pred_labels, total_words)`` otherwise: one display string
    ``'Synset_id: ... | Gloss : ...'`` per token with at least one candidate.
    ``total_words`` counts every token processed.
    """
    start = time.time()
    data_len = len(dataset)
    # Built once: neither object depends on the sentence being processed.
    lemmatizer = WordNetLemmatizer()
    vectorizer = Word2VecTransformer(model)
    gloss_vectors = {}  # synset name -> gloss embedding, reused across tokens
    records = []        # (gold label or None, synset name, definition)
    saw_gold = False
    done_sent = 0
    total_words = 0

    for sentence in dataset:
        tokens = []
        gold_labels = []  # parallel to `tokens`
        for chunk in sentence:
            text, gold = _chunk_text_and_gold(chunk)
            if gold is not None:
                saw_gold = True
            chunk_tokens = [w.lower() for w in nltk.word_tokenize(text)]
            if not chunk_tokens:
                continue
            tokens.extend(chunk_tokens)
            # The annotation belongs to the chunk as a whole; attach it to the
            # first token only so that each annotation is scored exactly once.
            gold_labels.append(gold)
            gold_labels.extend([None] * (len(chunk_tokens) - 1))

        W = [[lemmatizer.lemmatize(tok, pos=POS_tags(tag)), tag]
             for tok, tag in nltk.pos_tag(tokens)]

        for i, (target_word, tag) in enumerate(W):
            total_words += 1
            candidates = wn.synsets(target_word, POS_tags(tag))
            if not candidates:
                continue
            context = [pair[0] for pair in ContextBag(W, i, context_size)]
            V_C = vectorizer.transform([context])
            best_synset = None
            best_score = None
            for synset in candidates:
                name = synset.name()
                if name not in gloss_vectors:
                    signature = obtain_lemma(lemmatizer, synset.definition())
                    gloss_vectors[name] = vectorizer.transform([signature])
                # cosine_similarity returns a 1x1 matrix; index it explicitly
                # rather than converting a whole array to float.
                score = float(cosine_similarity(V_C, gloss_vectors[name])[0, 0])
                # Strict '>' keeps the first candidate on ties.
                if best_score is None or score > best_score:
                    best_synset, best_score = synset, score
            records.append((gold_labels[i], best_synset.name(), best_synset.definition()))

        done_sent += 1
        # Integer test for "percentage is a multiple of 10"; avoids comparing
        # a floating-point remainder with zero.
        if (done_sent * 10) % data_len == 0:
            later = time.time()
            print('{} % Done | Time Elapsed : {} sec'.format(((done_sent / data_len)*100), int(later-start)))

    if saw_gold:
        scored = [(gold, name) for gold, name, _ in records if gold is not None]
        true_labels = [gold for gold, _ in scored]
        pred_labels = [name for _, name in scored]
        return true_labels, pred_labels, total_words
    pred_labels = ['Synset_id: {} | Gloss : {}'.format(name, definition)
                   for _, name, definition in records]
    return pred_labels, total_words

# Testing

In [None]:
# Evaluate on the first 10 sense-tagged SemCor sentences only.
test_dataset = semcor.tagged_sents(tag='sem')[:10]

In [None]:
# Unpacks the three-value return used for sense-tagged input.
# NOTE(review): n_best is passed here but overlap_model never reads it.
true_labels, pred_labels, total_words = overlap_model(model_w2v,test_dataset,n_best=1)

10.0 % Done | Time Elapsed : 9 sec
20.0 % Done | Time Elapsed : 30 sec
30.0 % Done | Time Elapsed : 45 sec
40.0 % Done | Time Elapsed : 59 sec
50.0 % Done | Time Elapsed : 74 sec
60.0 % Done | Time Elapsed : 86 sec
70.0 % Done | Time Elapsed : 114 sec
80.0 % Done | Time Elapsed : 126 sec
90.0 % Done | Time Elapsed : 135 sec
100.0 % Done | Time Elapsed : 146 sec


# Results

In [None]:
# Total number of tokens the model processed (with or without a prediction)
total_words

2336

In [None]:
# Fraction of returned predictions that exactly match the gold label.
# This value is used as the precision in the F1 computation further down.
acc = accuracy_score(np.array(true_labels),np.array(pred_labels))
acc

0.35625

In [None]:
# Number of correct predictions, recovered from the accuracy (a float, not an int)
correct = acc*(len(true_labels))

In [None]:
# Number of predictions returned by the model
len(pred_labels)

2080

In [None]:
# Recall score: correct predictions divided by ALL tokens processed,
# including tokens for which no prediction was returned.
rec_sc = correct/total_words

In [None]:
# F1 score: harmonic mean of precision (acc) and recall (rec_sc).
# When both are zero the harmonic mean is undefined; report 0.0 instead of
# dividing by zero.
2*rec_sc*acc / (rec_sc + acc) if (rec_sc + acc) else 0.0

0.3355978260869565

In [None]:
# Precision / recall / F1 per sense.
# Many senses occur only in the gold labels or only in the predictions, which
# leaves some scores undefined; zero_division=0 reports those as 0 explicitly
# instead of emitting an undefined-metric warning for each of them.
print(classification_report(np.array(true_labels),np.array(pred_labels),digits=4,zero_division=0))

                            precision    recall  f1-score   support

          abnormality.n.04     0.0000    0.0000    0.0000         2
              abridge.v.01     0.0000    0.0000    0.0000         0
               accept.v.02     0.0000    0.0000    0.0000         0
             accepted.s.00     0.0000    0.0000    0.0000         1
              achieve.v.01     1.0000    1.0000    1.0000         1
                  act.n.01     0.0000    0.0000    0.0000         0
                  act.v.01     0.0000    0.0000    0.0000         1
       administration.n.01     0.0000    0.0000    0.0000         1
       administration.n.02     0.0000    0.0000    0.0000         0
                adopt.v.01     0.0000    0.0000    0.0000         1
               agency.n.01     0.0000    0.0000    0.0000         1
               allege.v.01     0.0000    0.0000    0.0000         4
            ambiguous.a.02     0.0000    0.0000    0.0000         1
              analyze.v.01     0.0000    0.0000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Testing on given sents

In [None]:
# Example sentences using "bank" and "school" in two different senses each.
test_sents = [
    'I went to the bank to withdraw some money.',
    'I went to the bank to have a bath in the river.',
    'The school is seeing return of students',
    'The school of fish is swimming past the island.',
]

In [None]:
# 'punkt' is also downloaded in the NLTK setup cell near the top; repeating the
# call here only reports that the package is already up to date.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Tokenize each example sentence into a list of word strings.
tokenized_sents = [word_tokenize(i) for i in test_sents]

# Disambiguate one sentence at a time. Plain token lists carry no gold labels,
# so overlap_model returns only (pred_labels, total_words) here.
# Note: this rebinds pred_labels / total_words from the evaluation cells above.
for sent in tokenized_sents:
  print(' '.join(sent))
  pred_labels, total_words = overlap_model(model_w2v,[sent],n_best=1)  
  print(*pred_labels,sep='\n')
  print()

I went to the bank to withdraw some money .
100.0 % Done | Time Elapsed : 15 sec
Synset_id: one.s.01 | Gloss : used of a single unit or thing; not two or more
Synset_id: travel.v.01 | Gloss : change location; move, travel, or proceed, also metaphorically
Synset_id: depository_financial_institution.n.01 | Gloss : a financial institution that accepts deposits and channels the money into lending activities
Synset_id: swallow.v.05 | Gloss : take back what one has said
Synset_id: money.n.03 | Gloss : the official currency issued by a government or national bank

I went to the bank to have a bath in the river .
100.0 % Done | Time Elapsed : 28 sec
Synset_id: one.s.01 | Gloss : used of a single unit or thing; not two or more
Synset_id: rifle.v.02 | Gloss : go through in search of something; search through someone's belongings in an unauthorized way
Synset_id: bank.n.01 | Gloss : sloping land (especially the slope beside a body of water)
Synset_id: receive.v.01 | Gloss : get something; come in

# User input

In [None]:
# 'punkt' and word_tokenize are already set up earlier in the notebook;
# repeating the call only reports that the package is already up to date.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Read one sentence from the user; input() already returns a string.
given_sent = [input()]

I went to a bank which is situated on the river bank to withdraw money


In [None]:
# Tokenize the sentence entered by the user.
tokenized_sents = [word_tokenize(i) for i in given_sent]

# Plain token lists carry no gold labels, so overlap_model returns only
# (pred_labels, total_words); one prediction line is printed per token for
# which a sense was found.
for sent in tokenized_sents:
  print(' '.join(sent))
  pred_labels, total_words = overlap_model(model_w2v,[sent],n_best=1)
  print(*pred_labels,sep='\n')

I went to a bank which is situated on the river bank to withdraw money
100.0 % Done | Time Elapsed : 28 sec
Synset_id: one.s.01 | Gloss : used of a single unit or thing; not two or more
Synset_id: travel.v.01 | Gloss : change location; move, travel, or proceed, also metaphorically
Synset_id: ampere.n.02 | Gloss : the basic unit of electric current adopted under the Systeme International d'Unites
Synset_id: depository_financial_institution.n.01 | Gloss : a financial institution that accepts deposits and channels the money into lending activities
Synset_id: be.v.05 | Gloss : happen, occur, take place; this was during the visit to my parents' house"
Synset_id: situate.v.02 | Gloss : put (something somewhere) firmly
Synset_id: river.n.01 | Gloss : a large natural stream of water (larger than a creek)
Synset_id: depository_financial_institution.n.01 | Gloss : a financial institution that accepts deposits and channels the money into lending activities
Synset_id: swallow.v.05 | Gloss : take b