In [1]:
import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("semcor") 
nltk.download("stopwords")
nltk.download('omw-1.4')

from nltk import word_tokenize
from nltk.corpus import semcor 
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

import warnings
import numpy as np
from tqdm.notebook import tqdm
from string import punctuation
from num2words import num2words
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package semcor to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package semcor is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
semcor_tagged_sents = semcor.tagged_sents(tag='sem')
semcor_untagged_sents = semcor.sents()

In [3]:
import gensim.downloader as api

#online loading from api
from gensim.models import KeyedVectors
path = "./GoogleNews-vectors-negative300.bin"
word2vec = KeyedVectors.load_word2vec_format(path, binary=True)

#local loading
# word2vec = api.load('word2vec-google-news-300')

In [4]:
EXTRA_SW = [
    "''",
    "'s",
    "``"
]

SW = stopwords.words("english")
SW += [p for p in punctuation]
SW += EXTRA_SW
# if '(' in SW:
#     print(1)

In [5]:
def sentence_vector(sent):
    sent_vec = np.zeros(300)
    for word in sent:
        if word in word2vec:
            sent_vec += word2vec.get_vector(word)
    return sent_vec  

In [6]:
senses = []
for i in range(len(semcor_tagged_sents)):
    for j in range(len(semcor_tagged_sents[i])):
        if isinstance(semcor_tagged_sents[i][j],nltk.Tree):
            try:
                if semcor_tagged_sents[i][j].height() == 3:
                    for tree in semcor_tagged_sents[i][j]:
                        if(tree.label() == 'NE'):
                            senses.append('NE')
                            
                else:
                    sense = semcor_tagged_sents[i][j].label().synset().name()
                    senses.append(sense)
                    
            except:
                if (semcor_tagged_sents[i][j].label()=='NE'):
                    senses.append('NE')
                else:
                    sense = semcor_tagged_sents[i][j].label()
                    senses.append(sense)
        else:
            senses.append('none')
senses.append('notag')
senses = list(set(senses))
senses = {senses[i]:i for i in range(len(senses))}

In [7]:
def graph_for_sentence(sentence,isTest = False):
    updated_sent = sentence
    G = nx.Graph()
    word_senses = []
    list_of_all = []
    sense_vec_dict = {}
    sense_vec_dict['none'] = []
    sense_vec_dict['none'].append(np.zeros(300))
    for word in updated_sent:
        all_senses = wn.synsets(word)
        
        list_senses = []
        if len(all_senses) == 0:
            # print(word,'1')
            all_senses = ['none']
            list_of_all.append('none')
            list_senses.append((len(list_of_all)-1,'none'))
        else:
            # print(word,'2')
            for sense in all_senses:
                list_of_all.append(sense)
                list_senses.append((len(list_of_all)-1,sense))
                sense_vec_dict[sense] = []
                sense_sent = sense.definition().split()
                sense_sent = [w for w in sense_sent if (w not in SW) and (w.isalnum())]
                sense_vec = sentence_vector(sense_sent)
                sense_vec_dict[sense].append(sense_vec)
        word_senses.append(list_senses)

    for i in range(len(word_senses)-1):
        for j in range(len(word_senses[i])):
            for k in range(len(word_senses[i+1])):
                cos_sim  = cosine_similarity(sense_vec_dict[word_senses[i][j][1]][0].reshape(1,-1),sense_vec_dict[word_senses[i+1][k][1]][0].reshape(1,-1))
                G.add_edge(word_senses[i][j][0], word_senses[i+1][k][0],weight = cos_sim[0][0])
    rank_output = dict()
    try:
        rank_output = nx.pagerank(G,max_iter=1000)
    except:
        for i in range(G.number_of_nodes()):
            rank_output[i] = 0

    new_list = []
    for i in range(len(word_senses)):
        max_val = -1
        max_sense = ''
        for j in range(len(word_senses[i])):
            if rank_output[word_senses[i][j][0]] >= max_val:
                max_val = rank_output[word_senses[i][j][0]]
                max_sense = word_senses[i][j][1]
        if str(max_sense) == 'none':
            new_list.append('none')
        else:
            if isTest:
                new_list.append(max_sense)
            else:
                new_list.append(str(max_sense)[8:-2])
        
    return new_list





    


In [8]:
def predict(tagged_sents,untagged_sents):
    actual = []
    pred = []
    count = 0
    for i in range(len(tagged_sents)):
        sentence = []
        # print(i)
        if len(tagged_sents[i])<=1:
            continue
        for j in range(len(tagged_sents[i])):
            
            word = ''

        
            if isinstance(tagged_sents[i][j],nltk.Tree):
                try:
                    if tagged_sents[i][j].height() == 3:
                        for tree in tagged_sents[i][j]:
                            if(tree.label() == 'NE'):
                                word = "_".join(tree.leaves())
                                actual.append('NE')
                                
                    else:
                       
                        word = "_".join(tagged_sents[i][j].leaves())
                        sense = tagged_sents[i][j].label().synset().name()
                        actual.append(sense)

                except:
                    
                    if (tagged_sents[i][j].label()=='NE'):
                        word = "_".join(tagged_sents[i][j].leaves())
                        actual.append('NE')
                    else:
                        sense = tagged_sents[i][j].label()
                        word = "_".join(tagged_sents[i][j].leaves())
                        actual.append(sense)
            else:
               
                actual.append('none')
                word = "_".join(tagged_sents[i][j])
            sentence.append(word)
        if count%500 == 0:
            print("Steps Completed ",count)
        count += 1
        
        pred += graph_for_sentence(sentence)
    return actual,pred

In [9]:
semcor_tagged_sents = semcor.tagged_sents(tag='sem')
semcor_untagged_sents = semcor.sents()
semcor_pos_tagged_sents = semcor.tagged_sents(tag='pos')


In [10]:
actual,pred = predict(semcor_tagged_sents,semcor_tagged_sents)

Steps Completed  0
Steps Completed  500
Steps Completed  1000
Steps Completed  1500
Steps Completed  2000
Steps Completed  2500
Steps Completed  3000
Steps Completed  3500
Steps Completed  4000
Steps Completed  4500
Steps Completed  5000
Steps Completed  5500
Steps Completed  6000
Steps Completed  6500
Steps Completed  7000
Steps Completed  7500
Steps Completed  8000
Steps Completed  8500
Steps Completed  9000
Steps Completed  9500
Steps Completed  10000
Steps Completed  10500
Steps Completed  11000
Steps Completed  11500
Steps Completed  12000
Steps Completed  12500
Steps Completed  13000
Steps Completed  13500
Steps Completed  14000
Steps Completed  14500
Steps Completed  15000
Steps Completed  15500
Steps Completed  16000
Steps Completed  16500
Steps Completed  17000
Steps Completed  17500
Steps Completed  18000
Steps Completed  18500
Steps Completed  19000
Steps Completed  19500
Steps Completed  20000
Steps Completed  20500
Steps Completed  21000
Steps Completed  21500
Steps Comple

In [11]:
print(len(actual))
print(len(pred))

778579
778579


In [12]:

new_actual = []
new_pred = []
k=0
for i in range(len(actual)):
    if actual[i] == 'none' or actual[i] == 'NE':
        continue
    if actual[i].split('.')[1] == 'n':
        new_actual.append(actual[i])
        new_pred.append(pred[i])

In [13]:
print(len(new_actual))
print(len(new_pred))

79313
79313


In [14]:
y_true_n = [senses[i] for i in new_actual]
y_pred_n = [senses.get(i, senses['none']) for i in new_pred]
y_pred = [senses.get(i, senses['none']) for i in pred]
y_true = [senses[i] for i in actual]


In [15]:
import sklearn.metrics as sm
print("Scores for All tagset")
print(sm.accuracy_score(y_true, y_pred))
print(sm.f1_score(y_true, y_pred, average='weighted'))
print(sm.precision_score(y_true, y_pred,average='weighted'))
print(sm.recall_score(y_true, y_pred,average='weighted'))

print("==="*40)
print('Scores for Nouns')
print(sm.accuracy_score(y_true_n, y_pred_n))
print(sm.f1_score(y_true_n, y_pred_n, average='weighted'))
print(sm.precision_score(y_true_n, y_pred_n,average='weighted'))
print(sm.recall_score(y_true_n, y_pred_n,average='weighted'))




Scores for All tagset
0.5717210456485469
0.6067095872539181
0.6914588874335078
0.5717210456485469
Scores for Nouns
0.3186488974064781
0.3540028532501026
0.5837607117961402
0.3186488974064781


In [16]:
def test_sentence(sentence):
    unc_sen = sentence
    sentence = sentence.split()
    print("==="*40)
    print("Sentence: ",unc_sen)
    print("==="*40)
    sense = graph_for_sentence(sentence,True)
    for i in range(len(sentence)):
    
        if str(sense[i]) == 'none' or str(sense[i]) == 'NE':
            print("Word: ",sentence[i])
            print("Sense: ", sense[i])
        else:
            print("Word: ",sentence[i])
            print("Sense: ", str(sense[i])[8:-2])
            print("Definition of sense: ", sense[i].definition())
        print('---'*20)
    print("==="*40)
        

In [17]:
new_sentence = ' He was right about turning right .'
test_sentence(new_sentence)

Sentence:   He was right about turning right .
Word:  He
Sense:  helium.n.01
Definition of sense:  a very light colorless element that is one of the six inert gasses; the most difficult gas to liquefy; occurs in economically extractable amounts in certain natural gases (as those found in Texas and Kansas)
------------------------------------------------------------
Word:  was
Sense:  be.v.03
Definition of sense:  occupy a certain position or area; be somewhere
------------------------------------------------------------
Word:  right
Sense:  right.n.04
Definition of sense:  those who support political or social or economic conservatism; those who believe that things are better left unchanged
------------------------------------------------------------
Word:  about
Sense:  about.r.04
Definition of sense:  used of movement to or among many different places or in no particular direction
------------------------------------------------------------
Word:  turning
Sense:  turn.v.16
Definition