In [13]:

import numpy as np
import itertools
import matplotlib
import nltk
import random
import math
import matplotlib.pyplot as plt
import statistics
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from nltk.tree import Tree
from nltk.corpus import semcor
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [162]:



# Getting the brown corpus
print("Downloading files from NLTK please wait...")
nltk.download('all', quiet=True)
print("NLTK files downloaded!")



# Task: Tag words with synset ID
# Three types of tags in semcor
# 1) No tag
# 2) Tagged with a lemma of synset: A lemma is a data structure which contains synset along with its synset_id, hypernymy, hyponymy,... 
# 3) Tagged with lemma but the synset ID doesn't exist in wordnet
#    One such example is the 7th sentence in the semcor corpus
#    The grand jury commented on a number of other topics, among them the Atlanta and Fulton County purchasing departments which it said are 
#    well operated and follow generally accepted practices which inure to the best interest of both governments.
#                                       ^______^  "accepted" is tagged as "accepted.s.00" lemma and this doesn't have a synsetID in wordnet
#    All the three cases are handled by the code

# Each sentence has a pair of phrase and (representation of)synset
# Each Phrase has one or more words(e.g. "primary election" in first sentence of corpora)
# Each synset is respresented as a Tree e.g. for "primary election"
# Lemma('primary.n.01.primary_election')
#                 /\
#                /  \
#               /    \
#         primary   election
# This example will be used later to explain code
print("Preprocessing the semcor corpus...")
synCorpus=[]
current_index=0
for taggedSents in semcor.tagged_sents(tag='sem'): # for each sentence fetch the (phrase,synsetTree)
    synCorpus.append([])                           # create a list designating current sentence which will contain (phrase,synsetID)
    for phrase in taggedSents:
        if type(phrase)==list:                     # The phrase with no tag is a list in semcor [Note: CASE 1]
            #if its a list fetch the word and set -1 as synset ID which indicates no synset TAG
            # synCorpus[current_index].append((phrase[0],-1))  # a tuple is formed
            pass
        else: # if synset is a Tree [Note Case 2 and 3]
            # Here when theres a synset not in wordnet, theres no synset id[Case 3], to handle this
            # except block is used
            # The try except will get the synset ID and in case no synsetID -2 is given to such a case
            try: 
                # handle Case 2
                if phrase.label().name() == 'strong':
                    print(taggedSents)
                synCorpus[current_index].append((phrase.label().name(),phrase.label().synset().name())) 
            except:
                # handle Case 3
                #synTag_for_corpus=-2 # -2 indicates no synset tags for the specific POS but available for some POS
                pass
    current_index+=1 # Keeps tract of the sentence index
no_sents=current_index #Saved total number of sentences
# The reason for storing as tuples is because the brown corpus stored it in a same way and this way we have to make minimal changed to HMM-Viterbi POS tag code if any :)
print("Semcor preprocessing finished")


Downloading files from NLTK please wait...
NLTK files downloaded!
Preprocessing the semcor corpus...
[Tree(Lemma('georgia.n.01.Georgia'), ['Georgia']), Tree(Lemma('republican.n.01.Republican'), ['Republicans']), ['are'], Tree(Lemma('receive.v.02.get'), ['getting']), Tree(Lemma('potent.s.02.strong'), ['strong']), Tree(Lemma('boost.n.01.encouragement'), ['encouragement']), ['to'], Tree(Lemma('enroll.v.01.enter'), ['enter']), ['a'], Tree(Lemma('campaigner.n.01.candidate'), ['candidate']), ['in'], ['the'], ['1962'], Tree(Lemma('governor's_race.n.01.governor's_race'), ['governor', "'s", 'race']), [','], ['a'], Tree('top.s.00', ['top']), Tree(Lemma('official.n.01.official'), ['official']), Tree(Lemma('state.v.01.say'), ['said']), Tree(Lemma('wednesday.n.01.Wednesday'), ['Wednesday']), ['.']]
[Tree(Lemma('person.n.01.person'), [Tree('NE', ['Whitey', 'Herzog'])]), [','], Tree(Lemma('perform.v.02.perform'), ['performing']), ['in'], Tree(Lemma('right_field.n.01.right'), ['right']), ['as'], ['the

[['This'], Tree(Lemma('kind.n.01.kind'), ['kind']), ['of'], Tree(Lemma('irresponsibility.n.01.irresponsibility'), ['irresponsibility']), ['toward'], ['their'], Tree(Lemma('student.n.01.student'), ['students']), ['can'], Tree(Lemma('barely.r.01.scarcely'), ['scarcely']), Tree(Lemma('build_up.v.02.build'), ['build']), ['a'], Tree(Lemma('strong.a.01.strong'), ['strong']), Tree(Lemma('professional.a.03.professional'), ['professional']), Tree(Lemma('attitude.n.01.attitude'), ['attitude']), ['in'], ['the'], Tree(Lemma('future.s.03.future'), ['future']), Tree(Lemma('interior_designer.n.01.designer'), ['designer']), ['.']]
[['The'], Tree(Lemma('last.a.02.last'), ['last']), Tree(Lemma('two.s.01.two'), ['two']), Tree(Lemma('writer.n.01.writer'), ['writers']), Tree(Lemma('insert.v.01.introduce'), ['introduce']), Tree(Lemma('strong.s.02.strong'), ['strong']), Tree(Lemma('political.a.02.political'), ['political']), Tree(Lemma('bias.n.01.bias'), ['bias']), ['into'], ['their'], Tree(Lemma('work.n.02.

[['We'], Tree(Lemma('comment.v.01.point_out'), ['pointed', 'out']), ['that'], Tree(Lemma('emotional.a.01.emotional'), ['emotional']), Tree(Lemma('excitation.n.03.excitement'), ['excitement']), ['may'], Tree(Lemma('leave.v.07.lead'), ['lead']), ['to'], Tree(Lemma('psychosomatic_disorder.n.01.psychosomatic_disorder'), ['psychosomatic', 'disorders']), ['and'], Tree(Lemma('neurotic.a.01.neurotic'), ['neurotic']), Tree(Lemma('symptom.n.01.symptom'), ['symptoms']), [','], Tree(Lemma('particularly.r.01.particularly'), ['particularly']), ['in'], Tree(Lemma('certain.s.01.certain'), ['certain']), Tree(Lemma('type.n.01.type'), ['types']), ['of'], Tree(Lemma('personality.n.01.personality'), ['personality']), [','], ['but'], ['it'], ['is'], Tree(Lemma('besides.r.02.also'), ['also']), Tree(Lemma('known.a.01.known'), ['known']), ['that'], ['the'], Tree(Lemma('reliving.n.01.reliving'), ['reliving']), ['of'], ['a'], Tree(Lemma('strong.a.01.strong'), ['strong']), Tree(Lemma('emotion.n.01.emotion'), ['em

[['The'], Tree(Lemma('doctor.n.01.doctor'), ['doctors']), ['had'], Tree(Lemma('indicate.v.05.suggest'), ['suggested']), Tree(Lemma('person.n.01.person'), [Tree('NE', ['Scotty'])]), Tree(Lemma('stay.v.01.remain'), ['remain']), Tree(Lemma('about.r.07.most'), ['most']), ['of'], Tree('every.s.02', ['every']), Tree(Lemma('afternoon.n.01.afternoon'), ['afternoon']), ['in'], Tree(Lemma('bed.n.01.bed'), ['bed']), ['until'], ['he'], Tree(Lemma('be.v.01.be'), ['was']), Tree(Lemma('strong.a.01.strong'), ['stronger']), ['.']]
[['He'], ["'ll"], Tree(Lemma('be.v.01.be'), ['be']), Tree(Lemma('strong.a.01.strong'), ['stronger']), Tree(Lemma('soon.r.01.soon'), ['soon']), ["''"], ['.']]
[['He'], Tree(Lemma('state.v.01.say'), ['said']), Tree(Lemma('fussily.r.01.fussily'), ['fussily']), [','], ['``'], Tree(Lemma('merely.r.01.just'), ['Just']), Tree(Lemma('keep.v.01.keep'), ['keep']), ['the'], Tree(Lemma('cap.n.02.cap'), ['cap']), ['on'], ['those'], Tree(Lemma('strong.s.02.strong'), ['strong']), Tree(Lemma

[['He'], Tree(Lemma('feel.v.06.feel'), ['felt']), ['a'], Tree(Lemma('puppyish.s.01.puppyish'), ['puppyish']), Tree(Lemma('motivation.n.01.need'), ['need']), ['for'], Tree(Lemma('company.n.03.company'), ['company']), ['as'], Tree(Lemma('strong.a.01.strong'), ['strong']), ['as'], ['his'], Tree('earlier.s.00', ['earlier']), Tree(Lemma('necessity.n.01.necessity'), ['necessity']), ['for'], Tree(Lemma('lull.n.02.quiet'), ['quiet']), ['.']]
[Tree(Lemma('again.r.01.again'), ['Again']), [','], ['a'], Tree(Lemma('force.n.02.force'), ['force']), Tree(Lemma('excessively.r.01.too'), ['too']), Tree(Lemma('strong.a.01.strong'), ['strong']), ['for'], Tree(Lemma('unfrozen.a.01.unfrozen'), ['unfrozen']), Tree(Lemma('body.n.01.body'), ['bodies']), ['to'], Tree(Lemma('weather.v.01.endure'), ['endure']), ['would'], ['be'], Tree(Lemma('use.v.01.apply'), ['applied']), ['.']]
[['That'], ['he'], Tree(Lemma('master.v.01.master'), ['mastered']), ['every'], Tree(Lemma('aspect.n.01.aspect'), ['aspect']), ['of'], [

[Tree(Lemma('then.r.03.then'), ['Then']), Tree(Lemma('suddenly.r.01.suddenly'), ['suddenly']), ['we'], Tree(Lemma('find.v.13.find'), ['found']), ['ourselves'], ['in'], ['the'], Tree(Lemma('center.n.01.middle'), ['middle']), ['of'], ['another'], Tree(Lemma('fight.n.02.fight'), ['fight']), [','], ['an'], Tree(Lemma('irrational.a.01.irrational'), ['irrational']), [','], ['an'], Tree(Lemma('indecent.s.01.indecent'), ['indecent']), [','], ['an'], Tree(Lemma('undeclared.a.01.undeclared'), ['undeclared']), ['and'], Tree(Lemma('base.s.04.immoral'), ['immoral']), Tree(Lemma('war.n.01.war'), ['war']), ['with'], ['our'], Tree(Lemma('potent.s.02.strong'), ['strongest']), ['('], ['and'], ['some'], ['had'], Tree(Lemma('think.v.01.think'), ['thought']), Tree('noble.s.00', ['noblest']), [')'], Tree(Lemma('ally.n.01.ally'), ['ally']), ['.']]
[Tree(Lemma('woman.n.01.woman'), ['Women']), ['themselves'], ['have'], Tree(Lemma('come.v.04.come'), ['come']), ['to'], Tree(Lemma('think_of.v.03.look_upon'), ['lo

[['He'], ['could'], Tree('move.v.3;1', ['move']), Tree(Lemma('very.r.01.very'), ['very']), Tree(Lemma('quickly.r.01.quickly'), ['quickly']), [','], ['she'], Tree(Lemma('know.v.01.know'), ['knew']), ['('], ['although'], ['he'], Tree(Lemma('rarely.r.01.seldom'), ['seldom']), Tree(Lemma('find.v.01.find'), ['found']), Tree(Lemma('occasion.n.03.occasion'), ['occasion']), ['to'], Tree(Lemma('perform.v.01.do'), ['do']), Tree(Lemma('so.r.03.so'), ['so']), [')'], [','], ['but'], ['he'], Tree(Lemma('be.v.01.be'), ['was']), Tree(Lemma('more.r.01.more'), ['more']), Tree(Lemma('stringy.s.01.wiry'), ['wiry']), ['than'], Tree(Lemma('truly.r.01.truly'), ['truly']), Tree(Lemma('strong.a.01.strong'), ['strong']), ['.']]
[['He'], Tree(Lemma('immediately.r.01.immediately'), ['immediately']), Tree(Lemma('ride.v.01.ride'), ['rode']), Tree(Lemma('along.r.01.on'), ['on']), ['to'], Tree(Lemma('cheyenne.n.01.Cheyenne'), ['Cheyenne']), [','], Tree(Lemma('hold.v.03.throw'), ['threw']), ['a'], Tree(Lemma('ten.s.01

In [4]:
stop_words = stopwords.words('english')

In [8]:
!wget -c \https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\

--2021-09-07 08:42:10--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.85.253
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.85.253|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [10]:
model_w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',
binary=True)

In [107]:
model_w2v["blue"]

array([ 0.0390625 ,  0.08642578,  0.22363281,  0.11865234, -0.08642578,
       -0.25      ,  0.00558472, -0.24511719,  0.05981445,  0.03027344,
        0.11083984,  0.10009766, -0.00842285, -0.02404785, -0.328125  ,
       -0.12597656, -0.20507812,  0.28710938, -0.22265625, -0.24902344,
       -0.05639648,  0.16796875, -0.11572266, -0.39453125, -0.15332031,
        0.07568359, -0.10986328,  0.09423828,  0.14941406, -0.13378906,
        0.0703125 ,  0.11181641,  0.16992188, -0.02868652,  0.07128906,
       -0.03955078,  0.125     ,  0.09765625, -0.20605469,  0.05737305,
        0.16894531, -0.0625    , -0.02050781, -0.21289062,  0.23242188,
        0.02099609, -0.06591797, -0.14941406,  0.10986328,  0.14160156,
       -0.10791016,  0.171875  ,  0.14941406,  0.02685547,  0.10302734,
       -0.05224609, -0.02392578, -0.09716797, -0.11962891, -0.12597656,
       -0.28710938,  0.11914062, -0.03881836, -0.01000977,  0.11962891,
        0.046875  , -0.31445312,  0.13085938, -0.0025177 ,  0.25

In [None]:
text_tokens = word_tokenize(text)

tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]


In [None]:
gloss = word_tokenize(wn.synset('state.v.01').definition())
        words_in_gloss = [word for word in gloss if not word in stopwords.words()]
        word_embeddings=[]
        for word in words_in_gloss:
            word_embeddings.model_w2v[word]
        word_embedding_con
        for word_context,tag_context in sent:
            if(tag_amb!=tag_context):
                pass

In [61]:
len([])

0

In [None]:
s

In [58]:
np.sum(model_w2v[word_context.split('_')],axis=0).shape

(300,)

In [67]:
synCorpus_words=[]
line=0
for sent in synCorpus:
    synCorpus_words.append([])
    for (word,tag) in sent:
        synCorpus_words[line].append(word)
    line+=1

[['group',
  'say',
  'Friday',
  'investigation',
  'Atlanta',
  'recent',
  'primary_election',
  'produce',
  'evidence',
  'irregularity',
  'take_place'],
 ['jury',
  'far',
  'say',
  'term',
  'end',
  'presentment',
  'group',
  'have',
  'overall',
  'charge',
  'election',
  'deserve',
  'praise',
  'thanks',
  'location',
  'manner',
  'election',
  'conduct'],
 ['September',
  'October',
  'term',
  'jury',
  'charge',
  'location',
  'person',
  'investigate',
  'report',
  'possible',
  'irregularity',
  'hard-fought',
  'primary',
  'win',
  'person'],
 ['only',
  'relative',
  'handful',
  'report',
  'receive',
  'jury',
  'say',
  'consider',
  'widespread',
  'interest',
  'election',
  'number',
  'voter',
  'size',
  'city'],
 ['jury',
  'say',
  'find',
  'Georgia',
  'registration',
  'election',
  'law',
  'be',
  'outmoded',
  'inadequate',
  'often',
  'ambiguous'],
 ['recommend',
  'location',
  'legislator',
  'act',
  'have',
  'law',
  'study',
  'revise',

In [332]:
length_of_wordvec=len(model_w2v["the"])
context_bag_vector = []
for sent in synCorpus_words:
    context_count=0
    context_list=np.zeros((length_of_wordvec,))
    #for every sentence get the context bag
    single_sentence='_'.join(word for word in sent)
    single_sentence=single_sentence.replace('-','_')
    splits=single_sentence.split('_')
    for word in splits:
        try:
            context_list+=model_w2v[word]
            context_count+=1
        except:
            print(word)
    if context_count!=0:
        context_bag_vector.append(context_list/context_count)
    else:
        context_bag_vector.append(context_list)
    #words_in_sentence=[word for word in splits if not word in stopwords.words()]


of
and
of
governor's
100
30
and
50
10
29
13th
a
17
13
of
1000
12
22
of
24
12
30
24
of
of
of
of
13
10
27
22
22
to
15
a
a
22
of
a
27
26
14
15
13
26
20
10
13
of
10
12
16
40
30
22
22
30
and
22
a
30
19
12
17
a
of
26
12
15
27
25th
12
a
40
of
40
beardown
25
25
30
19th
15
and
20
15
a
of
12
of
of
of
11
of
19
16
of
a
12
of

a
25
a
plunker
10
10
a
a
10
24
10
60
10
lustre
and
of
malposed


catalogue
to
catalogue
a
a
bull's
and
bull's
to
frowningly

of
15
52
20
a
30
50
of
and
100
of
of
50
18
60
500
and
of
of
to
of
to
23
20th
of
a
of
of
of
addlebrained
errantry
of
a
26
a
a
of
a
of
operagoer
12
microphoning
12
a
40
i.e.
100
to
10
to
a
of
and

to
to
unstilted
a
of
a
a
and
of
presentness
to
of
Klux
givenness
of
demythologization
expurgation
conceptuality
kerygma
and
conceptuality
demythologized
demythologization
to
a
demythologized
kerygma
kerygma
of
of
to
to
to
demythologization
demythologization
demythologization
a
demythologization
of
18
25
19th
Tractarian
of
27
of
of
20
to
nonchurchgoing
50
of
19th

of
of
reprovingly
of
a
of

and
incertain

nighted





bewilderedly


of
of
of
mobcap
and
a
and
of
and
a
to
of
a

a
of
of
a
escritoire
of
uncousinly
of
and
escritoire
whisky
whisky
a
a
of









axe
of
to
whisky
whisky


whisky
axe
axe
whisky

whisky

whisky

whisky
axe
and
whisky


and
and
Year's
of
a
to
a
of
a
uncombable
a
to
of
of
of
a
and
to
a

of

a
of
a
a
to
and
patronne
revery
lappet
patronne
patronne
enquire
patronne
sforzando
a
and
cackly
and

to
a
a
clothesbrush

of
to
a

a
catalogue
unsloped
epistolatory
enrol
schoolgirlish
Year's
of
of
Year's
traveller
reveller
and
Year's
a
Year's
a
of
a
a
a
to
a

of
of
a
and
and

to
a
and
worshipper

to
to
a
of
to


of
of

discorporate
to
and
and

to



a

matsyendra
of
to


to
and
of
of
of
a

of
earthman
earthman
of
quirt
to



quirt
to

and
quirt
quirt
a
of
quirt
quirt
a
and

quirt
of
of
a
to

of
a
signore
to
to


and

to
of
soutane
a
amphibology
parisology
gagwriter
misrelated
headlinese
entendre
to
a
of
of
indefinity
entendre
of
a
of






to
of
of




harshen

















of


to


to







to










to










of




to

woolgather







of

of


In [340]:
overlapCorpus=[]
count=0
test=0
test1=0
for sent in synCorpus_words:
    overlapCorpus.append([])
    for word in sent:
        synsets_word=wn.synsets(word)
#         amb_list=np.zeros((length_of_wordvec,))
#         amb_count=0
        curr_sim=-1
        curr_synset=synsets_word[0]
        for synset_word in synsets_word:
            amb_list=np.zeros((length_of_wordvec,))
            amb_count=0
            synset_gloss=synset_word.definition()
            ########TEST
#             synset_examples=synset_word.examples()
#             synset_examples_combined='_'.join(ex for ex in synset_examples)
#             synset_gloss='_'.join(ex for ex in synset_examples_combined+synset_def)
            
            ###########TEST
            splits=synset_gloss.split()
#             print(splits)
            for word1 in splits:
                try:
                    if(word1!=word):
                        amb_list+=model_w2v[word1]
                        amb_count+=1
                except:
                    pass
            if amb_count!=0:
                amb_vec=(amb_list/amb_count)
            else:
                amb_vec=amb_list
            cos_similarity=model_w2v.cosine_similarities(context_bag_vector[count].T,[amb_vec.T])[0]
            if curr_sim<cos_similarity:
                curr_sim=cos_similarity
                curr_synset=synset_word
        overlapCorpus[count].append((word,curr_synset.name()))
    count+=1

In [346]:
cat=model_w2v["lungs"]
dog=model_w2v["physics"]
model_w2v.cosine_similarities(cat,[dog])[0]

0.0892538

In [333]:
overlapCorpus

[[('group', 'group.n.01'),
  ('say', 'say.n.01'),
  ('Friday', 'friday.n.01'),
  ('investigation', 'probe.n.01'),
  ('Atlanta', 'atlanta.n.01'),
  ('recent', 'holocene.n.01'),
  ('primary_election', 'primary.n.01'),
  ('produce', 'produce.n.01'),
  ('evidence', 'evidence.n.01'),
  ('irregularity', 'abnormality.n.04'),
  ('take_place', 'happen.v.01')],
 [('jury', 'jury.n.01'),
  ('far', 'army_for_the_liberation_of_rwanda.n.01'),
  ('say', 'say.n.01'),
  ('term', 'term.n.01'),
  ('end', 'end.n.01'),
  ('presentment', 'presentment.n.01'),
  ('group', 'group.n.01'),
  ('have', 'rich_person.n.01'),
  ('overall', 'overall.n.01'),
  ('charge', 'charge.n.01'),
  ('election', 'election.n.01'),
  ('deserve', 'deserve.v.01'),
  ('praise', 'praise.n.01'),
  ('thanks', 'thanks.n.01'),
  ('location', 'location.n.01'),
  ('manner', 'manner.n.01'),
  ('election', 'election.n.01'),
  ('conduct', 'behavior.n.01')],
 [('September', 'september.n.01'),
  ('October', 'october.n.01'),
  ('term', 'term.n.01')

In [341]:
total_count=0
correct_count=0
current_sent=0
for sent in synCorpus:
    current_word=0
    for (word,tag) in sent:
        if tag==overlapCorpus[current_sent][current_word][1]:
            correct_count+=1
#         else:
#              print("Sentenceno: ",current_sent," Word: ",word," Actual Tag:",tag," Predicted Tag:", overlapCorpus[current_sent][current_word][1])
        total_count+=1
        current_word+=1
    current_sent+=1    
print("Overlapping Accuracy: ",correct_count*100/total_count,'%')

Overlapping Accuracy:  34.609462610584025 %


In [342]:
total_count=0
correct_count=0
current_sent=0
for sent in synCorpus:
    current_word=0
    for (word,tag) in sent:
        if tag==overlapCorpus[current_sent][current_word][1]:
            correct_count+=1
#         else:
#             print("Sentenceno: ",current_sent," Word: ",word," Actual Tag:",tag," Predicted Tag:", overlapCorpus[current_sent][current_word][1])
        total_count+=1
        current_word+=1
    current_sent+=1    
print("Overlapping Accuracy: ",correct_count*100/total_count,'%')

Overlapping Accuracy:  34.609462610584025 %


In [294]:
semcor.tagged_sents(tag='sem')[2438]

[['If'],
 [','],
 ['as'],
 ['a'],
 Tree(Lemma('home_movie.n.01.home_movie'), ['home', 'movie']),
 Tree(Lemma('maker.n.01.maker'), ['maker']),
 [','],
 ['you'],
 Tree(Lemma('film.v.01.shoot'), ['shoot']),
 ['the'],
 Tree(Lemma('inevitable.s.02.inevitable'), ['inevitable']),
 Tree(Lemma('footage.n.01.footage'), ['footage']),
 ['of'],
 ['your'],
 Tree(Lemma('child.n.02.child'), ['child']),
 Tree(Lemma('take.v.01.take'), ['taking']),
 ['its'],
 Tree(Lemma('first.a.01.first'), ['first']),
 Tree(Lemma('steps.n.02.steps'), ['steps']),
 [','],
 ['you'],
 ['have'],
 Tree(Lemma('merely.r.01.merely'), ['merely']),
 Tree(Lemma('record.v.01.record'), ['recorded']),
 ['an'],
 Tree(Lemma('historic.s.01.historical'), ['historical']),
 Tree(Lemma('event.n.01.event'), ['event']),
 ['.']]

In [131]:
        
#             try:
#                 amb_list+=model_w2v[splits]
#                 amb_count+=1
#             except:
#                 pass
#             if amb_count!=0:
#                 amb_vec=(amb_list/amb_count)
#             else:
#                 amv_vec=amb_list
#                 test+=1
#             print(model_w2v.cosine_similarities(context_bag_vector[0],[amb_list/amb_count]))

'order.v.01'

In [328]:
wn.synset('record.v.01').definition()

'make a record of; set down in permanent form'

In [64]:
length_of_wordvec=len(model_w2v["the"])
predictions = []
for sent in synCorpus:
    context_list=np.zeros((length_of_wordvec,))
    for word_context,tag_context in sent:
        word_without_underscore=word_context.split('_')
        word_without_underscore_stopwords=[word for word in word_without_underscore if not word in stopwords.words()]
        try:
            context_list+=np.sum(model_w2v[word_without_underscore_stopwords],axis=0)
        except:
            pass
    context_list/=length_of_wordvec
    

KeyboardInterrupt: 

In [None]:
for word_amb,tag_amb in sent:
        senses_list_amb=wn.synsets(word_amb)
        for sense in senses_list:
            sense.definition()
                

In [None]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

In [6]:
predictions = []

for sent in corpus:
	test_sentence = []
	sentence_predictions = []
	
	for wd in sent:
		test_sentence.append(wd[0])
	
	for wd in sent:
		tag_pred_sim = dict()
		# test_word = 'play'
		# test_tag = 'play.v.01'
		test_word = wd[0]
		test_tag = wd[1]

		test_sentence_set = {word for word in test_sentence if not word in stopwords}
		test_sentence_set = test_sentence_set - set([wd])

		test_pred_similarity = []
		for item in test_sentence_set:
			v_wd = word_vectors[wd]
			v_item = word_vectors[item]
			cos_sim = cosine_similarity([v_wd],[v_item])
			test_pred_similarity.append(cos_sim)
		test_max_similarity = max(test_pred_similarity)

		tag_pred_sim.update({test_tag:test_max_similarity})

		test_sentence_set = {word for word in test_sentence if not word in stopwords}
		test_sentence_set = test_sentence_set - set([wd])

		word_synsets = wordnet.synsets(test_word)

		tag_list = []
		definition_list = []

		for i in range(len(word_synsets)):
			tag_list.append(wordnet.synsets(test_word)[i].name())

		if len(tag_list) == 1:
			sentence_predictions.append((test_word,test_tag))
		else:
			for i in range(len(tag_list)):
				definition_list.append(wordnet.synset(tag_list[i]).definition())
				current_definition = word_tokenize(definition)
				current_definition_set = {word for word in current_definition if not word in stopwords}
				temp_similarity = []
				for item in current_definition_set:
					v_wd = word_vectors[wd]
					v_item = word_vectors[item]
					cos_sim = cosine_similarity([v_wd],[v_item])
					temp_similarity.append(cos_sim)
				temp_max_similarity = max(temp_similarity) 
				if test_tag == tag_list[i]:
					if tag_pred_sim[test_tag] < temp_max_similarity:
						tag_pred_sim.update({test_tag:temp_max_similarity})
				else:
					tag_pred_sim.update({tag_list[i]:temp_max_similarity})
		keymax = max(zip(tag_pred_sim.values(), tag_pred_sim.keys()))[1]
		sentence_predictions.append((test_word,keymax))

	predictions.append(sentence_predictions)

TypeError: argument of type 'WordListCorpusReader' is not iterable