In [1]:
import nltk
nltk.download("all",quiet=True)
from nltk.tree import Tree

In [2]:
from nltk.corpus import semcor #for semcor corpus



In [19]:
from nltk.corpus import wordnet as wn


In [43]:
wn.synsets('primary_election')

[Synset('primary.n.01')]

In [22]:
wn.synset('primary.n.01').definition()

'a preliminary election where delegates or nominees are chosen'

In [31]:
dir(semcor.tagged_sents(tag='sem')[0][10].label())

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_frame_ids',
 '_frame_strings',
 '_hypernyms',
 '_instance_hypernyms',
 '_key',
 '_lang',
 '_lex_id',
 '_lexname_index',
 '_name',
 '_related',
 '_synset',
 '_syntactic_marker',
 '_wordnet_corpus_reader',
 'also_sees',
 'antonyms',
 'attributes',
 'causes',
 'count',
 'derivationally_related_forms',
 'entailments',
 'frame_ids',
 'frame_strings',
 'hypernyms',
 'hyponyms',
 'in_region_domains',
 'in_topic_domains',
 'in_usage_domains',
 'instance_hypernyms',
 'instance_hyponyms',
 'key',
 'lang',
 'member_holonyms',
 'member_meronyms',
 'name',
 'part_holonyms',
 'part_meronyms',
 'pertainyms',
 'region_dom

In [39]:
semcor.tagged_sents(tag='sem')[0][10].label().name()

'primary_election'

In [None]:
also_sees',
 'antonyms',
 'attributes',
 'causes',
 'count',
 'derivationally_related_forms',
 'entailments',
 'frame_ids',
 'frame_strings',
 'hypernyms',
 'hyponyms',
 'in_region_domains',
 'in_topic_domains',
 'in_usage_domains',
 'instance_hypernyms',
 'instance_hyponyms',
 'key',
 'lang',
 'member_holonyms',
 'member_meronyms',
 'name',
 'part_holonyms',
 'part_meronyms',
 'pertainyms',
 'region_domains',
 'similar_tos',
 'substance_holonyms',
 'substance_meronyms',
 'synset',
 'syntactic_marker',
 'topic_domains',
 'usage_domains',
 'verb_groups']

In [23]:
# Task: Tag words with synset ID
# Three types of tags in semcor
# 1) No tag
# 2) Tagged with a lemma of synset: A lemma is a data structure which contains synset along with its synset_id, hypernymy, hyponymy,... 
# 3) Tagged with lemma but the synset ID doesn't exist in wordnet
#    One such example is the 7th sentence in the semcor corpus
#    The grand jury commented on a number of other topics, among them the Atlanta and Fulton County purchasing departments which it said are 
#    well operated and follow generally accepted practices which inure to the best interest of both governments.
#                                       ^______^  "accepted" is tagged as "accepted.s.00" lemma and this doesn't have a synsetID in wordnet
#    All the three cases are handled by the code

# Each sentence has a pair of phrase and (representation of)synset
# Each Phrase has one or more words(e.g. "primary election" in first sentence of corpora)
# Each synset is respresented as a Tree e.g. for "primary election"
# Lemma('primary.n.01.primary_election')
#                 /\
#                /  \
#               /    \
#         primary   election
# This example will be used later to explain code
synCorpus=[]
current_index=0
for taggedSents in semcor.tagged_sents(tag='sem'): # for each sentence fetch the (phrase,synsetTree)
    synCorpus.append([])                           # create a list designating current sentence which will contain (phrase,synsetID)
    for phrase in taggedSents:
        if type(phrase)==list:                     # The phrase with no tag is a list in semcor [Note: CASE 1]
            #if its a list fetch the word and set -1 as synset ID which indicates no synset TAG
            synCorpus[current_index].append((phrase[0],-1))  # a tuple is formed
        else: # if synset is a Tree [Note Case 2 and 3]
            # Here when theres a synset not in wordnet, theres no synset id[Case 3], to handle this
            # except block is used
            # The try except will get the synset ID and in case no synsetID -2 is given to such a case
            try: 
                # handle Case 2
                synTag_for_corpus=phrase.label().synset().offset()
            except:
                # handle Case 3
                synTag_for_corpus=-2 # -2 indicates no synset tags for the specific POS but available for some POS
            for words in phrase:
                # Here words will be "primary" and "election" in form of Tree[Referring to the example at the beginning]
                str=""
                for word in words:
                    str+=" "+word
                # e.g. str= "primary election "
                str.strip() # e.g. str= "primary election"
                synText_for_corpus=str
            synCorpus[current_index].append((synText_for_corpus,synTag_for_corpus)) # For case 2 and 3 form a tuple
    current_index+=1 # Keeps tract of the sentence index
no_sents=current_index #Saved total number of sentences
# The reason for storing as tuples is because the brown corpus stored it in a same way and this way we have to make minimal changed to HMM-Viterbi POS tag code if any :)

In [24]:
synCorpus[0]

[('The', -1),
 (' Fulton County Grand Jury', 31264),
 (' s a i d', 1009240),
 (' F r i d a y', 15164463),
 ('an', -1),
 (' i n v e s t i g a t i o n', 5800611),
 ('of', -1),
 (' A t l a n t a', 9076675),
 ("'s", -1),
 (' r e c e n t', 1730444),
 (' e l e c t i o n', 182571),
 (' p r o d u c e d', 2141146),
 ('``', -1),
 ('no', -1),
 (' e v i d e n c e', 5823932),
 ("''", -1),
 ('that', -1),
 ('any', -1),
 (' i r r e g u l a r i t i e s', 737188),
 (' p l a c e', 339934),
 ('.', -1)]