In [None]:
# Python dictionaries data type used for mapping between two items(arbitrary types of information)
# called as assocative array, hashmap, map in other programming languages
# An important property of lists is that we can “look up” a particular item by giving its index
# In NLP, look up using word is common, word: pos, word: no. of occurency
# Dictionary lookup: access the entry of a dictionary using a key, Dic[key] = value

In [5]:
# define an empty dictionary add entries (specifying the part-of-speech of some words.)
pos = {}
pos['colourless'] = 'ADJ'
pos['ideas'] = 'N'
pos['sleep'] = 'V'
print('pos:', pos)
print('KeyError: when try to access key which is not in dict.')
pos['new']

pos: {'colourless': 'ADJ', 'ideas': 'N', 'sleep': 'V'}
KeyError: when try to access key which is not in dict.


KeyError: 'new'

In [7]:
# len() won't work for dictionary, how to check legal keys for dictionary?
# get the dictionary key list.
pos_keylist = list(pos)     
print('pos_keylist: {}'.format(pos_keylist))

# dictionaly methods: pos.keys(), pos.values(), pos.items()
print('pos keys: {}'.format(pos.keys()))
print('pos values: {}'.format(pos.values()))
print('pos items: {}'.format(pos.items()))

pos_list: ['colourless', 'ideas', 'sleep']
pos keys: dict_keys(['colourless', 'ideas', 'sleep'])
pos values: dict_values(['ADJ', 'N', 'V'])
pos items: dict_items([('colourless', 'ADJ'), ('ideas', 'N'), ('sleep', 'V')])


In [16]:
# if more than one value need to be mapped then add as list
pos['sleep'] = ['V', 'N']
print('change in sleep value, pos: {}'.format(pos))

#Define dictionary
pos1 = {'furiously': 'ADV', 'work': 'V'}
pos2 = dict(furiously='ADV', work= 'V')
print('pos1 = {}'.format(pos1))
print('pos2 = {}'.format(pos2))

# Dictionary keys should be immutable, TypeError will be thrown if we try to define a dictionary using a mutable key
pos3 = dict()
pos3[[1, 2]] = ['one', 'two']

change in sleep value, pos: {'colourless': 'ADJ', 'ideas': 'N', 'sleep': ['V', 'N']}
pos1 = {'furiously': 'ADV', 'work': 'V'}
pos2 = {'furiously': 'ADV', 'work': 'V'}


TypeError: unhashable type: 'list'

In [22]:
# Default dictionaries
# When we try to access a key which is not in dictionary we get an error, 
# Python provides an alternative, automatically create an entry with default value(0 or []) for the key which is not present
# In collections module also defaultdict is defined
# nltk provides nltk.defaultdict() to create the dictionary with default value, e.g., int, float, str, list, dict, tuple
# Other than the data types mentioned above we can provide any default value we like-
# - we can provide name of a function without arguments
import nltk
frequency = nltk.defaultdict(int)
frequency['colorless'] = 4
frequency['specs']
frequency['ideas']
print('frequency : {}'.format(frequency.items()))

pos = nltk.defaultdict(list)
pos['sleep'] = ['N', 'V']
pos['ideas']
print('pos : {}'.format(pos))

pos_dd = nltk.defaultdict(lambda: 'N')
pos_dd['colorless'] = 'ADJ'
pos_dd['blog']
print('pos_dd : {}'.format(pos_dd))

frequency : dict_items([('colorless', 4), ('specs', 0), ('ideas', 0)])
pos : defaultdict(<class 'list'>, {'sleep': ['N', 'V'], 'ideas': []})
pos_dd : defaultdict(<function <lambda> at 0x000002A65B929678>, {'colorless': 'ADJ', 'blog': 'N'})


In [48]:
# hapax - word that occurs only once within a context or within a record
# It is difficult to perform many language processing tasks with hapaxes when vocabulary is not fixed
# In below example we are prepocessing text to replace low frequency words with 'UNK' using default dictionay

alice = """Let’s see how default dictionaries could be used in a more substantial language processing
task. Many language processing tasks—including tagging—struggle to correctly
process the hapaxes of a text. They can perform better with a fixed vocabulary
and a guarantee that no new words will appear. We can preprocess a text to replace
low-frequency words with a special “out of vocabulary” token, UNK, with the help of a
default dictionary. (Can you work out how to do this without reading on?)
We need to create a default dictionary that maps each word to its replacement. The
most frequent n words will be mapped to themselves. Everything else will be mapped
to UNK. Two other important word classes are adjectives and adverbs. Adjectives describe
nouns, and can be used as modifiers (e.g., large in the large pizza), or as predicates (e.g.,
the pizza is large). English adjectives can have internal structure (e.g., fall+ing in the
falling stocks). Adverbs modify verbs to specify the time, manner, place, or direction of
the event described by the verb (e.g., quickly in the stocks fell quickly). Adverbs may
also modify adjectives (e.g., really in Mary’s teacher was really nice).
English has several categories of closed class words in addition to prepositions, such
as articles (also often called determiners) (e.g., the, a), modals (e.g., should, may),
and personal pronouns (e.g., she, they). Each dictionary and grammar classifies these
words differently."""
alice = nltk.word_tokenize(alice)
vocab = nltk.FreqDist(alice)
v50 = [word for (word, _) in vocab.most_common(50)]
# defaultdictionary to map each word to its replacement
mapping = nltk.defaultdict(lambda: 'UNX')
for v in v50:
    mapping[v] = v
alice2 = [mapping[v] for v in alice]
for a1, a2 in zip(alice, alice2):
    print('%-15s %-15s'%(a1, a2))

Let             Let            
’               ’              
s               s              
see             see            
how             how            
default         default        
dictionaries    dictionaries   
could           could          
be              be             
used            used           
in              in             
a               a              
more            more           
substantial     UNX            
language        language       
processing      processing     
task            UNX            
.               .              
Many            UNX            
language        language       
processing      processing     
tasks—including UNX            
tagging—struggle UNX            
to              to             
correctly       UNX            
process         UNX            
the             the            
hapaxes         UNX            
of              of             
a               a              
text            text           
.      

In [183]:
# Incrementally Updating a Dictionary
# Dictionaries can be used count the occurances at run time
# In below ex. created a default dictionary, processing the part-of-speech tag in the text
# If tag is not seen before it will have zero count, when tag is encounter we increment its count using += operator
alice = """Let’s see how default dictionaries could be used in a more substantial language processing
task. Many language processing tasks—including tagging—struggle to correctly
process the hapaxes of a text. They can perform better with a fixed vocabulary
and a guarantee that no new words will appear. We can preprocess a text to replace
low-frequency words with a special “out of vocabulary” token, UNK, with the help of a
default dictionary. (Can you work out how to do this without reading on?)
We need to create a default dictionary that maps each word to its replacement. The
most frequent n words will be mapped to themselves. Everything else will be mapped
to UNK. Two other important word classes are adjectives and adverbs. Adjectives describe
nouns, and can be used as modifiers (e.g., large in the large pizza), or as predicates (e.g.,
the pizza is large). English adjectives can have internal structure (e.g., fall+ing in the
falling stocks). Adverbs modify verbs to specify the time, manner, place, or direction of
the event described by the verb (e.g., quickly in the stocks fell quickly). Adverbs may
also modify adjectives (e.g., really in Mary’s teacher was really nice).
English has several categories of closed class words in addition to prepositions, such
as articles (also often called determiners) (e.g., the, a), modals (e.g., should, may),
and personal pronouns (e.g., she, they). Each dictionary and grammar classifies these
words differently."""
from collections import defaultdict
alice = nltk.word_tokenize(alice)
counts = defaultdict(int)
word_list = defaultdict(list)
last_letter = defaultdict(list)
for (word, tag) in nltk.pos_tag(alice):
    counts[tag] += 1
    word_list[tag].append(word)
    last_letter[word[-2:]].append(word)
counts
counts['VB']
len(word_list['VB'])
last_letter['es']

['dictionaries',
 'hapaxes',
 'themselves',
 'classes',
 'adjectives',
 'Adjectives',
 'predicates',
 'adjectives',
 'adjectives',
 'categories',
 'articles',
 'classifies']

In [76]:
# sorting the dictionary by its values
# itemgetter(n) returns a function that can be called on some other sequence object to obtain the nth element
counts = {'VB': 18, 'NNP': 9, 'WRB': 2, 'JJ': 26, 'NNS': 26, 'MD': 12, 'VBN': 5, 'IN': 22, 'DT': 25, 'RBR': 1, 'NN': 37,
          '.': 15, 'VBG': 6, 'TO': 9, 'RB': 11, 'PRP': 7, 'CC': 7, ',': 21, '(': 10, 'RP': 1,  ')': 10, 'VBP': 2, 'WDT': 1,
          'VBZ': 3, 'PRP$': 1, 'RBS': 1, 'CD': 1, 'VBD': 5, 'UH': 1}

#sorted(counts.items())  # same as sorted(counts.items(), key = lambda x: x[0], reverse = False)
#sorted(counts.items(), reverse = True)

#sorted(counts.items(), key = lambda x: x[1], reverse = True)

from operator import itemgetter
sorted(counts.items(), key=itemgetter(1), reverse=True)
[t for t, c in sorted(counts.items(), key=itemgetter(1), reverse=True)]

['NN',
 'JJ',
 'NNS',
 'DT',
 'IN',
 ',',
 'VB',
 '.',
 'MD',
 'RB',
 '(',
 ')',
 'NNP',
 'TO',
 'PRP',
 'CC',
 'VBG',
 'VBN',
 'VBD',
 'VBZ',
 'WRB',
 'VBP',
 'RBR',
 'RP',
 'WDT',
 'PRP$',
 'RBS',
 'CD',
 'UH']

In [85]:
# example to create anagram (words created by re-arranging the letters) dictionary
alice = """Let us see how default dictionaries could be used in a more substantial language processing
task. Many language processing tasks—including tagging—struggle to correctly entrail
process the hapaxes of a text. They can perform better with a fixed vocabulary latrine
and a guarantee that no new words will appear. We can preprocess a text to replace ratline
low-frequency words with a special “out of vocabulary” token, UNK, with the help of a reliant
default dictionary. (Can you work out how to do this without reading on?) retinal
We need to create a default dictionary that maps each word to its replacement. The trenail
most frequent n words will be mapped to themselves. Everything else will be mapped
to UNK. Two other important word classes are adjectives and adverbs. Adjectives describe
nouns, and can be used as modifiers (e.g., large in the large pizza), or as predicates (e.g.,
the pizza is large). English adjectives can have internal structure (e.g., fall+ing in the
falling stocks). Adverbs modify verbs to specify the time, manner, place, or direction of
the event described by the verb (e.g., quickly in the stocks fell quickly). Adverbs may
also modify adjectives (e.g., really in Mary’s teacher was really nice).
English has several categories of closed class words in addition to prepositions, such
as articles (also often called determiners) (e.g., the, a), modals (e.g., should, may),
and personal pronouns (e.g., she, they). Each dictionary and grammar classifies these
words differently."""
alice = nltk.word_tokenize(alice)
anagrams = nltk.defaultdict(list)
for word in alice:
    key = ''.join(sorted(word))
    anagrams[key].append(word)
anagrams['aeilnrt']

# Since accumulating words like this is such a common task, NLTK provides a more convenient way of creating 
# a defaultdict(list), in the form of nltk.Index(), nltk.Index is a defaultdict(list) with extra support for initialization
# Similarly, nltk.FreqDist is essentially a defaultdict(int) with extra support for initialization (along with sorting 
# and plotting methods)
anagrams2 = nltk.Index((''.join(sorted(w)), w) for w in alice)
anagrams2['aeilnrt']

['entrail', 'latrine', 'ratline', 'reliant', 'retinal', 'trenail']

In [146]:
# Complex Keys and Values
# In below ex. find the range of possible tags for a word, when word itself and the tag of the previous word is given
# nested dictionary: dictionary whose default value for an entry is a dictionary (whose default value is int(), i.e., zero
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news')
nes_dic = nltk.defaultdict(lambda: nltk.defaultdict(int))
# iterating over the bigrams of the tagged corpus, processing a pair of word-tag pairs for each iteration
# updating the dictionary’s entry for (t1, w2), a tag and its following word
for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
    nes_dic[(t1, w2)][t2] += 1
    
#1st ('Adverbs', 'NNP') ('modify', 'VBD')  : nes_dic = {('NNP', 'modify') : {'VBD': 1}
#2nd ('modify', 'VBD') ('verbs', 'NNS')    : nes_dic = {('NNP', 'modify') : {'VBD': 1}, ('VBD', 'verbs') : {'NNS': 1}}
#3rd ('verbs', 'NNS') ('to', 'TO')         : nes_dic = {('NNP', 'modify') : {'VBD': 1}, ('VBD', 'verbs') : {'NNS': 1},
#                                                       ('NNS', 'to') : {'TO': 1}}
#4th ('Adverbs', 'NNP') ('modify', 'VBD')  : nes_dic = {('NNP', 'modify') : {'VBD': 2}, ('VBD', 'verbs') : {'NNS': 1},
#                                                       ('NNS', 'to') : {'TO': 1}}
#5th ('Adverbs', 'NNP') ('modify', 'NN')  : nes_dic = {('NNP', 'modify') : {'VBD': 2, 'NN': 1}, ('VBD', 'verbs') : 
#                                                       {'NNS': 1}, ('NNS', 'to') : {'TO': 1}}

for i in range(50,100):
    k = (list(nes_dic.keys()))[i]
    print(k, nes_dic[k])

nes_dic[('AT', 'Fulton')] # defaultdict(int, {'NP-TL': 5, 'NP': 1})
# POS tagger could use this information to decide that the word Fulton, when preceded by a 'AT', 
# should be tagged as 'NP-TL'

('NNS', 'of') defaultdict(<class 'int'>, {'IN': 407})
('NN-TL', 'of') defaultdict(<class 'int'>, {'IN-TL': 119, 'IN': 25})
('IN-TL', 'Atlanta') defaultdict(<class 'int'>, {'NP-TL': 1})
('NP-TL', "''") defaultdict(<class 'int'>, {"''": 7})
("''", 'for') defaultdict(<class 'int'>, {'IN': 3})
('AT', 'manner') defaultdict(<class 'int'>, {'NN': 3})
('NN', 'in') defaultdict(<class 'int'>, {'IN': 580, 'RP': 2})
('IN', 'which') defaultdict(<class 'int'>, {'WDT': 65})
('WDT', 'the') defaultdict(<class 'int'>, {'AT': 30})
('NN', 'was') defaultdict(<class 'int'>, {'BEDZ': 217})
('BEDZ', 'conducted') defaultdict(<class 'int'>, {'VBN': 1})
('VBN', '.') defaultdict(<class 'int'>, {'.': 100})
('AT', 'September-October') defaultdict(<class 'int'>, {'NP': 1})
('NP', 'term') defaultdict(<class 'int'>, {'NN': 1})
('NN', 'jury') defaultdict(<class 'int'>, {'NN': 1})
('NN', 'had') defaultdict(<class 'int'>, {'HVD': 39})
('HVD', 'been') defaultdict(<class 'int'>, {'BEN': 45})
('BEN', 'charged') defaultdict(

defaultdict(int, {'NP-TL': 5, 'NP': 1})

In [178]:
# Inverting a dictionary
# Finding a value for a key is fast and efficient in dictionary, 
# If d is a dictionary and k is a key, type d[k], immediately gives the value
# But finding a key when value is given is slower and more cumbersome
# we need to create new dictionary of value-key pairs from old dictionalry key-value pairs

pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': ['V', 'N'], 'furiously': 'ADV', 'work': 'V'}
pos2 = dict([(tuple(value), key) if isinstance(value, list) else (value, key) for (key, value) in pos.items()])
print("\n***************1st case****************")
print('pos :', pos)
print('pos2 :', pos2)

# pos got updated with some more items, where multiple keys will have the same value
pos.update({'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'})
# here to update pos2 we need to use append method, pos2 is new dictionary
pos2 = nltk.defaultdict(list)
for key, value in pos.items():
    if isinstance(value, list):
        value = tuple(value)
    pos2[value].append(key)
print("\n***************2nd case****************")
print('pos :', pos)
print('pos2 :', pos2)

# we can do inverting dictionary using NLTK's index method
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': ['V', 'N'], 'furiously': 'ADV', 'work': 'V',  
      'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'}
pos2 = nltk.Index((tuple(value), key) if isinstance(value, list) else (value, key) for (key, value) in pos.items())
print("\n***************NLTK 1st case****************")
print('pos :', pos)
print('pos2 :', pos2)


***************1st case****************
pos : {'colorless': 'ADJ', 'ideas': 'N', 'sleep': ['V', 'N'], 'furiously': 'ADV', 'work': 'V'}
pos2 : {'ADJ': 'colorless', 'N': 'ideas', ('V', 'N'): 'sleep', 'ADV': 'furiously', 'V': 'work'}

***************2nd case****************
pos : {'colorless': 'ADJ', 'ideas': 'N', 'sleep': ['V', 'N'], 'furiously': 'ADV', 'work': 'V', 'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'}
pos2 : defaultdict(<class 'list'>, {'ADJ': ['colorless', 'old'], 'N': ['ideas', 'cats'], ('V', 'N'): ['sleep'], 'ADV': ['furiously', 'peacefully'], 'V': ['work', 'scratch']})

***************NLTK 1st case****************
pos : {'colorless': 'ADJ', 'ideas': 'N', 'sleep': ['V', 'N'], 'furiously': 'ADV', 'work': 'V', 'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'}
pos2 : Index(<class 'list'>, {'ADJ': ['colorless', 'old'], 'N': ['ideas', 'cats'], ('V', 'N'): ['sleep'], 'ADV': ['furiously', 'peacefully'], 'V': ['work', 'scratch']})
