#### 1. Write a function word_freq() that takes a word and the name of a section of the Brown Corpus as arguments, and computes the frequency of the word in that section of the corpus.


In [1]:
import nltk
from nltk.corpus import brown
from nltk.probability import FreqDist

In [2]:
def word_freq(word, sec):
    """
    Input:
        word (str): word for which frequency count is to be calculated.
        sec (str): a category of brown corpus.
    Returns:
        num (int): frequency of `word` in `sec` section of brown corpus.
    """
    freq_dist = FreqDist(brown.words(categories=sec))
    num = freq_dist[word]
    return num

In [3]:
word_freq('India', 'news')

5

#### 2. Write a program to guess the number of syllables contained in a text, making use of the CMU Pronouncing Dictionary.

In [4]:
def get_syllables(txt):
    """
    Input:
        txt (list): A list of tokens
    Returns:
        syll_num (int): Return the number of syllables in `txt`.
    """
    cmud = nltk.corpus.cmudict.dict()
    syll_text = []
    for word in txt:
        syll_text.extend(cmud[word.lower()][0])
    syll_num = len(syll_text)
    return syll_num

In [5]:
txt = ["Luke", "I'm", "your", "father"]
print(get_syllables(txt))

12


#### 3. Write a function that finds the 50 most frequently occurring words of a text that are not stopwords.

In [6]:
def most_freq_non_stopwords(text, n=50):
    """
    Input:
        text (list): A list of tokens
    Returns:
        int (list): Return most frequent non-stopwords in `text`.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    allWordExceptStopDist = nltk.FreqDist(w.lower() for w in text if w not in stopwords)
    most_freq = allWordExceptStopDist.most_common(n)
    return [x for (x, y) in most_freq]

In [7]:
print(most_freq_non_stopwords(brown.words(categories='news')))

[',', '.', 'the', '``', "''", 'said', ';', '--', 'mrs.', 'would', 'new', 'one', 'he', 'i', 'last', 'two', ')', 'mr.', '(', 'first', 'state', ':', 'year', 'president', 'a', 'home', 'also', 'in', 'it', 'but', 'made', 'time', 'years', 'three', 'house', 'week', 'city', 'may', '?', 'school', 'could', 'four', 'day', 'committee', 'man', 'members', 'back', 'government', 'many', 'national']


#### 4. Write a Python NLTK program to compare the similarity of two given verbs.

In [8]:
from nltk.corpus import wordnet

def calc_similarity(verb1, verb2):
    """
    Input:
        verb1 (str): verb 1
        verb2 (str): verb 2
    Returns:
        sim (float): The extent of similarity between the two words.
    """
    v1 = wordnet.synset(verb1)
    v2 = wordnet.synset(verb2)
    sim = v1.wup_similarity(v2)
    return sim

In [9]:
print(calc_similarity('sprint.v.01', 'run.v.01'))

0.8571428571428571


#### 5. Define a function find_language() that takes a string as its argument, and returns a list of languages that have that string as a word. Use the udhr corpus and limit your searches to files in the Latin-1 encoding.

In [10]:
from nltk.corpus import udhr

def find_language(string):
    """
    Input:
        string (str): a word in any language
    Returns:
        target_lang (list): List of languages in which the `string` appears.
    """
    languages = [name for name in udhr.fileids() if 'Latin1' in name]
    target_lang = [lang for lang in languages if string in udhr.words(lang)]
    return target_lang

In [11]:
find_language('homme')

['French_Francais-Latin1']

#### 6. The polysemy of a word is the number of senses it has. Using WordNet, we can determine that the noun dog has 7 senses with: len(wn.synsets('dog', 'n')). Compute the average polysemy of nouns, verbs, adjectives and adverbs according to WordNet.

In [12]:
def avg_polysemy(groups):
    """
    Input:
        groups (dict): a dict of wordnet groups and respective object.
    Returns:
        None
    """
    all_synsets_len = len([synst for synst in wordnet.all_synsets()])
    
    for (k, v) in groups.items():
        synset_len = len([synst for synst in wordnet.all_synsets(pos=v)])
        print('{0}\t::\t{1}'.format(k, all_synsets_len/synset_len))

In [13]:
groups = {'Verb': wordnet.VERB, 'Noun': wordnet.NOUN,
          'Adj': wordnet.ADJ, 'Adv': wordnet.ADV}

avg_polysemy(groups)

Verb	::	8.546451659766108
Noun	::	1.4328563599829507
Adj	::	6.480447235073805
Adv	::	32.49351008008837


#### 7. Define a function hedge(text) which processes a text and produces a new version with the word 'like' between every third word.

In [14]:
def hedge(text_list):
    """
    Input:
        text_list (str): list of strings
    Returns:
        text_list (str): list of strings with like inserted after every 3rd 
    """
    
    total = len(text_list)//4
    counter = 1
    for i in range(total):
        text_list.insert(((i+1)*4 - counter), 'LIKES')

    return text_list

In [15]:
text = ['Captain', 'Hans', 'Solo', 'Princess', 'Leia', 'Organa', 'Master', 'Luke', 'Skywalker']

print(hedge(text))

['Captain', 'Hans', 'Solo', 'LIKES', 'Princess', 'Leia', 'Organa', 'LIKES', 'Master', 'Luke', 'Skywalker']
