In [1]:
from collections import Counter
import utils
import numpy as np


def sample_run():
    """
    Show how the test code is expected to call bigram_mle.

    This definition is illustrative only; it is not actually run for the quiz.
    It loads tokens and bigrams from 'transcripts.txt' through the utils helper,
    computes the bigram probability dictionary, and prints it.
    """
    corpus_tokens, corpus_bigrams = utils.bigrams_from_transcript('transcripts.txt')
    print(bigram_mle(corpus_tokens, corpus_bigrams))


def bigram_mle(tokens, bigrams):
    """
    provide a dictionary of probabilities for all bigrams in a corpus of text
    the calculation is based on maximum likelihood estimation and does not include
    any smoothing.  A tag '<unk>' has been added for unknown probabilities.

    Each probability is the exact ratio
        count(first word, second word) / count(first word)
    where count(first word) is taken from ``tokens``.  No epsilon is added to the
    denominator, so the estimates are unbiased (e.g. a bigram seen once after a
    word seen once has probability exactly 1.0).

    :param tokens: list
        tokens: list of all tokens in the corpus
    :param bigrams: list
        bigrams: list of all two word tuples in the corpus
    :return: dict
        bg_mle_dict: a dictionary of bigrams:
            key: tuple of two bigram words, in order OR <unk> key
            value: float probability
        The '<unk>' key is always present with value 0.0.  A bigram whose first
        word never occurs in ``tokens`` has no defined estimate and is also
        given 0.0 instead of raising ZeroDivisionError.

    """
    token_counts = Counter(tokens)
    bigram_counts = Counter(bigrams)

    # '<unk>' goes in first so it stays the leading key of the result.
    bg_mle_dict = {'<unk>': 0.}

    for bigram, bigram_count in bigram_counts.items():
        # Counter returns 0 for a word that is absent from the token list.
        context_count = token_counts[bigram[0]]
        if context_count:
            bg_mle_dict[bigram] = bigram_count / context_count
        else:
            # Inconsistent input: the context word was never seen as a token.
            bg_mle_dict[bigram] = 0.
    return bg_mle_dict

# Build the bigram probability table for the transcript corpus and display it
# under a heading line.
tokens, bigrams = utils.bigrams_from_transcript('transcripts.txt')
bg_dict = bigram_mle(tokens, bigrams)
print("Probability bigram dictionary:", bg_dict, sep="\n")

Probability bigram dictionary:
{'<unk>': 0.0, ('<s>', 'go'): 0.034482758620689655, ('go', 'do'): 0.5, ('do', 'you'): 0.625, ('you', 'hear'): 0.058823529411764705, ('hear', '</s>'): 1.0, ('<s>', 'but'): 0.10344827586206896, ('but', 'in'): 0.2, ('in', 'less'): 0.1111111111111111, ('less', 'than'): 1.0, ('than', 'five'): 0.5, ('five', 'minutes'): 1.0, ('minutes', 'the'): 1.0, ('the', 'staircase'): 0.025, ('staircase', 'groaned'): 1.0, ('groaned', 'beneath'): 1.0, ('beneath', 'an'): 1.0, ('an', 'extraordinary'): 0.5, ('extraordinary', 'weight'): 1.0, ('weight', '</s>'): 1.0, ('<s>', 'at'): 0.034482758620689655, ('at', 'this'): 1.0, ('this', 'moment'): 0.16666666666666666, ('moment', 'the'): 1.0, ('the', 'whole'): 0.025, ('whole', 'soul'): 1.0, ('soul', 'of'): 1.0, ('of', 'the'): 0.3, ('the', 'old'): 0.1, ('old', 'man'): 0.75, ('man', 'seemed'): 0.2, ('seemed', 'centred'): 1.0, ('centred', 'in'): 1.0, ('in', 'his'): 0.1111111111111111, ('his', 'eyes'): 0.14285714285714285, ('eyes', 'which')