# Probabilities and Likelihoods with Bigrams

Recall from a previous video that the probability of a series of words can be calculated from the chained probabilities of its history:

<img src = "./assets/1.png">

The probabilities of sequence occurrences in a large textual corpus can be calculated this way and used as a language model to add grammar and contextual knowledge to a speech recognition system. However, there is a prohibitively large number of calculations for all the possible sequences of varying length in a large textual corpus.

To address this problem, we use the Markov Assumption to approximate a sequence probability with a shorter sequence:

<img src = "./assets/2.png">

In the bigram case, the equation reduces to a series of bigram probabilities multiplied together to find the approximate probability for a sentence. A concrete example:

<img src = "./assets/3.png">

We can calculate the probabilities by using counts of the bigrams and individual tokens. The counts are represented below with the c() operator:

<img src = "./assets/4.png">

In Python, the Counter class from the collections module is useful for this task:

<img src = "./assets/5.png">

In [1]:
def sentence_to_bigrams(sentence):
    """
    Wrap a sentence in boundary tags and break it into tokens and bigrams.

    The sentence is lower-cased and split on whitespace; a start tag '<s>' is
    placed before the first word and a stop tag '</s>' after the last one.
    :param sentence: string
    :return: list, list
        sentence_tokens: the tagged, lower-case words in their original order
        sentence_bigrams: every pair of adjacent tokens as a two-word tuple, in order
    """
    words = sentence.lower().split()
    sentence_tokens = ['<s>', *words, '</s>']
    # Pairing the token list with itself shifted by one yields each adjacent pair.
    sentence_bigrams = list(zip(sentence_tokens, sentence_tokens[1:]))
    return sentence_tokens, sentence_bigrams

In [2]:
def bigrams_from_transcript(filename):
    """
    Read a file of sentences and tokenize it into lower-case words and bigrams.

    Every line is passed to sentence_to_bigrams, which adds the start '<s>' and
    stop '</s>' tags, so bigrams are formed within a line and never span two lines.
    :param filename: string
        filename: path to a text file consisting of lines of non-punctuated text; assume one sentence per line
    :return: list, list
        tokens: ordered list of words found in the file
        bigrams: a list of ordered two-word tuples found in the file
    """
    tokens = []
    bigrams = []
    with open(filename, 'r') as f:
        for line in f:
            line_tokens, line_bigrams = sentence_to_bigrams(line)
            # Grow the lists in place: `tokens = tokens + line_tokens` copies the
            # whole accumulated list for every line, which is quadratic in file size.
            tokens.extend(line_tokens)
            bigrams.extend(line_bigrams)
    return tokens, bigrams

In [3]:
# Tokenize the transcript file into a flat token list and a flat bigram list.
token, bigram = bigrams_from_transcript("./transcript.txt")

In [4]:
token

['<s>',
 'go',
 'do',
 'you',
 'hear',
 '</s>',
 '<s>',
 'but',
 'in',
 'less',
 'than',
 'five',
 'minutes',
 'the',
 'staircase',
 'groaned',
 'beneath',
 'an',
 'extraordinary',
 'weight',
 '</s>',
 '<s>',
 'at',
 'this',
 'moment',
 'the',
 'whole',
 'soul',
 'of',
 'the',
 'old',
 'man',
 'seemed',
 'centred',
 'in',
 'his',
 'eyes',
 'which',
 'became',
 'bloodshot',
 'the',
 'veins',
 'of',
 'the',
 'throat',
 'swelled',
 'his',
 'cheeks',
 'and',
 'temples',
 'became',
 'purple',
 'as',
 'though',
 'he',
 'was',
 'struck',
 'with',
 'epilepsy',
 'nothing',
 'was',
 'wanting',
 'to',
 'complete',
 'this',
 'but',
 'the',
 'utterance',
 'of',
 'a',
 'cry',
 '</s>',
 '<s>',
 'and',
 'the',
 'cry',
 'issued',
 'from',
 'his',
 'pores',
 'if',
 'we',
 'may',
 'thus',
 'speak',
 'a',
 'cry',
 'frightful',
 'in',
 'its',
 'silence',
 '</s>',
 '<s>',
 'davrigny',
 'rushed',
 'towards',
 'the',
 'old',
 'man',
 'and',
 'made',
 'him',
 'inhale',
 'a',
 'powerful',
 'restorative',
 '</s>

In [5]:
bigram

[('<s>', 'go'),
 ('go', 'do'),
 ('do', 'you'),
 ('you', 'hear'),
 ('hear', '</s>'),
 ('<s>', 'but'),
 ('but', 'in'),
 ('in', 'less'),
 ('less', 'than'),
 ('than', 'five'),
 ('five', 'minutes'),
 ('minutes', 'the'),
 ('the', 'staircase'),
 ('staircase', 'groaned'),
 ('groaned', 'beneath'),
 ('beneath', 'an'),
 ('an', 'extraordinary'),
 ('extraordinary', 'weight'),
 ('weight', '</s>'),
 ('<s>', 'at'),
 ('at', 'this'),
 ('this', 'moment'),
 ('moment', 'the'),
 ('the', 'whole'),
 ('whole', 'soul'),
 ('soul', 'of'),
 ('of', 'the'),
 ('the', 'old'),
 ('old', 'man'),
 ('man', 'seemed'),
 ('seemed', 'centred'),
 ('centred', 'in'),
 ('in', 'his'),
 ('his', 'eyes'),
 ('eyes', 'which'),
 ('which', 'became'),
 ('became', 'bloodshot'),
 ('bloodshot', 'the'),
 ('the', 'veins'),
 ('veins', 'of'),
 ('of', 'the'),
 ('the', 'throat'),
 ('throat', 'swelled'),
 ('swelled', 'his'),
 ('his', 'cheeks'),
 ('cheeks', 'and'),
 ('and', 'temples'),
 ('temples', 'became'),
 ('became', 'purple'),
 ('purple', 'as'),

In [6]:
from collections import Counter

def bigram_mle(tokens, bigrams):
    """
    Build a lookup of maximum likelihood probabilities for every bigram in a corpus.

    Each bigram's probability is its count divided by the count of its first
    word; no smoothing is applied.  An extra '<unk>' entry with probability 0.0
    is included for unknown bigrams.
    :param tokens: list
        tokens: list of all tokens in the corpus
    :param bigrams: list
        bigrams: list of all two word tuples in the corpus
    :return: dict
        bg_mle_dict: a dictionary of bigrams:
            key: tuple of two bigram words, in order OR <unk> key
            value: float probability

    """
    first_word_totals = Counter(tokens)
    pair_totals = Counter(bigrams)

    # Seed with the unknown entry first so it stays the leading key.
    bg_mle_dict = {'<unk>': 0.}
    bg_mle_dict.update(
        (pair, pair_count / first_word_totals[pair[0]])
        for pair, pair_count in pair_totals.items()
    )
    return bg_mle_dict

In [7]:
# Estimate unsmoothed bigram probabilities from the transcript's tokens and bigrams.
# NOTE(review): "likelihoo" in the variable name looks like a typo for "likelihood";
# renaming it would also require updating the later cell that displays it.
bigram_maximum_likelihoo_estimation = bigram_mle(token, bigram)

In [8]:
bigram_maximum_likelihoo_estimation

{'<unk>': 0.0,
 ('<s>', 'go'): 0.034482758620689655,
 ('go', 'do'): 0.5,
 ('do', 'you'): 0.625,
 ('you', 'hear'): 0.058823529411764705,
 ('hear', '</s>'): 1.0,
 ('<s>', 'but'): 0.10344827586206896,
 ('but', 'in'): 0.2,
 ('in', 'less'): 0.1111111111111111,
 ('less', 'than'): 1.0,
 ('than', 'five'): 0.5,
 ('five', 'minutes'): 1.0,
 ('minutes', 'the'): 1.0,
 ('the', 'staircase'): 0.025,
 ('staircase', 'groaned'): 1.0,
 ('groaned', 'beneath'): 1.0,
 ('beneath', 'an'): 1.0,
 ('an', 'extraordinary'): 0.5,
 ('extraordinary', 'weight'): 1.0,
 ('weight', '</s>'): 1.0,
 ('<s>', 'at'): 0.034482758620689655,
 ('at', 'this'): 1.0,
 ('this', 'moment'): 0.16666666666666666,
 ('moment', 'the'): 1.0,
 ('the', 'whole'): 0.025,
 ('whole', 'soul'): 1.0,
 ('soul', 'of'): 1.0,
 ('of', 'the'): 0.3,
 ('the', 'old'): 0.1,
 ('old', 'man'): 0.75,
 ('man', 'seemed'): 0.2,
 ('seemed', 'centred'): 1.0,
 ('centred', 'in'): 1.0,
 ('in', 'his'): 0.1111111111111111,
 ('his', 'eyes'): 0.14285714285714285,
 ('eyes', 'whi