# 1. Extract tokens and bigrams from a sentence
In the quiz below, write a function that returns a list of tokens and a list of bigrams for a given sentence. You will first need to break the sentence into a list of words, then add a `<s>` token to the start of the list and a `</s>` token to the end, to mark the start and end of the sentence.

In [1]:
# Sample sentences for exercising sentence_to_bigrams: the first two contain the
# same words in reverse order, and the third repeats the same three words twice.
test_sentences = [
    'the old man spoke to me',
    'me to spoke man old the',
    'old man me old man me',
]

In [2]:
def sentence_to_bigrams(sentence):
    """
    Tokenize a sentence into lower-case words wrapped in start '<s>' and stop '</s>'
    tags, and pair every token with its successor to form the sentence's bigrams.
    :param sentence: string
    :return: list, list
        sentence_tokens: the tags plus the whitespace-separated, lower-cased words, in order
        sentence_bigrams: ordered two-word tuples of adjacent tokens
    """
    words = sentence.lower().split()
    sentence_tokens = ['<s>', *words, '</s>']
    # Zipping the list against itself shifted by one yields each adjacent pair in order.
    sentence_bigrams = list(zip(sentence_tokens, sentence_tokens[1:]))
    return sentence_tokens, sentence_bigrams

In [3]:
# Tokenize each sample sentence once and print it with its tokens and bigrams.
# The function returns both lists together, so a single call per sentence is enough.
for i_sent in test_sentences:
    tokens, bigrams = sentence_to_bigrams(i_sent)
    print("Sentence: ", i_sent)
    print("Tokens: ", tokens)
    print("Bigrams: ", bigrams)
    print("\n")

Sentence:  the old man spoke to me
Tokens:  ['<s>', 'the', 'old', 'man', 'spoke', 'to', 'me', '</s>']
Bigrams:  [('<s>', 'the'), ('the', 'old'), ('old', 'man'), ('man', 'spoke'), ('spoke', 'to'), ('to', 'me'), ('me', '</s>')]


Sentence:  me to spoke man old the
Tokens:  ['<s>', 'me', 'to', 'spoke', 'man', 'old', 'the', '</s>']
Bigrams:  [('<s>', 'me'), ('me', 'to'), ('to', 'spoke'), ('spoke', 'man'), ('man', 'old'), ('old', 'the'), ('the', '</s>')]


Sentence:  old man me old man me
Tokens:  ['<s>', 'old', 'man', 'me', 'old', 'man', 'me', '</s>']
Bigrams:  [('<s>', 'old'), ('old', 'man'), ('man', 'me'), ('me', 'old'), ('old', 'man'), ('man', 'me'), ('me', '</s>')]




# 2. Calculate probabilities for bigrams
In the quiz below, write a function that returns a probability dictionary when given lists of tokens and bigrams.

In [4]:
# Importing the libraries
from collections import Counter

In [5]:
def bigrams_from_transcript(filename):
    """
    Read a file of sentences and tokenize it into lists of lower-case words and bigrams.

    Every non-blank line is handed to sentence_to_bigrams, which adds the start '<s>'
    and stop '</s>' tags; the per-line results are concatenated in file order.
    Whitespace-only lines are skipped so they do not contribute an empty
    '<s>', '</s>' sentence to the corpus.
    :param filename: string
        filename: path to a text file consisting of lines of non-punctuated text; assume one sentence per line
    :return: list, list
        tokens: ordered list of words found in the file
        bigrams: a list of ordered two-word tuples found in the file
    """
    tokens = []
    bigrams = []
    with open(filename, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            line_tokens, line_bigrams = sentence_to_bigrams(line)
            # Extend in place: rebuilding the lists with `+` would re-copy
            # everything accumulated so far on every line.
            tokens.extend(line_tokens)
            bigrams.extend(line_bigrams)
    return tokens, bigrams

In [6]:
# Load the whole transcript into module-level `tokens` and `bigrams`;
# later cells read these names as the corpus.
tokens, bigrams = bigrams_from_transcript("./transcripts.txt")
print("Tokens: \n\n", tokens, "\n\n\n")
print("Bigrams: \n\n", bigrams)

Tokens: 

 ['<s>', 'go', 'do', 'you', 'hear', '</s>', '<s>', 'but', 'in', 'less', 'than', 'five', 'minutes', 'the', 'staircase', 'groaned', 'beneath', 'an', 'extraordinary', 'weight', '</s>', '<s>', 'at', 'this', 'moment', 'the', 'whole', 'soul', 'of', 'the', 'old', 'man', 'seemed', 'centred', 'in', 'his', 'eyes', 'which', 'became', 'bloodshot', 'the', 'veins', 'of', 'the', 'throat', 'swelled', 'his', 'cheeks', 'and', 'temples', 'became', 'purple', 'as', 'though', 'he', 'was', 'struck', 'with', 'epilepsy', 'nothing', 'was', 'wanting', 'to', 'complete', 'this', 'but', 'the', 'utterance', 'of', 'a', 'cry', '</s>', '<s>', 'and', 'the', 'cry', 'issued', 'from', 'his', 'pores', 'if', 'we', 'may', 'thus', 'speak', 'a', 'cry', 'frightful', 'in', 'its', 'silence', '</s>', '<s>', 'davrigny', 'rushed', 'towards', 'the', 'old', 'man', 'and', 'made', 'him', 'inhale', 'a', 'powerful', 'restorative', '</s>', '<s>', 'davrigny', 'unable', 'to', 'bear', 'the', 'sight', 'of', 'this', 'touching', 'emotio

In [7]:
def sentence_to_bigrams(sentence):
    """
    Tokenize a sentence into lower-case words wrapped in start '<s>' and stop '</s>'
    tags, and pair every token with its successor to form the sentence's bigrams.
    :param sentence: string
    :return: list, list
        sentence_tokens: the tags plus the whitespace-separated, lower-cased words, in order
        sentence_bigrams: ordered two-word tuples of adjacent tokens
    """
    words = sentence.lower().split()
    sentence_tokens = ['<s>', *words, '</s>']
    # Zipping the list against itself shifted by one yields each adjacent pair in order.
    sentence_bigrams = list(zip(sentence_tokens, sentence_tokens[1:]))
    return sentence_tokens, sentence_bigrams


In [8]:
from collections import Counter

def bigram_mle(tokens, bigrams):
    """
    Build a dictionary of unsmoothed maximum-likelihood probabilities for every
    bigram in a corpus: the bigram's count divided by the count of its first word
    among the tokens.  A '<unk>' entry with probability 0.0 is included as the
    placeholder for unknown bigrams.
    :param tokens: list
        tokens: list of all tokens in the corpus
    :param bigrams: list
        bigrams: list of all two word tuples in the corpus
    :return: dict
        bg_mle_dict: a dictionary of bigrams:
            key: tuple of two bigram words, in order OR <unk> key
            value: float probability

    """
    unigram_freq = Counter(tokens)
    bigram_freq = Counter(bigrams)

    # '<unk>' goes in first so it stays the leading key of the result.
    probabilities = {'<unk>': 0.}
    probabilities.update(
        (pair, freq / unigram_freq[pair[0]])
        for pair, freq in bigram_freq.items()
    )
    return probabilities

In [9]:
# Build the unsmoothed bigram probability table from the `tokens` and `bigrams`
# lists currently in scope (the transcript loaded in an earlier cell).
bg_mle_dict = bigram_mle(tokens, bigrams)
# Bare expression so the notebook displays the dictionary as the cell output.
bg_mle_dict

{'<unk>': 0.0,
 ('<s>', 'go'): 0.034482758620689655,
 ('go', 'do'): 0.5,
 ('do', 'you'): 0.625,
 ('you', 'hear'): 0.058823529411764705,
 ('hear', '</s>'): 1.0,
 ('<s>', 'but'): 0.10344827586206896,
 ('but', 'in'): 0.2,
 ('in', 'less'): 0.1111111111111111,
 ('less', 'than'): 1.0,
 ('than', 'five'): 0.5,
 ('five', 'minutes'): 1.0,
 ('minutes', 'the'): 1.0,
 ('the', 'staircase'): 0.025,
 ('staircase', 'groaned'): 1.0,
 ('groaned', 'beneath'): 1.0,
 ('beneath', 'an'): 1.0,
 ('an', 'extraordinary'): 0.5,
 ('extraordinary', 'weight'): 1.0,
 ('weight', '</s>'): 1.0,
 ('<s>', 'at'): 0.034482758620689655,
 ('at', 'this'): 1.0,
 ('this', 'moment'): 0.16666666666666666,
 ('moment', 'the'): 1.0,
 ('the', 'whole'): 0.025,
 ('whole', 'soul'): 1.0,
 ('soul', 'of'): 1.0,
 ('of', 'the'): 0.3,
 ('the', 'old'): 0.1,
 ('old', 'man'): 0.75,
 ('man', 'seemed'): 0.2,
 ('seemed', 'centred'): 1.0,
 ('centred', 'in'): 1.0,
 ('in', 'his'): 0.1111111111111111,
 ('his', 'eyes'): 0.14285714285714285,
 ('eyes', 'whi

# 3. Calculate the log probability of a given sentence based on a corpus of text using bigrams

In the following quiz, a utility named `bigram_add1_logs` has been added for you; it applies Laplace (add-one) smoothing and returns the probabilities in log space. Write a function that calculates the log probability of a given sentence using this log probability dictionary. If all goes well, you should observe that more likely sentences yield higher values for the log probabilities.

In [10]:
# Importing the libraries
import numpy as np

In [11]:
def bigram_add1_logs(transcript_file):
    """
    Build a dictionary of add-one smoothed log probabilities for the bigrams of a transcript.

    Each bigram found in the file is scored as
    log((bigram count + 1) / (count of its first word + vocabulary size)),
    where the vocabulary size is the number of distinct tokens, tags included.
    :param transcript_file: string
        transcript_file is the path filename containing unpunctuated text sentences
    :return: dict
        bg_add1_log_dict: dictionary of smoothed bigrams log probabilities including
        tags: <s>: start of sentence, </s>: end of sentence, <unk>: unknown placeholder probability
    """
    corpus_tokens, corpus_bigrams = bigrams_from_transcript(transcript_file)
    unigram_freq = Counter(corpus_tokens)
    bigram_freq = Counter(corpus_bigrams)
    vocab_size = len(unigram_freq)

    smoothed_logs = {
        pair: np.log((freq + 1.) / (unigram_freq[pair[0]] + vocab_size))
        for pair, freq in bigram_freq.items()
    }
    # Placeholder used for bigrams that are not in the dictionary: log(1 / vocabulary size).
    smoothed_logs['<unk>'] = np.log(1. / vocab_size)
    return smoothed_logs


In [12]:
# Build the add-one smoothed log-probability table straight from the transcript file.
bg_addone_dict = bigram_add1_logs("./transcripts.txt")
# Bare expression so the notebook displays the dictionary as the cell output.
bg_addone_dict

{('<s>', 'go'): -5.0271645960474665,
 ('go', 'do'): -4.9344739331306915,
 ('do', 'you'): -3.857214768933151,
 ('you', 'hear'): -4.987025428457122,
 ('hear', '</s>'): -4.930870325627393,
 ('<s>', 'but'): -4.334017415487521,
 ('but', 'in'): -4.945207488773801,
 ('in', 'less'): -4.959341999708705,
 ('less', 'than'): -4.930870325627393,
 ('than', 'five'): -4.9344739331306915,
 ('five', 'minutes'): -4.930870325627393,
 ('minutes', 'the'): -4.930870325627393,
 ('the', 'staircase'): -5.062595033026967,
 ('staircase', 'groaned'): -4.930870325627393,
 ('groaned', 'beneath'): -4.930870325627393,
 ('beneath', 'an'): -4.930870325627393,
 ('an', 'extraordinary'): -4.9344739331306915,
 ('extraordinary', 'weight'): -4.930870325627393,
 ('weight', '</s>'): -4.930870325627393,
 ('<s>', 'at'): -5.0271645960474665,
 ('at', 'this'): -4.930870325627393,
 ('this', 'moment'): -4.948759890378168,
 ('moment', 'the'): -4.930870325627393,
 ('the', 'whole'): -5.062595033026967,
 ('whole', 'soul'): -4.930870325627

In [13]:
def log_prob_of_sentence(bigrams, bigram_log_dict):
    """
    Sum the log probabilities of a sequence of bigrams.
    :param bigrams: list
        bigrams: two-word tuples to score, in order
    :param bigram_log_dict: dict
        bigram_log_dict: log probability per bigram; its '<unk>' entry is used
        for any bigram that is not a key of the dictionary
    :return: float
        the accumulated log probability (0.0 for an empty list of bigrams)
    """
    running_total = 0.
    for pair in bigrams:
        # Fall back to the '<unk>' entry only when the bigram itself is missing.
        key = pair if pair in bigram_log_dict else '<unk>'
        running_total += bigram_log_dict[key]
    return running_total

In [47]:
# Score the bigrams currently in scope against the smoothed log-probability table.
# The result gets its own name: assigning it to `log_prob_of_sentence` would replace
# the function with a float, and running this cell again would then fail with
# "TypeError: 'float' object is not callable".
sentence_log_prob = log_prob_of_sentence(bigrams, bg_addone_dict)
# Bare expression so the notebook displays the value as the cell output.
sentence_log_prob

-2726.543310078376