In [279]:
# IMPORTS
import math
from collections import Counter
from functools import reduce
from string import punctuation
from typing import Optional

from nltk import sent_tokenize

# Unsmoothed n-grams

To start, you will write a program that computes unsmoothed unigram and bigram probabilities from
the training corpus.

You are given a tokenized opinion spam corpus as input. You may want to do additional preprocessing, based on the design decisions you make.

You may use existing tools just for the purpose of preprocessing, but you must write the code for gathering n-gram counts and computing n-gram probabilities yourself.

For example, consider the simple corpus consisting of the sole sentence: 

`the students like the assignment`

Part of what your program would compute for a unigram and bigram model, for example, would be the
following:

```
P (the) = 0.4, P (like) = 0.2
P (the|like) = 1.0, P (students|the) = 0.5
```

**Preprocessing**: The files included are already tokenized and hence it should be straightforward to
obtain the tokens by using space as the delimiter. Feel free to do any other preprocessing that you might
think is important for this corpus.

In [280]:
# Read in train set
with open('./A1_DATASET/train.txt', 'r') as f:
    train_text = f.read().splitlines()

# Split train set into sentences
train_sents = []
for line in train_text:
    train_sents += sent_tokenize(line)

In [281]:
# Example train sentence
train_sents[3]

'I am looking at a brick wall , and getting no sleep .'

In [282]:
def preprocess(text: str) -> str:
    """
    Preprocesses the text by doing the following:
    - Removt punctuation
    - Convert to lowercase
    """
    text = text.translate(str.maketrans('', '', punctuation))
    text = text.lower()
    
    return text

In [283]:
train = [preprocess(text).split() for text in train_sents]

In [284]:
# Example preprocessed train sentence
print(train[3])

['i', 'am', 'looking', 'at', 'a', 'brick', 'wall', 'and', 'getting', 'no', 'sleep']


In [285]:
def ngrams(text: list, n: int) -> list[tuple]:
    """
    Returns a list of n-grams from the text
    Includes start and end tokens
    """
    start_tokens = n - 1 if n > 1 else 1
    text = ['<s>'] * start_tokens + text + ['</s>']
    return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]

In [286]:
# Compute unigrams
train_unigrams = [ngrams(sent, 1) for sent in train]
train_unigram_counts = reduce(lambda x, y: Counter(x) + Counter(y), train_unigrams)

train_total_tokens = sum(train_unigram_counts.values())
train_vocab_size = len(train_unigram_counts)
train_unigram_probs = {k: v / train_total_tokens for k, v in train_unigram_counts.items()}

In [287]:
# Display top 10 unigram counts
train_unigram_counts.most_common(10)

[(('<s>',), 5406),
 (('</s>',), 5406),
 (('the',), 5302),
 (('and',), 2593),
 (('a',), 2247),
 (('to',), 2090),
 (('was',), 1826),
 (('i',), 1711),
 (('in',), 1260),
 (('we',), 1117)]

In [288]:
# Display top 10 unigram probabilities
sorted(train_unigram_probs.items(), key=lambda x: x[1], reverse=True)[:10]

[(('<s>',), 0.05992617308310517),
 (('</s>',), 0.05992617308310517),
 (('the',), 0.05877332032678942),
 (('and',), 0.028743723049295542),
 (('a',), 0.0249082706100143),
 (('to',), 0.023167906352883794),
 (('was',), 0.020241433971466893),
 (('i',), 0.01896664486592544),
 (('in',), 0.01396725454767157),
 (('we',), 0.012382082007737416)]

In [289]:
# Compute bigrams
train_bigrams = [ngrams(sent, 2) for sent in train]
train_bigram_counts = reduce(lambda x, y: Counter(x) + Counter(y), train_bigrams)
train_bigram_probs = {k: v / train_unigram_counts[(k[0],)] for k, v in train_bigram_counts.items()}

In [290]:
# Display top 10 bigram counts
train_bigram_counts.most_common(10)

[(('<s>', 'the'), 991),
 (('<s>', 'i'), 710),
 (('<s>', 'we'), 462),
 (('the', 'hotel'), 414),
 (('in', 'the'), 412),
 (('of', 'the'), 343),
 (('at', 'the'), 333),
 (('the', 'room'), 295),
 (('and', 'the'), 281),
 (('to', 'the'), 264)]

In [291]:
# Pack unigram and bigram counts into a dictionary
train_ngram_counts = {
    1: train_unigram_counts,
    2: train_bigram_counts
}

# Smoothing and unknown words

Firstly, you should implement at least one method to handle unknown words. 

Then, you will need to implement two smoothing methods (e.g., Laplace, Add-k smoothing) with different values of k. 

Teams can choose any method(s) they prefer for each. The report should clearly state the selected methods, providing a description for any non-standard approach (e.g., an approach not covered in class).

In [292]:
# Replace words with <unk> if they appear less than or equal to "n" times
n = 1
unk_words = {k[0] for k, v in train_unigram_counts.items() if v <= n}

# Replace decided unk_words with <UNK> in train set
unk_train = [
    ["<UNK>" if word in unk_words else word for word in sent] for sent in train
]

In [293]:
# Example sentence with <UNK> tokens
for sent in unk_train:
    if "<UNK>" in sent:
        print(sent)
        break

['when', 'speaking', 'to', 'the', 'front', 'desk', 'i', 'was', 'told', 'that', 'they', 'were', 'simply', '<UNK>', 'my', 'request', 'for', 'an', 'upper', 'floor', 'which', 'i', 'had', 'requested', 'for', 'a', 'better', 'view']


In [294]:
# Compute unigrams
train_unk_unigrams = [ngrams(sent, 1) for sent in unk_train]
train_unk_unigram_counts = reduce(lambda x, y: Counter(x) + Counter(y), train_unk_unigrams)
train_unk_unigram_probs = {k: v / train_total_tokens for k, v in train_unk_unigram_counts.items()}

In [295]:
# Display top 10 unigram counts
train_unk_unigram_counts.most_common(10)

[(('<s>',), 5406),
 (('</s>',), 5406),
 (('the',), 5302),
 (('<UNK>',), 3113),
 (('and',), 2593),
 (('a',), 2247),
 (('to',), 2090),
 (('was',), 1826),
 (('i',), 1711),
 (('in',), 1260)]

In [296]:
# Display top 10 unigram probabilities
sorted(train_unk_unigram_probs.items(), key=lambda x: x[1], reverse=True)[:10]

[(('<s>',), 0.05992617308310517),
 (('</s>',), 0.05992617308310517),
 (('the',), 0.05877332032678942),
 (('<UNK>',), 0.034507986830874283),
 (('and',), 0.028743723049295542),
 (('a',), 0.0249082706100143),
 (('to',), 0.023167906352883794),
 (('was',), 0.020241433971466893),
 (('i',), 0.01896664486592544),
 (('in',), 0.01396725454767157)]

In [297]:
# Compute bigrams
train_unk_bigrams = [ngrams(sent, 2) for sent in unk_train]
train_unk_bigram_counts = reduce(lambda x, y: Counter(x) + Counter(y), train_unk_bigrams)
train_unk_bigram_probs = {k: v / train_unk_unigram_counts[(k[0],)] for k, v in train_unk_bigram_counts.items()}

In [298]:
# Display top 10 bigram counts
train_unk_bigram_counts.most_common(10)

[(('<s>', 'the'), 991),
 (('<s>', 'i'), 710),
 (('<s>', 'we'), 462),
 (('<UNK>', '</s>'), 443),
 (('the', 'hotel'), 414),
 (('in', 'the'), 412),
 (('of', 'the'), 343),
 (('at', 'the'), 333),
 (('the', 'room'), 295),
 (('and', 'the'), 281)]

In [299]:
def laplace_smoothing(ngram: tuple[str, ...], ngram_counts: dict[int, dict], vocab_size: int, total_tokens: Optional[int] = None) -> float:
    """
    Returns the Laplace smoothed probability for a given ngram
    """
    n = len(ngram)
    if n == 1:
        return (ngram_counts[1][ngram] + 1) / (total_tokens + vocab_size)
    else:
        return (ngram_counts[n][ngram] + 1) / (ngram_counts[n-1][ngram[:-1]] + vocab_size)

In [300]:
def add_k_smoothing(ngram: tuple[str, ...], ngram_counts: dict[int, dict], vocab_size: int, total_tokens: Optional[int] = None, k: int = 1) -> float:
    """
    Returns the add-k smoothed probability for a given ngram
    """
    n = len(ngram)
    if n == 1:
        return (ngram_counts[1][ngram] + k) / (total_tokens + k * vocab_size)
    else:
        return (ngram_counts[n][ngram] + k) / (ngram_counts[n-1][ngram[:-1]] + k * vocab_size)

# Perplexity

Implement code to compute the perplexity of a `development set` (*A `development set` is just another way to refer to the validation set—part of a dataset distinct from the training portion)*. Compute and report the perplexity of your model (with variations) on it. Compute perplexity as follows:

![](https://i.gyazo.com/4069027ef1730b89862b96cbb1526b7e.png)

Where:

- N is the total number of tokens in the test corpus.
- P(w_i|w_i1, ..., w_in+1) is the n-gram probability of your model.

Under the second definition above, perplexity is a function of the average (per-word) log probability. Use this to avoid numerical computation errors.

If you experimented with more than one type of smoothing and unknown word handling, you should report and compare the perplexity results of experiments among some of them.

In [301]:
# Read in test set
with open('./A1_DATASET/val.txt', 'r') as f:
    val_text = f.read().splitlines()
    
# Split test set into sentences
val_sents = []
for line in val_text:
    val_sents += sent_tokenize(line)

In [302]:
val = [preprocess(text).split() for text in val_sents]

In [322]:
val_unigrams = [ngrams(sent, 1) for sent in val]
val_bigrams = [ngrams(sent, 2) for sent in val]

val_total_tokens = sum([len(sent) for sent in val_unigrams])

In [305]:
def perplexity(sent_probs: list[float], total_tokens: int, logged: Optional[bool] = False) -> float:
    """
    Returns the perplexity of the test dataset
    """
    if logged:
        l = (1 / total_tokens) * sum(sent_probs)
    else:
        l = (1 / total_tokens) * sum([math.log2(prob) for prob in sent_probs])
    return 2 ** -l

**Perplexity For Train Unigrams**

4 Methods:
- Default model
- \<UNK\> words
- Laplace Smoothing
- Add-K Smoothing
    - k=0.5
    - k=0.05
    - k=0.01

In [306]:
# Default model
train_sent_probs = [
    sum(math.log2(train_unigram_probs[unigram]) for unigram in sent_unigrams)
    for sent_unigrams in train_unigrams
]

print(
    "(Perplexity | Unigrams):",
    perplexity(train_sent_probs, train_total_tokens, logged=True),
)

(Perplexity | Unigrams): 386.9407731234205


In [307]:
# Using <UNK> tokens
train_sent_probs_unk = [
    sum(math.log2(train_unk_unigram_probs[unigram]) for unigram in sent_unigrams)
    for sent_unigrams in train_unk_unigrams
]

print(
    "(Perplexity | Unigrams | <UNK>):",
    perplexity(train_sent_probs_unk, train_total_tokens, logged=True),
)

(Perplexity | Unigrams | <UNK>): 293.1581119212855


In [308]:
# Using Laplace Smoothing
train_sent_probs_lp = [
    sum(
        math.log2(
            laplace_smoothing(
                ngram=unigram,
                ngram_counts=train_ngram_counts,
                vocab_size=train_vocab_size,
                total_tokens=train_total_tokens,
            )
        )
        for unigram in sent_unigrams
    )
    for sent_unigrams in train_unigrams
]

print(
    "(Perplexity | Unigrams | Laplace Smoothing):",
    perplexity(train_sent_probs_lp, train_total_tokens, logged=True),
)

(Perplexity | Unigrams | Laplace Smoothing): 391.5586878701987


In [309]:
# Using Add-k Smoothing
k = [0.5, 0.05, 0.01]
for k_val in k:
    train_sent_probs_addk = [
        sum(
            math.log2(
                add_k_smoothing(
                    ngram=unigram,
                    ngram_counts=train_ngram_counts,
                    vocab_size=train_vocab_size,
                    total_tokens=train_total_tokens,
                    k=k_val,
                )
            )
            for unigram in sent_unigrams
        )
        for sent_unigrams in train_unigrams
    ]
    print(
        f"(Perplexity | Unigrams | Add-K Smoothing | k={k_val}):",
        perplexity(train_sent_probs_addk, train_total_tokens, logged=True),
    )

(Perplexity | Unigrams | Add-K Smoothing | k=0.5): 388.3597028637293
(Perplexity | Unigrams | Add-K Smoothing | k=0.05): 386.9588354529094
(Perplexity | Unigrams | Add-K Smoothing | k=0.01): 386.9415141647935


**Perplexity For Train Bigrams**

4 Methods:
- Default model
- \<UNK\> words
- Laplace Smoothing
- Add-K Smoothing
    - k=0.5
    - k=0.05
    - k=0.01

In [310]:
# Default model
train_sent_probs = [
    sum(math.log2(train_bigram_probs[bigram]) for bigram in sent_bigrams)
    for sent_bigrams in train_bigrams
]

print(
    "(Perplexity | Bigrams):",
    perplexity(train_sent_probs, train_total_tokens, logged=True),
)

(Perplexity | Bigrams): 24.25418362940641


In [311]:
# Using <UNK> tokens
train_sent_probs_unk = [
    sum(math.log2(train_unk_bigram_probs[bigram]) for bigram in sent_bigrams)
    for sent_bigrams in train_unk_bigrams
]

print(
    "(Perplexity | Bigrams | <UNK> Tokens):",
    perplexity(train_sent_probs_unk, train_total_tokens, logged=True),
)

(Perplexity | Bigrams | <UNK> Tokens): 26.208412952642487


In [324]:
# Using Laplace Smoothing
train_sent_probs_lp = [
    sum(
        math.log2(
            laplace_smoothing(
                ngram=bigram,
                ngram_counts=train_ngram_counts,
                vocab_size=train_vocab_size,
            )
        )
        for bigram in sent_bigrams
    )
    for sent_bigrams in train_bigrams
]

print(
    "(Perplexity | Bigrams | Laplace Smoothing):",
    perplexity(train_sent_probs_lp, train_total_tokens, logged=True),
)

(Perplexity | Bigrams | Laplace Smoothing): 543.8729432319209


In [325]:
# Using Add-k Smoothing
k = [0.5, 0.05, 0.01]
for k_val in k:
    train_sent_probs_addk = [
        sum(
            math.log2(
                add_k_smoothing(
                    ngram=bigram,
                    ngram_counts=train_ngram_counts,
                    vocab_size=train_vocab_size,
                    k=k_val,
                )
            )
            for bigram in sent_bigrams
        )
        for sent_bigrams in train_bigrams
    ]
    print(
        f"(Perplexity | Bigrams | Add-K Smoothing | k={k_val}):",
        perplexity(train_sent_probs_addk, train_total_tokens, logged=True),
    )

(Perplexity | Bigrams | Add-K Smoothing | k=0.5): 355.49563626110097
(Perplexity | Bigrams | Add-K Smoothing | k=0.05): 94.56777334360764
(Perplexity | Bigrams | Add-K Smoothing | k=0.01): 49.32755017057055


**Perplexity For Validation Unigrams**

3 Methods:
- \<UNK\> words
- Laplace Smoothing
- Add-K Smoothing
    - k=0.5
    - k=0.05
    - k=0.01

In [314]:
# Using <UNK> tokens
val_sent_probs_unk = [
    math.prod(
        train_unk_unigram_probs.get(unigram, train_unk_unigram_probs[("<UNK>",)])
        for unigram in sent_unigrams
    )
    for sent_unigrams in val_unigrams
]

print(
    "(Perplexity | Unigrams | <UNK> Tokens):",
    perplexity(val_sent_probs_unk, val_total_tokens),
)

(Perplexity | Unigrams | <UNK> Tokens): 259.53523951181285


In [326]:
# Using Laplace Smoothing
val_sent_probs_lp = [
    math.prod(
        laplace_smoothing(
            ngram=unigram,
            ngram_counts=train_ngram_counts,
            vocab_size=train_vocab_size,
            total_tokens=train_total_tokens,
        )
        for unigram in sent_unigrams
    )
    for sent_unigrams in val_unigrams
]

print(
    "(Perplexity | Unigrams | Laplace Smoothing):",
    perplexity(val_sent_probs_lp, val_total_tokens),
)

(Perplexity | Unigrams | Laplace Smoothing): 405.54114005777086


In [327]:
# Using Add-k Smoothing
k = [0.5, 0.05, 0.01]
for k_val in k:
    val_sent_probs_addk = [
        math.prod(
            add_k_smoothing(
                ngram=unigram,
                ngram_counts=train_ngram_counts,
                vocab_size=train_vocab_size,
                total_tokens=train_total_tokens,
                k=k_val,
            )
            for unigram in sent_unigrams
        )
        for sent_unigrams in val_unigrams
    ]
    print(
        f"(Perplexity | Unigrams | Add-K Smoothing | k={k_val}):",
        perplexity(val_sent_probs_addk, val_total_tokens),
    )

(Perplexity | Unigrams | Add-K Smoothing | k=0.5): 408.57817450839923
(Perplexity | Unigrams | Add-K Smoothing | k=0.05): 434.64146521784977
(Perplexity | Unigrams | Add-K Smoothing | k=0.01): 456.93220723622795


**Perplexity For Validation Bigrams**

3 Methods:
- \<UNK\> words
- Laplace Smoothing
- Add-K Smoothing
    - k=0.5
    - k=0.05
    - k=0.01

In [329]:
# Using <UNK> tokens
val_sent_probs_unk = [
    math.prod(
        train_unk_bigram_probs.get(
            tuple("<UNK>" if word in unk_words else word for word in bigram),
            train_unk_bigram_probs[("<UNK>", "<UNK>")],
        )
        for bigram in sent_bigrams
    )
    for sent_bigrams in val_bigrams
]

print(
    "(Perplexity | Bigrams | <UNK> Tokens):",
    perplexity(val_sent_probs_unk, val_total_tokens),
)

(Perplexity | Bigrams | <UNK> Tokens): 20.481719165161444


In [330]:
# Using Laplace Smoothing
val_sent_probs_lp = [
    math.prod(
        laplace_smoothing(
            ngram=bigram, ngram_counts=train_ngram_counts, vocab_size=train_vocab_size
        )
        for bigram in sent_bigrams
    )
    for sent_bigrams in val_bigrams
]

print(
    "(Perplexity | Bigrams | Laplace Smoothing):",
    perplexity(val_sent_probs_lp, val_total_tokens),
)

(Perplexity | Bigrams | Laplace Smoothing): 705.2629317593495


In [331]:
# Using Add-k Smoothing
k = [0.5, 0.05, 0.01]
for k_val in k:
    val_sent_probs_k = [
        math.prod(
            add_k_smoothing(
                ngram=bigram,
                ngram_counts=train_ngram_counts,
                vocab_size=train_vocab_size,
                k=k_val,
            )
            for bigram in sent_bigrams
        )
        for sent_bigrams in val_bigrams
    ]
    print(
        f"(Perplexity | Bigrams | Add-K Smoothing | k={k_val}):",
        perplexity(val_sent_probs_k, val_total_tokens),
    )

(Perplexity | Bigrams | Add-K Smoothing | k=0.5): 525.5916751239168
(Perplexity | Bigrams | Add-K Smoothing | k=0.05): 252.439489240591
(Perplexity | Bigrams | Add-K Smoothing | k=0.01): 209.83712354693142
