<a href="https://colab.research.google.com/github/rahul-bellam/nlp-lab/blob/main/4th_lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from collections import Counter, defaultdict
import numpy as np
import math
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Step 1: Define the training corpus
# Four short English sentences. The triple-quoted literal starts and ends with a
# newline, so the string has leading and trailing whitespace.
corpus = """
This is a sample corpus for language modeling.
It contains multiple sentences.
Language models are used in various NLP applications.
We will implement unigram, bigram, and trigram models.
"""


In [3]:

# Step 2: Lower-case the corpus, then tokenize it with NLTK's word_tokenize
lowercased_corpus = corpus.lower()
tokens = word_tokenize(lowercased_corpus)


In [4]:

# Step 3: Generate n-grams
def generate_ngrams(tokens, n):
    """Return every n-gram of order ``n`` over ``tokens`` as a list of tuples.

    Delegates to ``nltk.util.ngrams`` and materialises its result so the
    n-grams can be measured with ``len`` and iterated more than once.
    """
    return [*ngrams(tokens, n)]

# Build the order-1, order-2 and order-3 n-gram lists from the same token stream.
unigrams, bigrams, trigrams = (
    generate_ngrams(tokens, order) for order in (1, 2, 3)
)


In [5]:

# Step 4: Count n-grams
# One Counter per n-gram order, keyed by the n-gram tuple.
unigram_counts, bigram_counts, trigram_counts = map(
    Counter, (unigrams, bigrams, trigrams)
)


In [6]:

# Step 5: Compute probabilities with Laplace Smoothing
def laplace_smoothing(ngram_counts, total_count, vocab_size, k=1):
    """Turn raw n-gram counts into add-k smoothed probabilities.

    Each n-gram present in ``ngram_counts`` is mapped to
    ``(count + k) / (total_count + k * vocab_size)``.

    Args:
        ngram_counts: Mapping from n-gram to its observed count.
        total_count: Value added to ``k * vocab_size`` in the denominator
            (the callers in this notebook pass the sum of all counts).
        vocab_size: Vocabulary size used to scale ``k`` in the denominator.
        k: Smoothing constant added to every count (default 1).

    Returns:
        A new dict with the same keys as ``ngram_counts`` and the smoothed
        probabilities as values. N-grams absent from ``ngram_counts`` get no
        entry.
    """
    return {
        gram: (freq + k) / (total_count + k * vocab_size)
        for gram, freq in ngram_counts.items()
    }

# Vocabulary size = number of distinct tokens in the corpus.
vocab_size = len(set(tokens))

# Smooth each count table against its own total, sharing one vocabulary size.
unigram_probs, bigram_probs, trigram_probs = (
    laplace_smoothing(counts, sum(counts.values()), vocab_size)
    for counts in (unigram_counts, bigram_counts, trigram_counts)
)


In [7]:

# Step 6: Compute perplexity
def compute_perplexity(test_sentence, ngram_probs, n, vocab_size, k=1, total_count=None):
    """Compute the perplexity of ``test_sentence`` under an n-gram probability table.

    The sentence is lower-cased, tokenized with ``word_tokenize`` and split
    into n-grams of order ``n``. The result is ``exp(-mean(log p))`` over
    those n-grams.

    Args:
        test_sentence: Raw text to score.
        ngram_probs: Mapping from n-gram tuple to its smoothed probability.
        n: Order of the n-grams to score.
        vocab_size: Vocabulary size used in the fallback denominator.
        k: Add-k smoothing constant (default 1).
        total_count: Optional total number of training n-grams of order ``n``.
            When given, an n-gram missing from ``ngram_probs`` is scored as
            ``k / (total_count + k * vocab_size)``, i.e. the add-k formula
            with a count of zero. When omitted, the legacy fallback is kept
            for backward compatibility: the denominator uses the sum of the
            stored probabilities in place of the count, which can give an
            unseen n-gram a higher probability than a seen one. Pass
            ``total_count`` to avoid that.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If the sentence yields no n-grams of order ``n`` (for
            example, it has fewer than ``n`` tokens), since the mean
            log-probability is undefined in that case.
    """
    test_tokens = word_tokenize(test_sentence.lower())
    test_ngrams = generate_ngrams(test_tokens, n)
    num_ngrams = len(test_ngrams)
    if num_ngrams == 0:
        raise ValueError(
            f"cannot compute perplexity: sentence has no {n}-grams "
            f"({len(test_tokens)} token(s) found)"
        )

    # The fallback does not depend on the n-gram being scored, so compute it
    # once instead of re-summing the whole table on every iteration.
    if total_count is None:
        fallback_base = sum(ngram_probs.values())
    else:
        fallback_base = total_count
    unseen_prob = k / (fallback_base + k * vocab_size)

    log_prob_sum = 0
    for ngram in test_ngrams:
        log_prob_sum += math.log(ngram_probs.get(ngram, unseen_prob))

    return math.exp(-log_prob_sum / num_ngrams)


In [8]:

# Example test sentence: the first four tokens of the corpus, so every n-gram
# of order 1-3 in it also occurs in the training data.
test_sentence = "This is a sample"
unigram_perplexity, bigram_perplexity, trigram_perplexity = (
    compute_perplexity(test_sentence, probs, order, vocab_size)
    for order, probs in enumerate((unigram_probs, bigram_probs, trigram_probs), start=1)
)

print("Unigram Perplexity:", unigram_perplexity)
print("Bigram Perplexity:", bigram_perplexity)
print("Trigram Perplexity:", trigram_perplexity)

Unigram Perplexity: 31.0
Bigram Perplexity: 30.499999999999986
Trigram Perplexity: 30.000000000000004


In [9]:

# Second test sentence: "useful" never occurs in the corpus, so at least one
# n-gram of every order is unseen and takes the fallback probability.
test_sentence = "language models are useful"
unigram_perplexity, bigram_perplexity, trigram_perplexity = (
    compute_perplexity(test_sentence, probs, order, vocab_size)
    for order, probs in enumerate((unigram_probs, bigram_probs, trigram_probs), start=1)
)

print("Unigram Perplexity:", unigram_perplexity)
print("Bigram Perplexity:", bigram_perplexity)
print("Trigram Perplexity:", trigram_perplexity)


Unigram Perplexity: 24.892879701299158
Bigram Perplexity: 30.01980174164004
Trigram Perplexity: 29.5296461204668
