<a href="https://colab.research.google.com/github/rahul-bellam/nlp-lab/blob/main/Lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install nltk
import nltk
nltk.download('punkt_tab')
import math
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    return tokens

# Function to generate n-grams
def generate_ngrams(tokens, n):
    return list(ngrams(tokens, n, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))

# Function to train n-gram model
def train_ngram_model(tokens, n):
    return Counter(generate_ngrams(tokens, n))

# Laplace smoothing function
def laplace_smoothing(ngram_counts, n_minus1_counts, vocab_size, ngram):
    return (ngram_counts[ngram] + 1) / (n_minus1_counts[ngram[:-1]] + vocab_size)

# Compute perplexity
def compute_perplexity(test_sentence, ngram_counts, n_minus1_counts, vocab_size, n):
    test_tokens = preprocess_text(test_sentence)
    test_ngrams = generate_ngrams(test_tokens, n)
    log_prob_sum = 0
    for ngram in test_ngrams:
        prob = laplace_smoothing(ngram_counts, n_minus1_counts, vocab_size, ngram)
        log_prob_sum += math.log(prob)
    perplexity = math.exp(-log_prob_sum / len(test_ngrams))
    return perplexity

# Sample corpus
corpus = """
    Natural language processing is a subfield of artificial intelligence.
    It enables computers to understand human language.
    Language models are essential in NLP tasks.
"""

# Preprocessing corpus
tokens = preprocess_text(corpus)
vocab = set(tokens)
vocab_size = len(vocab) + 1  # For smoothing

# Train models
unigram_counts = train_ngram_model(tokens, 1)
bigram_counts = train_ngram_model(tokens, 2)
trigram_counts = train_ngram_model(tokens, 3)

# Compute n-1 gram counts
unigram_context_counts = Counter([ug[0] for ug in generate_ngrams(tokens, 1)])
bigram_context_counts = Counter([bg[0] for bg in generate_ngrams(tokens, 1)])
trigram_context_counts = Counter([tg[:2] for tg in generate_ngrams(tokens, 2)])

# User input for test sentence
test_sentence = input("Enter a sentence: ")
print("Perplexity for Unigram Model:", compute_perplexity(test_sentence, unigram_counts, unigram_context_counts, vocab_size, 1))
print("Perplexity for Bigram Model:", compute_perplexity(test_sentence, bigram_counts, bigram_context_counts, vocab_size, 2))
print("Perplexity for Trigram Model:", compute_perplexity(test_sentence, trigram_counts, trigram_context_counts, vocab_size, 3))



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter a sentence: rahul is a good man
Perplexity for Unigram Model: 17.430740514869576
Perplexity for Bigram Model: 20.490670517227798
Perplexity for Trigram Model: 23.140264701838817


In [7]:
from collections import defaultdict
class LaplaceSmoothing:
    def __init__(self, corpus):
        self.unigrams = defaultdict(int)
        self.bigrams = defaultdict(int)
        self.total_unigrams = 0
        self.vocab_size = 0

        self.train(corpus)

    def train(self, corpus):
        vocab = set()
        for sentence in corpus:
            tokens = sentence.split()
            vocab.update(tokens)
            for i in range(len(tokens)):
                self.unigrams[tokens[i]] += 1
                self.total_unigrams += 1
                if i > 0:
                    self.bigrams[(tokens[i-1], tokens[i])] += 1

        self.vocab_size = len(vocab)

    def bigram_prob(self, word1, word2):
        return (self.bigrams[(word1, word2)] + 1) / (self.unigrams[word1] + self.vocab_size)

corpus = ["the cat sat on the mat", "the dog sat on the mat"]
model = LaplaceSmoothing(corpus)

print(f"P(cat | the): {model.bigram_prob('the', 'cat'):.3f}")


P(cat | the): 0.200


In [8]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from collections import defaultdict
import math

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')

class NGramModel:
    def __init__(self, n, smoothing=0.001): # Changed _init_ to __init__
        """
        Initialize the N-Gram model.
        :param n: Order of the n-gram (1 for unigram, 2 for bigram, etc.)
        :param smoothing: Smoothing factor (Lidstone Smoothing)
        """
        self.n = n
        self.model = defaultdict(lambda: 0)  # Stores n-gram counts
        self.context_counts = defaultdict(lambda: 0)  # Stores (N-1)-gram counts
        self.smoothing = smoothing  # Lidstone smoothing factor
        self.vocab_size = 0  # Vocabulary size

    def tokenize_and_pad(self, corpus):
        """
        Tokenizes and adds padding to the corpus.
        :param corpus: Input text corpus
        :return: Tokenized and padded n-grams list
        """
        tokens = word_tokenize(corpus.lower())  # Convert text to lowercase and tokenize
        self.vocab_size = len(set(tokens))  # Count unique words (Vocabulary size)

        # Generate n-grams with padding
        ngrams_list = list(ngrams(tokens, self.n, pad_left=True, pad_right=True,
                                  left_pad_symbol='<s>', right_pad_symbol='</s>'))
        return ngrams_list

    def train(self, corpus):
        """
        Train the N-Gram model on a given text corpus.
        :param corpus: Input text corpus
        """
        ngrams_list = self.tokenize_and_pad(corpus)  # Tokenize and pad the corpus

        # Count N-grams
        fdist = FreqDist(ngrams_list)
        self.model = fdist  # Store n-gram counts

        # Count (N-1)-grams for probability calculation
        for ngram in ngrams_list:
            context = ngram[:-1]  # Extract (N-1) prefix
            self.context_counts[context] += 1  # Count occurrences

    def raw_probability(self, ngram):
        """
        Compute raw probability without smoothing.
        :param ngram: The n-gram tuple
        :return: Raw probability
        """
        count_ngram = self.model[ngram]  # Count of the full N-gram
        context = ngram[:-1]  # Extract (N-1)-gram
        count_context = self.context_counts[context]  # Count of (N-1)-gram

        # Avoid division by zero
        if count_context == 0:
            return 0
        return count_ngram / count_context

    def smoothed_probability(self, ngram):
        """
        Compute probability with Lidstone smoothing.
        :param ngram: The n-gram tuple
        :return: Smoothed probability
        """
        count_ngram = self.model[ngram]  # Count of the full N-gram
        context = ngram[:-1]  # Extract (N-1)-gram
        count_context = self.context_counts[context]  # Count of (N-1)-gram

        # Apply Lidstone Smoothing
        smoothed_prob = (count_ngram + self.smoothing) / (count_context + self.vocab_size * self.smoothing)
        return smoothed_prob

    def calculate_perplexity(self, test_sentence):
        """
        Compute perplexity of a test sentence.
        :param test_sentence: Input test sentence
        :return: Perplexity value
        """
        ngrams_list = self.tokenize_and_pad(test_sentence)  # Tokenize and pad test sentence

        log_prob_sum = 0
        for ngram in ngrams_list:
            prob_before = self.raw_probability(ngram)  # Raw probability (before smoothing)
            prob_after = self.smoothed_probability(ngram)  # Smoothed probability

            print(f"N-gram: {ngram}, Raw Prob: {prob_before}, Smoothed Prob: {prob_after}")  # Debug Info

            log_prob_sum += math.log2(prob_after)  # Sum log probabilities

        N = len(ngrams_list)  # Total number of n-grams in test sentence
        perplexity = 2 ** (-log_prob_sum / N)  # Compute perplexity
        return perplexity

# Example usage
corpus = "This is a test sentence. This is another test sentence."
test_sentence = "This is a test."

# Unigram Model
print("\n---- Unigram Model ----")
unigram_model = NGramModel(n=1)
unigram_model.train(corpus)
print("Unigram Perplexity:", unigram_model.calculate_perplexity(test_sentence))

# Bigram Model
print("\n---- Bigram Model ----")
bigram_model = NGramModel(n=2)
bigram_model.train(corpus)
print("Bigram Perplexity:", bigram_model.calculate_perplexity(test_sentence))

# Trigram Model
print("\n---- Trigram Model ----")
trigram_model = NGramModel(n=3)
trigram_model.train(corpus)
print("Trigram Perplexity:", trigram_model.calculate_perplexity(test_sentence))


---- Unigram Model ----
N-gram: ('this',), Raw Prob: 0.16666666666666666, Smoothed Prob: 0.16668054977092875
N-gram: ('is',), Raw Prob: 0.16666666666666666, Smoothed Prob: 0.16668054977092875
N-gram: ('a',), Raw Prob: 0.08333333333333333, Smoothed Prob: 0.08338192419825072
N-gram: ('test',), Raw Prob: 0.16666666666666666, Smoothed Prob: 0.16668054977092875
N-gram: ('.',), Raw Prob: 0.16666666666666666, Smoothed Prob: 0.16668054977092875
Unigram Perplexity: 6.890927457103827

---- Bigram Model ----
N-gram: ('<s>', 'this'), Raw Prob: 1.0, Smoothed Prob: 0.9960199004975124
N-gram: ('this', 'is'), Raw Prob: 1.0, Smoothed Prob: 0.9980049875311721
N-gram: ('is', 'a'), Raw Prob: 0.5, Smoothed Prob: 0.4992518703241895
N-gram: ('a', 'test'), Raw Prob: 1.0, Smoothed Prob: 0.9960199004975124
N-gram: ('test', '.'), Raw Prob: 0.0, Smoothed Prob: 0.0004987531172069826
N-gram: ('.', '</s>'), Raw Prob: 0.5, Smoothed Prob: 0.4992518703241895
Bigram Perplexity: 4.4836775498499035

---- Trigram Model --

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
