In [20]:
import random
import math
from collections import Counter, defaultdict
import re


# Define Tokenization Function

In [21]:
# Custom Tokenizer
def tokenize(text):
    # Split text into words using regular expressions
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens


# Load and Preprocess Corpus

In [22]:
# Sample Corpus
corpus = """
Natural language processing enables machines to understand human language.
Text generation is a key application of NLP. Language models predict text sequences effectively.
"""

# Preprocessing Function
def preprocess_corpus(corpus, vocab_size=5000):
    tokens = tokenize(corpus)  # Tokenize text
    freq_dist = Counter(tokens)  # Count token frequencies
    vocab = set([word for word, _ in freq_dist.most_common(vocab_size)])  # Limit vocabulary size
    processed_tokens = [word if word in vocab else '<UNK>' for word in tokens]  # Replace rare words with <UNK>
    return processed_tokens, vocab

tokens, vocab = preprocess_corpus(corpus)


# Split Data into Train, Validation, and Test Sets

In [23]:
# Split data into training (70%), validation (10%), and testing (20%) sets
def split_data(tokens):
    train_size = int(0.7 * len(tokens))
    val_size = int(0.1 * len(tokens))
    train_data = tokens[:train_size]
    val_data = tokens[train_size:train_size + val_size]
    test_data = tokens[train_size + val_size:]
    return train_data, val_data, test_data

train_data, val_data, test_data = split_data(tokens)


# Build Backoff Model (LM1)

In [24]:
class BackoffModel:
    def __init__(self, n):
        self.n = n
        self.ngram_counts = defaultdict(Counter)

    def train(self, data):
        for n in range(1, self.n + 1):  # Train for 1-grams to n-grams
            for ngram in zip(*[data[i:] for i in range(n)]):
                self.ngram_counts[n][ngram] += 1

    def get_probability(self, context, word):
        for n in range(len(context), 0, -1):  # Back off from higher-order to lower-order n-grams
            ngram = tuple(context[-n:] + [word])
            if ngram in self.ngram_counts[n]:
                return self.ngram_counts[n][ngram] / sum(self.ngram_counts[n][tuple(context[-n:])].values())
        return 1e-6  # Small default probability for unseen words


# Build Interpolation Model (LM2)

In [26]:
# Train Backoff Model
lm1 = BackoffModel(n=4)
lm1.train(train_data)

# Train Interpolation Model
lambdas = [0.1, 0.2, 0.3, 0.4]
k = 0.1
lm2 = InterpolationModel(n=4, lambdas=lambdas, k=k)
lm2.train(train_data)


# Evaluate Models with Perplexity

In [33]:
class InterpolationModel:
    def __init__(self, n, lambdas, k):
        self.n = n
        self.lambdas = lambdas
        self.k = k
        self.ngram_counts = defaultdict(Counter)

    def train(self, data):
        for n in range(1, self.n + 1):  # Train for 1-grams to n-grams
            for ngram in zip(*[data[i:] for i in range(n)]):
                self.ngram_counts[n][ngram] += 1

    def get_probability(self, context, word):
        probabilities = []
        for n in range(1, self.n + 1):
            # Construct n-gram
            ngram = tuple(context[-(n-1):] + [word])

            # Count of the n-gram
            count = self.ngram_counts[n][ngram] + self.k

            # Count of the context
            context_ngram = tuple(context[-(n-1):])  # Context is n-1 words
            context_count = sum(self.ngram_counts[n][context_ngram].values()) if context_ngram in self.ngram_counts[n] else 0

            # Total count with smoothing
            total = context_count + self.k * len(vocab)

            # Probability calculation
            probabilities.append(count / total if total > 0 else 0)

        # Interpolated probability
        return sum(l * p for l, p in zip(self.lambdas, probabilities))


In [34]:
# Initialize and train InterpolationModel
lambdas = [0.1, 0.2, 0.3, 0.4]  # Example weights
k = 0.1  # Add-k smoothing factor
lm2 = InterpolationModel(n=4, lambdas=lambdas, k=k)
lm2.train(train_data)


In [35]:
# Calculate perplexity for both models
perplexity_lm1 = calculate_perplexity(lm1, test_data)  # Backoff model
perplexity_lm2 = calculate_perplexity(lm2, test_data)  # Interpolation model

# Print results
print(f"Perplexity of Backoff Model (LM1): {perplexity_lm1}")
print(f"Perplexity of Interpolation Model (LM2): {perplexity_lm2}")


Perplexity of Backoff Model (LM1): 999999.9999999999
Perplexity of Interpolation Model (LM2): 20.000000000000004


# Generate Text

In [36]:
def generate_text(model, seed, max_length=50):
    text = seed[:]
    for _ in range(max_length - len(seed)):
        context = text[-3:]
        next_word = max(vocab, key=lambda w: model.get_probability(context, w))
        text.append(next_word)
        if next_word == '<END>':
            break
    return ' '.join(text)

# Example Text Generation
seed = ["natural", "language", "processing"]
generated_text_lm1 = generate_text(lm1, seed)
generated_text_lm2 = generate_text(lm2, seed)

print("Generated Text (Backoff Model):", generated_text_lm1)
print("Generated Text (Interpolation Model):", generated_text_lm2)


Generated Text (Backoff Model): natural language processing language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language language
Generated Text (Interpolation Model): natural language processing enables machines to understand human language text generation is a key application of language text generation is a key application of language text generation is a key application of language text generation is a key application of language text generation is a key application of language text
