Assignment 7

In [None]:
import re
from collections import defaultdict, Counter
import random

class NgramAutocomplete:
    """Word-level n-gram model for next-word prediction and text generation.

    Each (n-1)-word prefix seen during training is mapped to a ``Counter`` of
    the words that followed it.  With ``n == 1`` the only prefix is the empty
    tuple, so the model reduces to plain word frequencies.
    """

    def __init__(self, n=3):
        """Create an untrained model.

        Args:
            n: N-gram size, i.e. a prefix of ``n - 1`` words plus the word
                being predicted.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n!r}")
        self.n = n  # N-gram size
        # prefix tuple of (n-1) words -> Counter of observed next words
        self.ngrams = defaultdict(Counter)

    def preprocess_text(self, text):
        """Normalize ``text`` and return it as a list of word tokens.

        The text is lowercased and every character other than ``a-z``,
        ``0-9`` and whitespace is removed (not replaced by a space, so
        ``"Auto-complete"`` becomes ``"autocomplete"``) before splitting on
        whitespace.
        """
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove non-alphanumeric characters
        return text.split()

    def _context_key(self, tokens):
        """Return the last (n-1) tokens as a tuple usable as an ``ngrams`` key."""
        size = self.n - 1
        # tokens[-0:] is the whole list, so the unigram (size 0) case must be
        # handled explicitly to yield the empty prefix used by train().
        return tuple(tokens[-size:]) if size > 0 else ()

    def train(self, corpus):
        """Count every n-gram in ``corpus`` and add it to the model.

        Counts accumulate, so calling this several times trains on the union
        of the corpora.  Punctuation is stripped during preprocessing, which
        means n-grams may span sentence boundaries.  A corpus with fewer than
        ``n`` tokens contributes nothing.
        """
        tokens = self.preprocess_text(corpus)
        for i in range(len(tokens) - self.n + 1):
            prefix = tuple(tokens[i:i + self.n - 1])  # First (n-1) tokens
            next_word = tokens[i + self.n - 1]  # nth token
            self.ngrams[prefix][next_word] += 1  # Increment frequency

    def predict_next(self, context, top_k=3):
        """Return up to ``top_k`` most frequent next words for ``context``.

        Args:
            context: Free text; it is preprocessed and only its last
                ``n - 1`` words are used.
            top_k: Maximum number of candidates to return.

        Returns:
            A list of words ordered from most to least frequent, or an empty
            list when the context was never seen during training.
        """
        key = self._context_key(self.preprocess_text(context))
        # Membership test rather than indexing: indexing the defaultdict
        # would insert an empty Counter for every unseen context.
        if key not in self.ngrams:
            return []  # No prediction available for unseen context
        return [word for word, _ in self.ngrams[key].most_common(top_k)]

    def generate_text(self, start_text, max_words=20):
        """Greedily extend ``start_text`` by at most ``max_words`` words.

        At each step the single most frequent continuation of the last
        ``n - 1`` words is appended; generation stops early as soon as the
        current context has no known continuation.

        Returns:
            The preprocessed start text plus the generated words, joined by
            single spaces.
        """
        generated = self.preprocess_text(start_text)
        for _ in range(max_words):
            context = self._context_key(generated)
            next_word = self.predict_next(' '.join(context), top_k=1)
            if not next_word:
                break  # Stop if no next word is predicted
            generated.append(next_word[0])
        return ' '.join(generated)

# Example Usage
if __name__ == "__main__":
    # Demo inputs: a context to complete and a seed phrase to extend.
    context = "machine learning is"
    start_text = "machine learning"

    # Small three-sentence corpus the demo model is fitted on.
    corpus = """
    Machine learning is a field of artificial intelligence that uses algorithms to learn from data.
    Natural language processing is a subfield of machine learning.
    Auto-complete systems rely on predictive models.
    """

    # Build a trigram model (two words of context per prediction) and fit it.
    model = NgramAutocomplete(n=3)
    model.train(corpus)

    # Query the model: ranked next-word candidates, then a greedy
    # continuation of the seed phrase capped at 10 extra words.
    predictions = model.predict_next(context)
    generated_text = model.generate_text(start_text, max_words=10)

    # Report both results.
    print(f"Predictions for '{context}': {predictions}")
    print(f"Generated text: {generated_text}")
