In [2]:
# import the libraries
import random
from collections import defaultdict
import re

In [3]:
class MarkovChainTextGenerator:
    """
    Word-level Markov chain text generator.

    The model maps each n-gram (a tuple of n consecutive lowercase words) to
    the list of words observed immediately after it in the training text.
    Repeated observations are kept as duplicates in that list, so picking
    uniformly from it follows the observed frequencies.
    """

    def __init__(self, n=2):
        """
        Initialize the Markov Chain Text Generator.

        Parameters:
        n (int): Size of the n-gram (e.g., n=2 → bigram model).
                 Must be an integer >= 1.

        Raises:
        ValueError: If n is not a positive integer.

        The model is stored as:
            { (word1, ..., word_n): [next_possible_word1, next_possible_word2, ...] }
        """
        # n <= 0 would create empty-tuple keys in train() and make
        # output[-n:] in generate() select the wrong slice, so reject it early.
        if not isinstance(n, int) or n < 1:
            raise ValueError("n must be a positive integer.")

        self.n = n
        self.model = defaultdict(list)

    def preprocess(self, text):
        """
        Convert text to lowercase and extract clean word tokens.

        Note: tokens are runs of `\\w` characters (letters, digits, underscore),
        so punctuation is dropped and contractions are split
        (e.g. "don't" → "don", "t").

        Parameters:
        text (str): Raw input text.

        Returns:
        List of words.
        """
        # Lowercase entire text for consistency
        text = text.lower()

        # Extract only word characters (removes punctuation)
        tokens = re.findall(r'\b\w+\b', text)

        return tokens

    def train(self, text):
        """
        Build the Markov chain model from the given input text.

        - Breaks the text into n-grams (tuples)
        - For each n-gram, stores the word that follows it

        Calling train() again adds to the existing model; it is never reset.
        Text with n or fewer tokens contributes nothing.

        Parameters:
        text (str): Training text.
        """
        tokens = self.preprocess(text)

        # Stop n tokens before the end so every n-gram has a following word
        for i in range(len(tokens) - self.n):
            # Example: if n=2 → key = (tokens[i], tokens[i+1])
            key = tuple(tokens[i : i + self.n])

            # Next word after the n-gram
            next_word = tokens[i + self.n]

            # Store next word in the model (duplicates kept on purpose)
            self.model[key].append(next_word)

    def generate(self, max_words=50):
        """
        Generate new text using the trained Markov chain.

        Steps:
        1. Pick a random starting n-gram.
        2. Repeatedly lookup the next word from the model.
        3. Stop if no continuation exists or max_words reached.

        Parameters:
        max_words (int): Upper bound on the number of words returned.
                         Values <= 0 yield an empty string.

        Returns:
        A generated string of at most max_words space-separated words.

        Raises:
        ValueError: If the model has not been trained.
        """

        if not self.model:
            raise ValueError("Model is empty. Please train it using train(text).")

        # Nothing to emit for a non-positive budget
        if max_words <= 0:
            return ''

        # Randomly choose a starting n-gram
        start = random.choice(list(self.model.keys()))

        # Seed the output with the starting words, truncated so the result
        # never exceeds max_words when max_words < n
        output = list(start)[:max_words]

        # Generate remaining words
        while len(output) < max_words:
            # Take the last n words in output to form the current key
            key = tuple(output[-self.n:])

            # .get() avoids inserting an empty list for unseen keys
            next_words = self.model.get(key)

            # If no continuation found, stop generation
            if not next_words:
                break

            # Randomly choose the next word from candidates
            output.append(random.choice(next_words))

        # Convert list of words back to a string
        return ' '.join(output)


In [19]:
# Tiny training corpus. "the dog" and "the fox" are each followed by two
# different words, so generation has real branching points.
sample_text = """
    A quick brown fox jumps over the lazy dog. The dog barked and chased the fox.
    The fox ran into the forest and disappeared. The dog returned home tired but happy.
    """

In [20]:
# Create generator with bigram (n=2) model: each state is a pair of consecutive words
generator = MarkovChainTextGenerator(n=2)

In [21]:
# Train the model: fills generator.model in place from sample_text
# (train() returns nothing, so there is no value to display here)
generator.train(sample_text)

In [22]:
# Generate new text.
# Seed the global RNG first so a fresh "Restart & Run All" reproduces the same
# output; change or remove the seed to sample a different chain.
random.seed(42)
generated_text = generator.generate(max_words=30)

In [23]:
# Use a single f-string: passing two arguments to print() inserts the default
# separator (a space) after the newline, which indents the generated text.
print(f"Generated Text:\n{generated_text}")

Generated Text:
 the fox ran into the forest and disappeared the dog barked and chased the fox ran into the forest and disappeared the dog returned home tired but happy
