In [1]:
import random
import re

def preprocess_corpus(file_name):
    """Load a text corpus and return it as a flat list of word tokens.

    The text is lowercased, newlines are turned into spaces, every character
    that is neither a word character (``\\w``) nor whitespace is dropped, and
    the remainder is split on runs of whitespace.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The tokens in file order; an empty list for an empty file.
    """
    with open(file_name, 'r') as corpus:
        lowered = corpus.read().lower()

    # The file is already closed here; the rest is pure string processing.
    single_line = re.sub(r'\n', ' ', lowered)
    # Punctuation is deleted rather than replaced, so "isn't" becomes "isnt".
    without_punctuation = re.sub(r'[^\w\s]', '', single_line)
    return without_punctuation.split()

def _build_transitions(words, chain_length):
    """Map each run of ``chain_length`` consecutive words to its followers."""
    transitions = {}
    for i in range(len(words) - chain_length):
        state = tuple(words[i:i + chain_length])
        # Followers are kept with duplicates so that random.choice() is
        # weighted by how often each follower occurs in the corpus.
        transitions.setdefault(state, []).append(words[i + chain_length])
    return transitions


def generate_markov_chain(start_words, output_length, chain_length, file_name):
    """Generate text from a word-level Markov chain built from a corpus file.

    The corpus is tokenized with ``preprocess_corpus``, a table mapping every
    ``chain_length``-word sequence to the words that follow it is built and
    printed, and the output is then grown one word at a time from
    ``start_words``.

    At each step the last ``chain_length`` words of the output are looked up
    in the table. A follower equal to the most recent word is never chosen,
    so the model itself does not emit the same word twice in a row. If there
    is no usable follower, a random corpus word is appended instead and the
    lookup is tried again on the next step, so generation returns to the
    model as soon as the trailing words form a known sequence.

    Args:
        start_words: Sequence of words that opens the output (tuple or list).
        output_length: Target number of words in the result. If
            ``start_words`` is already at least this long it is returned
            unchanged.
        chain_length: Number of preceding words used to predict the next one.
        file_name: Path of the corpus file.

    Returns:
        The generated words joined by single spaces. The result can be
        shorter than ``output_length`` only when the corpus contains no words.
    """
    words = preprocess_corpus(file_name)
    word_pairs = _build_transitions(words, chain_length)

    # Print the constructed dictionary
    print("Word pairs dictionary:")
    for key, value in word_pairs.items():
        print(key, "->", value)
    print()

    # list() accepts any sequence, so a list of start words works as well as
    # a tuple; the lookup key is rebuilt as a tuple on every step.
    chain = list(start_words)
    while len(chain) < output_length:
        # Derive the state from the tail of the output so that it always has
        # the key length, whatever the length of start_words was.
        state = tuple(chain[-chain_length:]) if chain_length > 0 else ()
        last_word = chain[-1] if chain else None
        candidates = [
            word for word in word_pairs.get(state, ()) if word != last_word
        ]
        if not candidates:
            if not words:
                # Empty corpus: nothing to draw from, return what we have.
                break
            # Dead end: fall back to any corpus word, then keep going.
            candidates = words
        chain.append(random.choice(candidates))

    return ' '.join(chain)

# Example usage: build a second-order chain from corpus.txt and print a
# ten-word sentence that starts with "programming is".
file_name = "corpus.txt"  # Replace with your file name
chain_length = 2
output_length = 10
start_words = ("programming", "is")  # Must contain chain_length words
generated_sentence = generate_markov_chain(
    start_words=start_words,
    output_length=output_length,
    chain_length=chain_length,
    file_name=file_name,
)
print(generated_sentence)



Word pairs dictionary:
('i', 'love') -> ['programming']
('love', 'programming') -> ['programming']
('programming', 'programming') -> ['is']
('programming', 'is') -> ['fun']
('is', 'fun') -> ['programming']
('fun', 'programming') -> ['makes']
('programming', 'makes') -> ['me']
('makes', 'me') -> ['happy']

programming is fun programming makes me happy fun fun programming
