In [219]:
import re
from collections import Counter   

In [221]:
def read_file(filename):
    with open(filename, 'r') as file:
        text = file.read().lower() 
    return text

def preprocess_text(text):
    words = re.findall(r'\b\w+\b', text)
    return words

In [222]:
filename = "J. K. Rowling - Harry Potter 4 - The Goblet of Fire.txt"
text = read_file(filename)

corpus = preprocess_text(text)

In [None]:
def create_ngram_model(corpus, n):
  ngram_model = {}
  for i in range(len(corpus) - n + 1):  
    gram = tuple(corpus[i:i+n-1])  
    if gram not in ngram_model:
      ngram_model[gram] = Counter()
    next_word = corpus[i+n] if i + n < len(corpus) else None  
    if next_word:
        ngram_model[gram][next_word] += 1 
  for gram, word_counts in ngram_model.items():
    total_count = sum(word_counts.values())
    ngram_model[gram] = {word: count / total_count for word, count in word_counts.items()}
  return ngram_model

def generate_word(ngram_model, seed_gram):
 
  if seed_gram not in ngram_model:
    
    max_prob = 0
    most_probable_word = None
    for gram, word_probs in ngram_model.items():
      for word, prob in word_probs.items():
        if prob > max_prob:
          max_prob = prob
          most_probable_word = word
    return most_probable_word
  word_probs = ngram_model[seed_gram]
 
  return max(word_probs, key=word_probs.get)

def generate_sentence(ngram_model, seed_words, num_words):
 
  sentence = list(seed_words)
  for _ in range(num_words):
    current_gram = tuple(sentence[-n:]) 
    next_word = generate_word(ngram_model, current_gram)
    sentence.append(next_word)
  return sentence


In [223]:
#for unigram
model = create_ngram_model(corpus, 1)  
seed_words = ["he"] 
sentence = generate_sentence(model, seed_words, 100) 
print(" ".join(sentence))

he the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


In [224]:
#for bigram
model = create_ngram_model(corpus, 2)  
seed_words = ["he","had"] 
sentence = generate_sentence(model, seed_words, 100) 
print(" ".join(sentence))

he had rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling


In [225]:
#for trigram
model = create_ngram_model(corpus, 3)  
seed_words = ["he","had","awoken"] 
sentence = generate_sentence(model, seed_words, 100) 
print(" ".join(sentence))

he had awoken rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling rowling


In [226]:
#interpolation
def generate_word_interpolation(ngram_model, current_gram, weights):
    probabilities = {}
    for gram_size, weight in weights.items():
        gram = current_gram[-gram_size:]
        if gram in ngram_model:
            for word, prob in ngram_model[gram].items():
                probabilities[word] = probabilities.get(word, 0) + prob * weight
    
    if not probabilities:
        return None
    
    return max(probabilities, key=probabilities.get)

def generate_sentence_interpolation(ngram_model, seed_words, num_words, weights):
    sentence = list(seed_words)
    while len(sentence) < num_words:
        current_gram = tuple(sentence[-max(weights.keys()):]) 
        next_word = generate_word_interpolation(ngram_model, current_gram, weights)
        if next_word is None:
            break
        sentence.append(next_word)
    return sentence

model = create_ngram_model(corpus, 3)  
weights = {1: 0.1, 2: 0.4, 3: 0.5} 
seed_words = ["he", "had", "awoken"]
sentence = generate_sentence_interpolation(model, seed_words, 100, weights)
print(" ".join(sentence))


he had awoken his
