<a href="https://colab.research.google.com/github/sreevanimtcs2502/sreevanimtcs2502/blob/main/ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests
import re
import random
from collections import defaultdict, Counter

POE_URLS = [
    "https://www.gutenberg.org/files/2147/2147-0.txt",
    "https://www.gutenberg.org/files/2148/2148-0.txt",
    "https://www.gutenberg.org/files/2149/2149-0.txt"
]

def load_gutenberg_corpus(urls):
    texts = []
    for url in urls:
        response = requests.get(url)
        response.raise_for_status()
        texts.append(response.text)
    return "\n".join(texts)

def clean_gutenberg_text(text):
    start = re.search(r"\*\*\* START OF.*?\*\*\*", text, re.IGNORECASE)
    end = re.search(r"\*\*\* END OF.*?\*\*\*", text, re.IGNORECASE)
    if start and end:
        text = text[start.end():end.start()]
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def tokenize(text):
    return text.split()

class NGramLanguageModel:
    def __init__(self, n):
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.context_counts = Counter()
        self.vocab = set()

    def train(self, tokens):
        for i in range(len(tokens) - self.n + 1):
            context = tuple(tokens[i:i+self.n-1])
            target = tokens[i+self.n-1]
            self.ngram_counts[context][target] += 1
            self.context_counts[context] += 1
            self.vocab.add(target)

    def next_word_distribution(self, context):
        vocab_size = len(self.vocab)
        total = self.context_counts[context] + vocab_size
        words, probs = [], []
        for word in self.vocab:
            count = self.ngram_counts[context][word]
            words.append(word)
            probs.append((count + 1) / total)
        return words, probs

    def generate(self, seed_text, max_len=40):
        seed_tokens = seed_text.lower().split()
        if len(seed_tokens) < self.n - 1:
            raise ValueError("Seed text must contain at least 4 words")
        generated = seed_tokens[:]
        for _ in range(max_len):
            context = tuple(generated[-(self.n-1):])
            words, probs = self.next_word_distribution(context)
            generated.append(random.choices(words, probs)[0])
        return " ".join(generated)

raw_text = load_gutenberg_corpus(POE_URLS)
clean_text = clean_gutenberg_text(raw_text)
tokens = tokenize(clean_text)

model = NGramLanguageModel(n=5)
model.train(tokens)

samples = [
    "the day was very",
    "i felt a strange",
    "there was something about the "
]

for prompt in samples:
    print("Input :", prompt)
    print("Output:", model.generate(prompt))
    print()


Input : the day was very
Output: the day was very beamends absolutely reap demonstrated messieurs alternative locked topics inclined internally nom thronged wager repetition massade punch crowbar streaks burial alland profounder astonishing modes accomplished lui wormeaten volume two delicate arm touch gnarled delicate suspicious paralleling discharge orthographically bleeding snugly collation

Input : i felt a strange
Output: i felt a strange sense furnishing wicker wondering concerned enckes mercurie quondam bags artizan non dreaming trusted lanterns highway listen changeless seemingly survives expressive gown advanced enthusiasm wiser daybreak personalities sinks notorious carryingmore method resign solitudes antagonistic with sitting sulky jourdains odd revenue impunity

Input : there was something about the 
Output: there was something about the shrivelled identical mystic earnest hating editions comprehended quinaultatys rapid meeting treated ruffles skies herein room broke schem