In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict

# Fetch the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer models (needed by word_tokenize) and the stop-word lists.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def preprocess_text(text):
    """Tokenize ``text`` and return its lowercase alphanumeric tokens.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of tokens in their original order. Every token is
        lowercased, and any token that is not purely alphanumeric
        (per ``str.isalnum``) is discarded -- this removes punctuation
        tokens as well as tokens that mix letters with other characters.
    """
    # Lowercase lazily, then keep only the tokens made of letters/digits.
    lowered = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered if token.isalnum()]


# Read the whole corpus into a single string, turning line breaks into spaces
# so that the text is one continuous stream for the tokenizer.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (assumes the file is UTF-8 -- TODO confirm).
with open('/content/drive/MyDrive/datasets/nlp/ca1/Tarzan.txt', 'r', encoding='utf-8') as file:
    text_data = file.read().replace('\n', ' ')

# Lowercased, alphanumeric-only tokens used to train the n-gram models below.
tokens = preprocess_text(text_data)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [None]:
from collections import defaultdict
import random

class NGramModel:
    """Word-level n-gram text generator with back-off to shorter contexts.

    Attributes:
        n: Order of the model; each word is predicted from the ``n - 1``
            words before it.
        ngrams: Maps every full-length context (a tuple of ``n - 1`` words)
            to the list of words observed right after it. Duplicates are
            kept, so ``random.choice`` samples proportionally to frequency.
        contexts: Maps every full-length context to the list of complete
            n-gram slices that started with it. Not read by
            ``generate_text``; kept for inspection.
    """

    def __init__(self, n):
        """Create an untrained model of order ``n``.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer")
        self.n = n
        self.ngrams = defaultdict(list)
        self.contexts = defaultdict(list)
        # Followers of contexts shorter than n - 1 (lengths 1 .. n - 2).
        # Stored separately so `ngrams` only ever holds full-length contexts.
        self._backoff = defaultdict(list)

    def train(self, text):
        """Record every n-gram found in ``text``.

        Args:
            text: A sequence of tokens (supports ``len`` and slicing).
                Calling ``train`` again adds to the existing counts.
        """
        words = text
        order = self.n
        # `+ 1` so that the n-gram ending on the very last token is included.
        for i in range(len(words) - order + 1):
            context = tuple(words[i:i + order - 1])
            word = words[i + order - 1]
            self.ngrams[context].append(word)
            self.contexts[context].append(words[i:i + order])
            # Also remember the follower under each shorter suffix of the
            # context, so generation has something to back off to.
            for size in range(1, order - 1):
                self._backoff[context[-size:]].append(word)

    def _candidates(self, context):
        """Return the follower list for the longest known suffix of ``context``, or None."""
        while True:
            if context in self.ngrams:
                return self.ngrams[context]
            if context in self._backoff:
                return self._backoff[context]
            # `<= 1` (not `== 1`) so an empty context terminates as well.
            if len(context) <= 1:
                return None
            context = context[1:]  # Back off to shorter context

    def generate_text(self, length, start_context):
        """Generate text by repeatedly sampling a follower of the current context.

        Args:
            length: Maximum number of words to append to ``start_context``.
            start_context: Iterable of seed words; only the last ``n - 1``
                of them are used as the initial context.

        Returns:
            The seed words followed by the generated words, joined with
            single spaces. Generation stops early when neither the current
            context nor any shorter suffix of it (down to one word) was
            seen during training.
        """
        generated_text = list(start_context)
        history = self.n - 1

        for _ in range(length):
            # A unigram model has an empty context; `seq[-0:]` would instead
            # return the whole sequence, so handle that case explicitly.
            context = tuple(generated_text[-history:]) if history else ()
            candidates = self._candidates(context)
            if not candidates:
                break
            generated_text.append(random.choice(candidates))

        return ' '.join(generated_text)



In [None]:
# Bigram model: each word is predicted from exactly one preceding word,
# so every seed is a single token (n - 1 words). Padding the seed with an
# empty string only adds a leading space to the printed text.
n = 2
model = NGramModel(n)
model.train(tokens)

generated_text = model.generate_text(12, ('he',))  # Generate 12 words of text
print("Generated Text:", generated_text)
generated_text = model.generate_text(12, ('and',))  # Generate 12 words of text
print("Generated Text:", generated_text)

Generated Text:  he already he not know that does not necessarily keep zeyd approached blake
Generated Text:  and the village would do what do so often hunted north of blood


In [None]:
# Trigram model: each word is predicted from the two words before it,
# so every seed below supplies two tokens.
n = 3
model = NGramModel(n)
model.train(tokens)

for seed_words in (('trail', 'he'), ('back', 'and')):
    # Append up to 12 generated words to the seed and show the result.
    generated_text = model.generate_text(12, seed_words)
    print("Generated Text:", generated_text)

Generated Text: trail he did not return and was deflected from its scabbard he could not
Generated Text: back and usha tore through the black gave no heed either to time since


In [None]:
# 5-gram model: each word is predicted from the four words before it,
# so every seed below supplies four tokens.
n = 5
model = NGramModel(n)
model.train(tokens)

for seed_words in (('of', 'the', 'trail', 'he'), ('on', 'the', 'huge', 'back')):
    # Append up to 12 generated words to the seed and show the result.
    generated_text = model.generate_text(12, seed_words)
    print("Generated Text:", generated_text)

Generated Text: of the trail he took short cuts swinging through the branches of the trees a hundred
Generated Text: on the huge back listening to manu the monkey chattering and scolding among the trees then
