<a href="https://colab.research.google.com/github/rishisai0811/NLP_ASSIGNMENT/blob/main/NLP_Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import re
import nltk
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# Function to tokenize the text into words
def tokenize_text(text):
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

# Function to generate unigram, bigram, and trigram models
def generate_ngram_models(text):
    tokens = tokenize_text(text)
    unigram_model = Counter(tokens)
    bigram_model = Counter(bigrams(tokens))
    trigram_model = Counter(trigrams(tokens))

    # Calculate probabilities for each model
    total_unigrams = sum(unigram_model.values())
    total_bigrams = sum(bigram_model.values())
    total_trigrams = sum(trigram_model.values())

    unigram_probabilities = {word: count / total_unigrams for word, count in unigram_model.items()}
    bigram_probabilities = {(word1, word2): count / unigram_model[word1] for (word1, word2), count in bigram_model.items()}
    trigram_probabilities = {(word1, word2, word3): count / bigram_model[(word1, word2)] for (word1, word2, word3), count in trigram_model.items()}

    return unigram_probabilities, bigram_probabilities, trigram_probabilities

# Sample usage
with open('/content/Fahrenheit 451 Full Text .txt', 'r') as file:
    text = file.read()

unigram_probs, bigram_probs, trigram_probs = generate_ngram_models(text)


In [20]:
import random

# Function to generate text using n-gram models
def generate_text(seed_word, unigram_probs, bigram_probs, trigram_probs, num_sentences=5):
    current_word = seed_word
    generated_text = [current_word.capitalize()]

    while len(generated_text) < num_sentences:
        next_word = None

        # Choose next word based on models
        if (current_word,) in unigram_probs:
            next_word = random.choices(list(unigram_probs.keys()), weights=list(unigram_probs.values()))[0]
        elif (current_word,) in bigram_probs:
            next_word = random.choices(list(bigram_probs[(current_word,)].keys()), weights=list(bigram_probs[(current_word,)].values()))[0]
        elif (current_word,) in trigram_probs:
            next_word = random.choices(list(trigram_probs[(current_word,)].keys()), weights=list(trigram_probs[(current_word,)].values()))[0]

        # Handle the case when next_word is None
        if next_word is None:
            next_word = random.choices(list(unigram_probs.keys()), weights=list(unigram_probs.values()))[0]

        # Update current word for the next iteration
        current_word = next_word
        generated_text.append(current_word)

        # If a sentence-ending punctuation is encountered, start a new sentence
        if current_word.endswith(('.', '!', '?')):
            generated_text[-1] = generated_text[-1][:-1]  # Remove the punctuation from the last word
            generated_text.append(random.choice(['', '', 'However', 'Meanwhile', 'In addition', 'Moreover']).capitalize())  # Choose a new sentence starter

    return ' '.join(generated_text) + '.'

# Sample usage
seed_word = 'The'
generated_paragraph = generate_text(seed_word, unigram_probs, bigram_probs, trigram_probs, num_sentences=100)
print(generated_paragraph)

The ear need butter stopped close what his must fill illuminated word might to of looked confuse of help furnace men thing t s furled t crackle up from ago bowling vessel did it the i in turn of t the pages sat don life foot the as hear yammering again notions in her game hand me now it he he out of as face they the sat lonely alone performed the my world combination you go as them s steadily with have if burning belief bones once to like stretched so faced the city the rushed chinese with mosquito.


In [19]:
import math

# Function to calculate perplexity for a given model
def calculate_perplexity(model_probs, tokens):
    entropy = 0
    num_tokens = len(tokens)

    for ngram in model_probs:
        if ngram in model_probs:
            prob = model_probs[ngram]
        else:
            prob = 1 / num_tokens  # If unseen n-gram, assign a small probability

        entropy += -math.log(prob, 2)

    perplexity = 2 ** (entropy / num_tokens)
    return perplexity

# Sample usage
tokens = tokenize_text(text)
unigram_perplexity = calculate_perplexity(unigram_probs, tokens)
bigram_perplexity = calculate_perplexity(bigram_probs, tokens)
trigram_perplexity = calculate_perplexity(trigram_probs, tokens)

print(f'Unigram Perplexity: {unigram_perplexity}')
print(f'Bigram Perplexity: {bigram_perplexity}')
print(f'Trigram Perplexity: {trigram_perplexity}')

Unigram Perplexity: 2.852767714448319
Bigram Perplexity: 7.183178993170789
Trigram Perplexity: 2.195084103640733


In [14]:
# Function to add Laplace smoothing to n-gram models
def add_laplace_smoothing(ngram_model, tokens, alpha=1):
    vocabulary_size = len(set(tokens))
    total_ngrams = len(tokens) - len(ngram_model) + 1

    smoothed_model = defaultdict(lambda: 1 / (total_ngrams + alpha * vocabulary_size))
    for ngram in ngram_model:
        smoothed_model[ngram] = (ngram_model[ngram] + alpha) / (tokens.count(ngram[0]) + alpha * vocabulary_size)

    return smoothed_model

# Applying Laplace smoothing to models
smoothed_unigram_probs = add_laplace_smoothing(unigram_model, tokens)
smoothed_bigram_probs = add_laplace_smoothing(bigram_model, tokens)
smoothed_trigram_probs = add_laplace_smoothing(trigram_model, tokens)

# Calculating perplexity again with smoothed models
smoothed_unigram_perplexity = calculate_perplexity(smoothed_unigram_probs, tokens)
smoothed_bigram_perplexity = calculate_perplexity(smoothed_bigram_probs, tokens)
smoothed_trigram_perplexity = calculate_perplexity(smoothed_trigram_probs, tokens)

print(f'Smoothed Unigram Perplexity: {smoothed_unigram_perplexity}')
print(f'Smoothed Bigram Perplexity: {smoothed_bigram_perplexity}')
print(f'Smoothed Trigram Perplexity: {smoothed_trigram_perplexity}')

Smoothed Unigram Perplexity: 1.0005210347833324
Smoothed Bigram Perplexity: 1.000342354483388
Smoothed Trigram Perplexity: 1.0001650918186362
