# CS6320 - NLP - Assignment 1 - Group 14
1. Sai Vikas Thiruveedula (SXT230026)
2. Vijay Sai Dukkipati (DXV220040)
3. Manikanta Sai Kommireddy (MXK220132)

## Importing the Dataset

In [8]:
import requests
# Download the training and validation splits into the working directory.
# Both files live under the same repository folder, so only the file name varies.
DATASET_BASE_URL = "https://raw.githubusercontent.com/saivikas10/Assignment/refs/heads/main/A1_DATASET"

for filename in ('train.txt', 'val.txt'):
    url = f"{DATASET_BASE_URL}/{filename}"
    # requests has no default timeout; without one a stalled connection hangs forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
    response.raise_for_status()

    # Written in text mode with the default encoding, matching how the files
    # are opened for reading later in this notebook.
    with open(filename, 'w') as file:
        file.write(response.text)

## Import Libraries and Define Helper Functions

In [9]:
import re
import math
from collections import defaultdict

# Global dictionary for storing the results and to display at last
results = {}

#Perplexity calculation function
def calculate_perplexity_bigram(train_prob, val_bigram, train_log_prob):
    """Return the perplexity of a bigram sequence under a log-probability table.

    Args:
        train_prob: Bigram probability table. Accepted by the signature but
            never read by this function.
        val_bigram: Iterable of (first_word, second_word) tuples to score.
        train_log_prob: Mapping from bigram tuple to its natural-log probability.

    Returns:
        exp(-(1/N) * sum of log-probabilities), where N is the number of
        bigrams in ``val_bigram`` (duplicates included).

    Raises:
        ZeroDivisionError: If ``val_bigram`` is empty.

    Note:
        A bigram missing from ``train_log_prob`` is scored with the
        ('unk', 'unk') entry. If that entry is missing as well, a
        log-probability of 0 (i.e. probability 1) is used.
    """
    # Tally each distinct bigram so every table lookup happens once per type.
    occurrences = {}
    for pair in val_bigram:
        occurrences[pair] = occurrences.get(pair, 0) + 1
    total_bigrams = sum(occurrences.values())

    neg_log_likelihood = 0
    for pair, times in occurrences.items():
        if pair in train_log_prob:
            pair_log_prob = train_log_prob[pair]
        else:
            # Unknown bigram: fall back to the ('unk', 'unk') entry, else 0.
            pair_log_prob = train_log_prob.get(('unk', 'unk'), 0)
        neg_log_likelihood += (-1) * pair_log_prob * times

    return math.exp(neg_log_likelihood / total_bigrams)

#Preprocessing
def preprocess_and_get_bigrams(text):
    """Split raw text into lowercase word tokens and adjacent-token bigrams.

    Args:
        text: The raw text to tokenize.

    Returns:
        A ``(tokens, bigrams)`` pair: ``tokens`` is the list of lowercase
        tokens and ``bigrams`` is the list of ``(tokens[i], tokens[i+1])``
        tuples, empty when there are fewer than two tokens.
    """
    # Every character matching \W (not a letter, digit or underscore) becomes
    # a space, so punctuation separates tokens instead of sticking to them.
    cleaned = re.sub(r'\W', ' ', text).lower()
    tokens = cleaned.split()
    # Pair each token with its successor.
    bigrams = list(zip(tokens, tokens[1:]))
    return tokens, bigrams

# Status message marking the end of the imports/helper-definition cell.
print("Libraries imported and helper functions defined.\n")

Libraries imported and helper functions defined.



## Reading and Preprocessing Training and Validation Data

In [10]:
def read_and_preprocess_data(train_file, val_file):
    """Load both corpora, tokenize them, and print a short summary.

    Args:
        train_file: Path of the training text file.
        val_file: Path of the validation text file.

    Returns:
        ``(train_tokens, val_tokens, train_bigrams, val_bigrams)`` as produced
        by ``preprocess_and_get_bigrams`` for each file.
    """
    def _load(path, preview_heading):
        # Read the whole file, echo its first 200 characters, then tokenize it.
        with open(path, 'r') as handle:
            raw_text = handle.read()
        print(preview_heading, raw_text[:200], "\n")
        return preprocess_and_get_bigrams(raw_text)

    def _show_first_tokens(tokens):
        # One token per line, first ten only.
        for token in tokens[:10]:
            print(token)

    train_tokens, train_bigrams = _load(train_file, "First 200 characters of Training Data:\n")
    val_tokens, val_bigrams = _load(val_file, "First 200 characters of Validation Data:\n")

    # Sample of the tokenized data.
    print("Training Data Tokens (Top 10):")
    _show_first_tokens(train_tokens)
    print("\nValidation Data Tokens (Top 10):")
    _show_first_tokens(val_tokens)

    # Corpus sizes and training vocabulary size.
    print(f"Total number of tokens in Training Data: {len(train_tokens)}")
    print(f"Total number of tokens in Validation Data: {len(val_tokens)}")

    print(f"Number of unique tokens in Training Data: {len(set(train_tokens))}")

    return train_tokens, val_tokens, train_bigrams, val_bigrams

# Load and tokenize both files; the four results become notebook-level
# globals that the modelling cells below read.
train_tokens, val_tokens, train_bigrams, val_bigrams = read_and_preprocess_data('train.txt', 'val.txt')
print("Training and validation data have been preprocessed and tokens extracted.\n")

First 200 characters of Training Data:
 I booked two rooms four months in advance at the Talbott . We were placed on the top floor next to the elevators , which are used all night long . When speaking to the front desk , I was told that the 

First 200 characters of Validation Data:
 I stayed for four nights while attending a conference . The hotel is in a great spot - easy walk to Michigan Ave shopping or Rush St. , but just off the busy streets . The room I had was spacious , an 

Training Data Tokens (Top 10):
i
booked
two
rooms
four
months
in
advance
at
the

Validation Data Tokens (Top 10):
i
stayed
for
four
nights
while
attending
a
conference
the
Total number of tokens in Training Data: 80300
Total number of tokens in Validation Data: 8835
Number of unique tokens in Training Data: 5962
Training and validation data have been preprocessed and tokens extracted.



## Calculating Unsmoothed Bigram Probabilities and Log Probabilities

In [11]:
def unsmoothed_bigram_probabilities(train_tokens, train_bigrams):
    """Estimate maximum-likelihood bigram probabilities and print a preview.

    Each bigram (w1, w2) gets probability count(w1, w2) / count(w1).

    Args:
        train_tokens: Sequence of training tokens, used for unigram counts.
        train_bigrams: Sequence of (w1, w2) tuples, used for bigram counts.

    Returns:
        ``(bigram_prob, bigram_log_prob, unigram_count, bigram_count)``, each a
        ``defaultdict(int)``. Probabilities and natural-log probabilities are
        keyed by bigram tuple.

    Raises:
        ZeroDivisionError: If a bigram's first word never occurs in
            ``train_tokens``.
    """
    def _tally(items):
        # Frequency table in first-seen order.
        counts = defaultdict(int)
        for item in items:
            counts[item] += 1
        return counts

    def _show_first_15(heading, table, spec=""):
        # Print the heading followed by the first 15 entries of the table.
        print(heading)
        for key, value in list(table.items())[:15]:
            print(f"{key}: {value:{spec}}")

    print("\n--- Calculating Unsmoothed Bigram Probabilities ---")

    unigram_count = _tally(train_tokens)
    bigram_count = _tally(train_bigrams)

    # Relative frequency of the bigram given its first word.
    bigram_prob = defaultdict(int)
    bigram_prob.update(
        (pair, seen / unigram_count[pair[0]]) for pair, seen in bigram_count.items()
    )

    _show_first_15("Bigram Counts (Top 15):", bigram_count)
    _show_first_15("\nUnsmoothed Bigram Probabilities (Top 15):", bigram_prob, ".6f")

    bigram_log_prob = defaultdict(int)
    bigram_log_prob.update((pair, math.log(p)) for pair, p in bigram_prob.items())

    _show_first_15("\nLog Probabilities (Top 15):", bigram_log_prob, ".6f")

    return bigram_prob, bigram_log_prob, unigram_count, bigram_count

# Build the unsmoothed model once. unigram_count and bigram_count are kept as
# notebook-level globals and reused by the smoothing cells below.
bigram_prob, bigram_log_prob, unigram_count, bigram_count = unsmoothed_bigram_probabilities(train_tokens, train_bigrams)
print("Unsmoothed Bigram Probabilities and Log Probabilities calculated.\n")


# Perplexity of the unsmoothed model on the *training* bigrams. Every bigram
# scored here has an entry in bigram_log_prob, so the unknown-bigram fallback
# in calculate_perplexity_bigram is never taken.
perplexity_unsmoothed = calculate_perplexity_bigram(bigram_prob, train_bigrams, bigram_log_prob)
results['Perplexity (Training Dataset Without Any Smoothing)'] = perplexity_unsmoothed
print(f"Perplexity (Training Dataset Without Any Smoothing): {perplexity_unsmoothed}\n")


--- Calculating Unsmoothed Bigram Probabilities ---
Bigram Counts (Top 15):
('i', 'booked'): 21
('booked', 'two'): 1
('two', 'rooms'): 3
('rooms', 'four'): 1
('four', 'months'): 1
('months', 'in'): 2
('in', 'advance'): 7
('advance', 'at'): 1
('at', 'the'): 335
('the', 'talbott'): 26
('talbott', 'we'): 2
('we', 'were'): 179
('were', 'placed'): 1
('placed', 'on'): 2
('on', 'the'): 234

Unsmoothed Bigram Probabilities (Top 15):
('i', 'booked'): 0.012195
('booked', 'two'): 0.011628
('two', 'rooms'): 0.023256
('rooms', 'four'): 0.004926
('four', 'months'): 0.047619
('months', 'in'): 0.250000
('in', 'advance'): 0.005311
('advance', 'at'): 0.142857
('at', 'the'): 0.449664
('the', 'talbott'): 0.004892
('talbott', 'we'): 0.071429
('we', 'were'): 0.160251
('were', 'placed'): 0.001730
('placed', 'on'): 0.250000
('on', 'the'): 0.363354

Log Probabilities (Top 15):
('i', 'booked'): -4.406719
('booked', 'two'): -4.454347
('two', 'rooms'): -3.761200
('rooms', 'four'): -5.313206
('four', 'months'): -

## Perplexity Calculations for Laplace Smoothing and Add-k Smoothing (k = 0.5 and k = 3)

## Smoothing and Perplexity Calculations Ignoring Unknown Words

In [12]:
# Laplace (add-one) smoothing on the raw training counts:
#   P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + V),  V = len(unigram_count)
# Only bigrams that occur in training get a table entry.
# NOTE(review): validation bigrams absent from the table are scored by
# calculate_perplexity_bigram with the ('unk', 'unk') entry, or with
# log-probability 0 when that entry does not exist. No 'unk' substitution is
# done in this cell, so unseen bigrams presumably get probability 1, which
# lowers the reported perplexity -- confirm this is intended.
print("\n--- Adding Ignoring Unknown Words ---")
laplace_bigram_prob, laplace_bigram_log_prob = defaultdict(int), defaultdict(int)
for bigram in bigram_count:
    laplace_bigram_prob[bigram] = (bigram_count[bigram] + 1) / (unigram_count[bigram[0]] + len(unigram_count))
    laplace_bigram_log_prob[bigram] = math.log(laplace_bigram_prob[bigram])

# The first argument (the probability table) is not read by the function;
# only the log-probability table is used for scoring.
perplexity_laplace = calculate_perplexity_bigram(laplace_bigram_prob, val_bigrams, laplace_bigram_log_prob)
results['Perplexity (Laplace Smoothing)'] = perplexity_laplace
print(f"Perplexity (Laplace Smoothing - Ignoring Unknown Words): {perplexity_laplace}")

# Add-k Smoothing where k = 0.5:
#   P(w2 | w1) = (count(w1, w2) + k) / (count(w1) + k * V)
k = 0.5
k_bigram_prob, k_bigram_log_prob = defaultdict(int), defaultdict(int)
for bigram in bigram_count:
    k_bigram_prob[bigram] = (bigram_count[bigram] + k) / (unigram_count[bigram[0]] + (k * len(unigram_count)))
    k_bigram_log_prob[bigram] = math.log(k_bigram_prob[bigram])

perplexity_add_k_0_5 = calculate_perplexity_bigram(k_bigram_prob, val_bigrams, k_bigram_log_prob)
results['Perplexity (Add-k Smoothing k=0.5)'] = perplexity_add_k_0_5
print(f"Perplexity (Add-k Smoothing, k=0.5 - Ignoring Unknown Words): {perplexity_add_k_0_5}")

# Add-k Smoothing where k = 3 (same formula). The k = 0.5 tables are rebound
# here, so only the k = 3 tables remain after this cell.
k = 3
k_bigram_prob, k_bigram_log_prob = defaultdict(int), defaultdict(int)
for bigram in bigram_count:
    k_bigram_prob[bigram] = (bigram_count[bigram] + k) / (unigram_count[bigram[0]] + (k * len(unigram_count)))
    k_bigram_log_prob[bigram] = math.log(k_bigram_prob[bigram])

perplexity_add_k_3 = calculate_perplexity_bigram(k_bigram_prob, val_bigrams, k_bigram_log_prob)
results['Perplexity (Add-k Smoothing k=3)'] = perplexity_add_k_3
print(f"Perplexity (Add-k Smoothing, k=3 - Ignoring Unknown Words): {perplexity_add_k_3}\n")


--- Adding Ignoring Unknown Words ---
Perplexity (Laplace Smoothing - Ignoring Unknown Words): 60.17091971583495
Perplexity (Add-k Smoothing, k=0.5 - Ignoring Unknown Words): 43.7558536740361
Perplexity (Add-k Smoothing, k=3 - Ignoring Unknown Words): 97.7601456201918



## Smoothing and Perplexity Calculations Handling Unknown Words

In [13]:
# Unigram counts of the raw training tokens, plus an explicit 'unk' entry
# whose count is forced to zero (this also grows the vocabulary size by one).
print("\n--- Adding Unknown Word Tag with Zero Frequency ---")
lap_uni_count = defaultdict(int)
for word in train_tokens:
    lap_uni_count[word] += 1
lap_uni_count['unk'] = 0

# Replace words seen exactly once in training with 'unk' and rebuild the bigrams.
# NOTE(review): lap_uni_count is NOT recomputed after this substitution, so
# bigrams starting with 'unk' use a history count of 0 in the denominators
# below, and len(lap_uni_count) still includes the replaced rare words.
# Confirm this matches the intended "zero frequency" setup.
new_train_tokens = ['unk' if lap_uni_count[word] == 1 else word for word in train_tokens]
new_train_bigrams = [(new_train_tokens[i], new_train_tokens[i + 1]) for i in range(len(new_train_tokens) - 1)]

# Recomputing bigram counts with 'unk' handling
lap_bi_count = defaultdict(int)
for bigram in new_train_bigrams:
    lap_bi_count[bigram] += 1

# Laplace smoothing: (count(w1, w2) + 1) / (count(w1) + V), V = len(lap_uni_count)
lap_bi_train_prob, lap_bi_train_log_prob = defaultdict(int), defaultdict(int)
for bigram in lap_bi_count:
    lap_bi_train_prob[bigram] = (lap_bi_count[bigram] + 1) / (lap_uni_count[bigram[0]] + len(lap_uni_count))
    lap_bi_train_log_prob[bigram] = math.log(lap_bi_train_prob[bigram])

# Perplexity calculation with Laplace Smoothing for 'unk' handling.
# val_bigrams is passed unchanged; validation bigrams missing from the table
# are scored with the ('unk', 'unk') entry inside calculate_perplexity_bigram.
perplexity_laplace_unk = calculate_perplexity_bigram(lap_bi_train_prob, val_bigrams, lap_bi_train_log_prob)
results['Perplexity (Laplace Smoothing - Adding unk with Zero Frequency)'] = perplexity_laplace_unk
print(f"Perplexity (Laplace Smoothing - Adding 'unk' with Zero Frequency): {perplexity_laplace_unk}")

# Add-k Smoothing with 'unk' handling, k = 0.5:
#   (count(w1, w2) + k) / (count(w1) + k * V)
k = 0.5
k_bi_train_prob = defaultdict(int)
for bigram in lap_bi_count:
    k_bi_train_prob[bigram] = (lap_bi_count[bigram] + k) / (lap_uni_count[bigram[0]] + (k * len(lap_uni_count)))

k_bi_train_log_prob = defaultdict(int)
for bigram in k_bi_train_prob:
    k_bi_train_log_prob[bigram] = math.log(k_bi_train_prob[bigram])

# Perplexity for Add-k Smoothing with 'unk' handling, k = 0.5
perplexity_add_k_0_5_unk = calculate_perplexity_bigram(k_bi_train_prob, val_bigrams, k_bi_train_log_prob)
results['Perplexity (Add-k Smoothing, k=0.5 - Adding unk with Zero Frequency)'] = perplexity_add_k_0_5_unk
print(f"Perplexity (Add-k Smoothing, k=0.5 - Adding 'unk' with Zero Frequency): {perplexity_add_k_0_5_unk}")

# Same computation with k = 3; the k = 0.5 tables are rebound and discarded.
k = 3
k_bi_train_prob = defaultdict(int)
for bigram in lap_bi_count:
    k_bi_train_prob[bigram] = (lap_bi_count[bigram] + k) / (lap_uni_count[bigram[0]] + (k * len(lap_uni_count)))

k_bi_train_log_prob = defaultdict(int)
for bigram in k_bi_train_prob:
    k_bi_train_log_prob[bigram] = math.log(k_bi_train_prob[bigram])

# Perplexity for Add-k Smoothing with 'unk' handling, k = 3
perplexity_add_k_3_unk = calculate_perplexity_bigram(k_bi_train_prob, val_bigrams, k_bi_train_log_prob)
results['Perplexity (Add-k Smoothing, k=3 - Adding unk with Zero Frequency)'] = perplexity_add_k_3_unk
print(f"Perplexity (Add-k Smoothing, k=3 - Adding 'unk' with Zero Frequency): {perplexity_add_k_3_unk}\n")


--- Adding Unknown Word Tag with Zero Frequency ---
Perplexity (Laplace Smoothing - Adding 'unk' with Zero Frequency): 193.1745009886092
Perplexity (Add-k Smoothing, k=0.5 - Adding 'unk' with Zero Frequency): 110.32072881036434
Perplexity (Add-k Smoothing, k=3 - Adding 'unk' with Zero Frequency): 459.7445797802327



## Smoothing and Perplexity Calculations With Rare Words as Unknown

In [14]:
# Replacing Rare Words with Unknown Tag
# Unlike the previous cell, the unigram counts are recomputed after the
# substitution, so 'unk' carries the combined count of all replaced words.
print("\n--- Replacing Rare Words with Unknown Tag ---")
# Words seen exactly once in training (lap_uni_count comes from the previous cell).
rare_words = [word for word in lap_uni_count if lap_uni_count[word] == 1]
# Membership is tested once per training token; a set keeps each test O(1)
# instead of scanning the whole rare-word list every time.
rare_word_set = set(rare_words)
new_train_tokens = [('unk' if token in rare_word_set else token) for token in train_tokens]
new_train_bigrams = [(new_train_tokens[i], new_train_tokens[i + 1]) for i in range(len(new_train_tokens) - 1)]

# Recompute counts for training data with 'unk' handling
new_unigram_count = defaultdict(int)
for token in new_train_tokens:
    new_unigram_count[token] += 1

new_bigram_count = defaultdict(int)
for bigram in new_train_bigrams:
    new_bigram_count[bigram] += 1

# Laplace smoothing on the 'unk'-mapped data:
#   (count(w1, w2) + 1) / (count(w1) + V),  V = len(new_unigram_count)
laplace_new_bigram_prob = defaultdict(int)
laplace_new_bigram_log_prob = defaultdict(int)
for bigram in new_bigram_count:
    laplace_new_bigram_prob[bigram] = (new_bigram_count[bigram] + 1) / (new_unigram_count[bigram[0]] + len(new_unigram_count))
    laplace_new_bigram_log_prob[bigram] = math.log(laplace_new_bigram_prob[bigram])

# Perplexity for Laplace Smoothing with Rare Words as 'unk'.
# val_bigrams is passed unchanged; validation bigrams missing from the table
# are scored with the ('unk', 'unk') entry inside calculate_perplexity_bigram.
perplexity_laplace_rare = calculate_perplexity_bigram(laplace_new_bigram_prob, val_bigrams, laplace_new_bigram_log_prob)
results['Perplexity (Laplace Smoothing with Rare Words as Unknown)'] = perplexity_laplace_rare
print(f"Perplexity (Laplace Smoothing with Rare Words as Unknown): {perplexity_laplace_rare}")

# Add-k Smoothing with 'unk' handling for rare words and k = 0.5:
#   (count(w1, w2) + k) / (count(w1) + k * V)
k = 0.5
k_rare_bigram_prob = defaultdict(int)  # bigram -> smoothed probability
k_rare_bigram_log_prob = defaultdict(int)  # bigram -> natural-log probability

for bigram in new_bigram_count:
    k_rare_bigram_prob[bigram] = (new_bigram_count[bigram] + k) / (new_unigram_count[bigram[0]] + (k * len(new_unigram_count)))
    k_rare_bigram_log_prob[bigram] = math.log(k_rare_bigram_prob[bigram])

# Perplexity for Add-k Smoothing (k = 0.5) with Rare Words as 'unk'
perplexity_add_k_0_5_rare = calculate_perplexity_bigram(k_rare_bigram_prob, val_bigrams, k_rare_bigram_log_prob)
results['Perplexity (Add-k Smoothing, k=0.5 - Rare Words as Unknown)'] = perplexity_add_k_0_5_rare
print(f"Perplexity (Add-k Smoothing, k=0.5 - Rare Words as Unknown): {perplexity_add_k_0_5_rare}")

# Add-k Smoothing with 'unk' handling for rare words and k = 3.
# The k = 0.5 tables are rebound here and discarded.
k = 3
k_rare_bigram_prob = defaultdict(int)  # bigram -> smoothed probability
k_rare_bigram_log_prob = defaultdict(int)  # bigram -> natural-log probability

for bigram in new_bigram_count:
    k_rare_bigram_prob[bigram] = (new_bigram_count[bigram] + k) / (new_unigram_count[bigram[0]] + (k * len(new_unigram_count)))
    k_rare_bigram_log_prob[bigram] = math.log(k_rare_bigram_prob[bigram])

# Perplexity for Add-k Smoothing (k = 3) with Rare Words as 'unk'
perplexity_add_k_3_rare = calculate_perplexity_bigram(k_rare_bigram_prob, val_bigrams, k_rare_bigram_log_prob)
results['Perplexity (Add-k Smoothing, k=3 - Rare Words as Unknown)'] = perplexity_add_k_3_rare
print(f"Perplexity (Add-k Smoothing, k=3 - Rare Words as Unknown): {perplexity_add_k_3_rare}")


--- Replacing Rare Words with Unknown Tag ---
Perplexity (Laplace Smoothing with Rare Words as Unknown): 135.37235609080957
Perplexity (Add-k Smoothing, k=0.5 - Rare Words as Unknown): 91.95803128753613
Perplexity (Add-k Smoothing, k=3 - Rare Words as Unknown): 270.69792197096535


## Final Step - Display All Results

In [15]:
# Displaying all the results stored in the results dictionary.
# Entries print in insertion order, i.e. the order the cells above stored them.
print("Final Step: Displaying results for all cases (Laplace, Add-k=0.5, Add-k=3, No Smoothing)")
print("-------------------------")
for key, value in results.items():
    print(f"{key}: {value}")

print("\nAll results have been displayed.")


Final Step: Displaying results for all cases (Laplace, Add-k=0.5, Add-k=3, No Smoothing)
-------------------------
Perplexity (Training Dataset Without Any Smoothing): 30.21189738227987
Perplexity (Laplace Smoothing): 60.17091971583495
Perplexity (Add-k Smoothing k=0.5): 43.7558536740361
Perplexity (Add-k Smoothing k=3): 97.7601456201918
Perplexity (Laplace Smoothing - Adding unk with Zero Frequency): 193.1745009886092
Perplexity (Add-k Smoothing, k=0.5 - Adding unk with Zero Frequency): 110.32072881036434
Perplexity (Add-k Smoothing, k=3 - Adding unk with Zero Frequency): 459.7445797802327
Perplexity (Laplace Smoothing with Rare Words as Unknown): 135.37235609080957
Perplexity (Add-k Smoothing, k=0.5 - Rare Words as Unknown): 91.95803128753613
Perplexity (Add-k Smoothing, k=3 - Rare Words as Unknown): 270.69792197096535

All results have been displayed.
