# CS6320 - NLP - Assignment 1 - Group 14
1. Sai Vikas Thiruveedula (SXT230026)
2. Vijay Sai Dukkipati (DXV220040)
3. Manikanta Sai Kommireddy (MXK220132)

##Importing the Dataset

In [13]:
import requests
url="https://raw.githubusercontent.com/saivikas10/Assignment/refs/heads/main/A1_DATASET/train.txt"
response = requests.get(url)


with open('train.txt', 'w') as file:
    file.write(response.text)

url="https://raw.githubusercontent.com/saivikas10/Assignment/refs/heads/main/A1_DATASET/val.txt"
response = requests.get(url)


with open('val.txt', 'w') as file:
    file.write(response.text)

##Import Libraries and Define Helper Functions

In [14]:
import re
import math
from collections import defaultdict, Counter

# Global dictionary for storing the results and to display at last
results = {}

#Perplexity calculation function
def calculate_perplexity(train_prob, val_token, train_log_prob):
    val_uni_count = defaultdict(int)
    val_word_count = 0
    for word in val_token:
        val_uni_count[word] += 1
        val_word_count += 1

    sum_log_prob = 0
    for word in val_uni_count:
        #condition for handling unknown words
        if word not in train_log_prob:
            word_log_prob = train_log_prob['unk']
        else:
            word_log_prob = train_log_prob[word]
        sum_log_prob += (-1) * word_log_prob * val_uni_count[word]

    perplexity = math.exp(sum_log_prob / val_word_count)
    return perplexity

#add_k_smoothing function for computing probabilities
def add_k_smoothing(train_token, k):
    k_uni_count, k_uni_word_count = defaultdict(int), 0
    for word in train_token:
        k_uni_count[word] += 1
        k_uni_word_count += 1

    k_uni_train_prob = defaultdict(int)
    for word in k_uni_count:
        k_uni_train_prob[word] = (k_uni_count[word] + k) / (k_uni_word_count + k * len(k_uni_count))

    return k_uni_train_prob, k_uni_count, k_uni_word_count

print("Libraries imported and helper functions defined.\n")


Libraries imported and helper functions defined.



##Reading and Preprocessing Data

In [15]:
def read_and_preprocess_data(train_file, val_file):
    print("-------------------------")
    print("Step 1: Reading and displaying the first 200 characters of training and validation data")
    print("-------------------------")

    #loading training data
    with open(train_file, 'r') as file:
        train = file.read()
    print("First 200 characters of Training Data:\n", train[:200], "\n")

    train = re.sub(r'\W', ' ', train)
    train_token = train.lower().split()

    #loading validation data
    with open(val_file, 'r') as file:
        val = file.read()
    print("First 200 characters of Validation Data:\n", val[:200], "\n")

    val = re.sub(r'\W', ' ', val)
    val_token = val.lower().split()

    #sample data display
    print("Training Data Tokens (Top 10):")
    for token in train_token[:10]:
        print(token)
    print("\nValidation Data Tokens (Top 10):")
    for token in val_token[:10]:
        print(token)

    #calculating total number of tokens in training and validation data
    print(f"Total number of tokens in Training Data: {len(train_token)}")
    print(f"Total number of tokens in Validation Data: {len(val_token)}")

    #calculating the total number of unique tokens
    print(f"Number of unique tokens in Training Data: {len(set(train_token))}")

    return train_token, val_token

#Preprocessing
train_token, val_token = read_and_preprocess_data('train.txt', 'val.txt')
print("Training and validation data have been preprocessed and tokens extracted.\n")

-------------------------
Step 1: Reading and displaying the first 200 characters of training and validation data
-------------------------
First 200 characters of Training Data:
 I booked two rooms four months in advance at the Talbott . We were placed on the top floor next to the elevators , which are used all night long . When speaking to the front desk , I was told that the 

First 200 characters of Validation Data:
 I stayed for four nights while attending a conference . The hotel is in a great spot - easy walk to Michigan Ave shopping or Rush St. , but just off the busy streets . The room I had was spacious , an 

Training Data Tokens (Top 10):
i
booked
two
rooms
four
months
in
advance
at
the

Validation Data Tokens (Top 10):
i
stayed
for
four
nights
while
attending
a
conference
the
Total number of tokens in Training Data: 80300
Total number of tokens in Validation Data: 8835
Number of unique tokens in Training Data: 5962
Training and validation data have been preprocessed and to

##Calculating Unsmoothed Unigram Probabilities and Log Probabilities

In [16]:
def unsmoothed_unigram_probabilities(train_token):
    print("\n--- Calculating Unsmoothed Unigram Probabilities ---")

    #counting the word frequencies and calculating probabilities
    un_uni_count, un_uni_word_count = defaultdict(int), 0
    for word in train_token:
        un_uni_count[word] += 1
        un_uni_word_count += 1
    print(f"Total Words (Training Data): {un_uni_word_count}")
    print(f"Unique Words (Training Data): {len(un_uni_count)}")

    print(f"Word Counts (Training Data, Top 15):")
    for word, count in list(un_uni_count.items())[:15]:
        print(f"{word}: {count}")

    un_uni_train_prob = defaultdict(int)
    for word in un_uni_count:
        un_uni_train_prob[word] = un_uni_count[word] / un_uni_word_count

    print(f"\nUnsmoothed Unigram Probabilities (Top 15):")
    for word, prob in list(un_uni_train_prob.items())[:15]:
        print(f"{word}: {prob:.6f}")

    freq_distribution = Counter(train_token)
    top_10_prob = {word: un_uni_train_prob[word] for word, _ in freq_distribution.most_common(10)}
    print(f"\nProbabilities of Top 10 Most Frequent Words:")
    for word, prob in top_10_prob.items():
        print(f"{word}: {prob:.6f}")

    # calculating Log of probabilities
    un_uni_train_log_prob = defaultdict(int)
    for word in un_uni_train_prob:
        un_uni_train_log_prob[word] = math.log(un_uni_train_prob[word])

    print(f"\nLog Probabilities (Top 15):")
    for word, log_prob in list(un_uni_train_log_prob.items())[:15]:
        print(f"{word}: {log_prob:.6f}")

    return un_uni_train_prob, un_uni_train_log_prob

# Running Unsmoothed Unigram Model and Perplexity Calculation
un_uni_train_prob, un_uni_train_log_prob = unsmoothed_unigram_probabilities(train_token)
print("Unsmoothed Unigram Probabilities and Log Probabilities calculated.\n")

# Running Perplexity of Training Dataset Without Any Smoothing
perplexity_unsmoothed = calculate_perplexity(un_uni_train_prob, train_token, un_uni_train_log_prob)
results['Perplexity (Training Dataset Without Any Smoothing)'] = perplexity_unsmoothed
print(f"Perplexity (Training Dataset Without Any Smoothing): {perplexity_unsmoothed}\n")


--- Calculating Unsmoothed Unigram Probabilities ---
Total Words (Training Data): 80300
Unique Words (Training Data): 5962
Word Counts (Training Data, Top 15):
i: 1722
booked: 86
two: 129
rooms: 203
four: 21
months: 8
in: 1318
advance: 7
at: 745
the: 5315
talbott: 28
we: 1117
were: 578
placed: 8
on: 644

Unsmoothed Unigram Probabilities (Top 15):
i: 0.021445
booked: 0.001071
two: 0.001606
rooms: 0.002528
four: 0.000262
months: 0.000100
in: 0.016413
advance: 0.000087
at: 0.009278
the: 0.066189
talbott: 0.000349
we: 0.013910
were: 0.007198
placed: 0.000100
on: 0.008020

Probabilities of Top 10 Most Frequent Words:
the: 0.066189
and: 0.032316
a: 0.028057
to: 0.026027
was: 0.022740
i: 0.021445
in: 0.016413
we: 0.013910
of: 0.013051
hotel: 0.012914

Log Probabilities (Top 15):
i: -3.842283
booked: -6.839178
two: -6.433712
rooms: -5.980319
four: -8.249002
months: -9.214083
in: -4.109654
advance: -9.347615
at: -4.680141
the: -2.715237
talbott: -7.961320
we: -4.275123
were: -4.933951
placed: 

##Log Probability Distribution for Different Models

In [17]:
print("\nLog Probability Distribution (Unsmoothed Model - Top 10):")
for word, log_prob in list(un_uni_train_log_prob.items())[:10]:
    print(f"{word}: {log_prob:.6f}")

#calculating Laplace smoothed log probabilities for the training data
lap_uni_count = defaultdict(int)
lap_uni_word_count = 0
for word in train_token:
    lap_uni_count[word] += 1
    lap_uni_word_count += 1

lap_uni_train_prob = defaultdict(int)
for word in lap_uni_count:
    lap_uni_train_prob[word] = (lap_uni_count[word] + 1) / (lap_uni_word_count + len(lap_uni_count))

lap_uni_train_log_prob = defaultdict(int)
for word in lap_uni_train_prob:
    lap_uni_train_log_prob[word] = math.log(lap_uni_train_prob[word])

print("\nLog Probability Distribution (Laplace Smoothing - Top 10):")
for word, log_prob in list(lap_uni_train_log_prob.items())[:10]:
    print(f"{word}: {log_prob:.6f}")

# Calculating Add-k where k=0.5 smoothed log probabilities for the training set
k = 0.5
k_uni_train_prob, k_uni_count, k_uni_word_count = add_k_smoothing(train_token, k)
k_uni_train_log_prob = defaultdict(int)
for word in k_uni_train_prob:
    k_uni_train_log_prob[word] = math.log(k_uni_train_prob[word])

print("\nLog Probability Distribution (Add-k Smoothing, k=0.5 - Top 10):")
for word, log_prob in list(k_uni_train_log_prob.items())[:10]:
    print(f"{word}: {log_prob:.6f}")

# Calculating Add-k where k=3 smoothed log probabilities for the training set
k = 3
k_uni_train_prob, k_uni_count, k_uni_word_count = add_k_smoothing(train_token, k)
k_uni_train_log_prob = defaultdict(int)
for word in k_uni_train_prob:
    k_uni_train_log_prob[word] = math.log(k_uni_train_prob[word])

print("\nLog Probability Distribution (Add-k Smoothing, k=3 - Top 10):")
for word, log_prob in list(k_uni_train_log_prob.items())[:10]:
    print(f"{word}: {log_prob:.6f}")

print("Log probability distributions for all models displayed.\n")



Log Probability Distribution (Unsmoothed Model - Top 10):
i: -3.842283
booked: -6.839178
two: -6.433712
rooms: -5.980319
four: -8.249002
months: -9.214083
in: -4.109654
advance: -9.347615
at: -4.680141
the: -2.715237

Log Probability Distribution (Laplace Smoothing - Top 10):
i: -3.913322
booked: -6.899236
two: -6.497610
rooms: -6.047024
four: -8.274102
months: -9.167920
in: -4.180515
advance: -9.285703
at: -4.750419
the: -2.786668

Log Probability Distribution (Add-k Smoothing, k=0.5 - Top 10):
i: -3.878444
booked: -6.869831
two: -6.466295
rooms: -6.014310
four: -8.261923
months: -9.189910
in: -4.145726
advance: -9.315073
at: -4.715921
the: -2.751593

Log Probability Distribution (Add-k Smoothing, k=3 - Top 10):
i: -4.041637
booked: -7.005983
two: -6.611817
rooms: -6.166743
four: -8.316565
months: -9.096724
in: -4.308475
advance: -9.192034
at: -4.877216
the: -2.915766
Log probability distributions for all models displayed.



##Add-k Smoothing and Perplexity Calculation

In [18]:
# Perplexity of Training Dataset Using Laplace Smoothing Without Unknown Word Handling
perplexity_laplace_without_unknown = calculate_perplexity(lap_uni_train_prob, train_token, lap_uni_train_log_prob)
results['Perplexity (Laplace Smoothing Without Unknown)'] = perplexity_laplace_without_unknown
print(f"Perplexity (Laplace Smoothing Without Unknown): {perplexity_laplace_without_unknown}")

# Unigram Add-k Smoothing for k=0.5 without Unknown Word Handling
k = 0.5
k_uni_train_prob, k_uni_count, k_uni_word_count = add_k_smoothing(train_token, k)
k_uni_train_log_prob = defaultdict(int)
for word in k_uni_train_prob:
    k_uni_train_log_prob[word] = math.log(k_uni_train_prob[word])
perplexity_add_k_0_5 = calculate_perplexity(k_uni_train_prob, val_token, k_uni_train_log_prob)
results['Perplexity (Add-k Smoothing k=0.5 without Unknown)'] = perplexity_add_k_0_5
print(f"Perplexity (Add-k Smoothing k=0.5 without Unknown): {perplexity_add_k_0_5}")

# Unigram Add-k Smoothing for k=3 without Unknown Word Handling
k = 3
k_uni_train_prob, k_uni_count, k_uni_word_count = add_k_smoothing(train_token, k)
k_uni_train_log_prob = defaultdict(int)
for word in k_uni_train_prob:
    k_uni_train_log_prob[word] = math.log(k_uni_train_prob[word])
perplexity_add_k_3 = calculate_perplexity(k_uni_train_prob, val_token, k_uni_train_log_prob)
results['Perplexity (Add-k Smoothing k=3 without Unknown)'] = perplexity_add_k_3
print(f"Perplexity (Add-k Smoothing k=3 without Unknown): {perplexity_add_k_3}\n")

Perplexity (Laplace Smoothing Without Unknown): 526.2184224551577
Perplexity (Add-k Smoothing k=0.5 without Unknown): 383.25196453239204
Perplexity (Add-k Smoothing k=3 without Unknown): 413.50606773548816



##Laplace and Add-k Smoothing with Unknown Word Handling

In [19]:
# Laplace Smoothing with Unknown Word Handling
lap_uni_count['unk'] = 0
lap_uni_train_prob['unk'] = (lap_uni_count['unk'] + 1) / (lap_uni_word_count + len(lap_uni_count))
lap_uni_train_log_prob['unk'] = math.log(lap_uni_train_prob['unk'])
perplexity_laplace_unknown_handling = calculate_perplexity(lap_uni_train_prob, val_token, lap_uni_train_log_prob)
results['Perplexity (Laplace Smoothing with Unknown)'] = perplexity_laplace_unknown_handling
print(f"Perplexity (Laplace Smoothing with Unknown): {perplexity_laplace_unknown_handling}")

# Add-k Smoothing for k=0.5 with Unknown Word Handling
k = 0.5
k_uni_count['unk'] = 0
k_uni_train_prob['unk'] = (k_uni_count['unk'] + k) / (k_uni_word_count + (k * len(k_uni_count)))
k_uni_train_log_prob['unk'] = math.log(k_uni_train_prob['unk'])
perplexity_add_k_0_5_unknown = calculate_perplexity(k_uni_train_prob, val_token, k_uni_train_log_prob)
results['Perplexity (Add-k Smoothing k=0.5 with Unknown)'] = perplexity_add_k_0_5_unknown
print(f"Perplexity (Add-k Smoothing k=0.5 with Unknown): {perplexity_add_k_0_5_unknown}")

# Add-k Smoothing for k=3 with Unknown Word Handling
k = 3
k_uni_count['unk'] = 0
k_uni_train_prob['unk'] = (k_uni_count['unk'] + k) / (k_uni_word_count + (k * len(k_uni_count)))
k_uni_train_log_prob['unk'] = math.log(k_uni_train_prob['unk'])
perplexity_add_k_3_unknown = calculate_perplexity(k_uni_train_prob, val_token, k_uni_train_log_prob)
results['Perplexity (Add-k Smoothing k=3 with Unknown)'] = perplexity_add_k_3_unknown
print(f"Perplexity (Add-k Smoothing k=3 with Unknown): {perplexity_add_k_3_unknown}")

Perplexity (Laplace Smoothing with Unknown): 539.9572567347413
Perplexity (Add-k Smoothing k=0.5 with Unknown): 585.8421256872612
Perplexity (Add-k Smoothing k=3 with Unknown): 558.862953194595


##Smoothing with Rare Words Tagged as Unknown

In [20]:
# Unigram Laplace Smoothing with Rare Words as Unknown
new_uni_count = defaultdict(int)
new_uni_word_count = 0
new_uni_count['unk'] = 0
for word in train_token:
    #Tagging rare words as unk
    if lap_uni_count[word] == 1:
        new_uni_count['unk'] += 1
    else:
        new_uni_count[word] += 1
    new_uni_word_count += 1

lap_uni_train_prob = defaultdict(int)
for word in new_uni_count:
    lap_uni_train_prob[word] = (new_uni_count[word] + 1) / (new_uni_word_count + len(new_uni_count))

lap_uni_train_log_prob = defaultdict(int)
for word in lap_uni_train_prob:
    lap_uni_train_log_prob[word] = math.log(lap_uni_train_prob[word])
lap_uni_train_log_prob['unk'] = math.log(lap_uni_train_prob['unk'])

perplexity_laplace_rare_words = calculate_perplexity(lap_uni_train_prob, val_token, lap_uni_train_log_prob)
results['Perplexity (Laplace Smoothing with Rare Words as Unknown)'] = perplexity_laplace_rare_words
print(f"Perplexity (Laplace Smoothing with Rare Words as Unknown): {perplexity_laplace_rare_words}")

# Add-k Smoothing for k=0.5 with Rare Words as Unknown
k = 0.5
k_uni_train_prob = defaultdict(int)
for word in new_uni_count:
    k_uni_train_prob[word] = (new_uni_count[word] + k) / (new_uni_word_count + (k * len(new_uni_count)))

k_uni_train_log_prob = defaultdict(int)
for word in k_uni_train_prob:
    k_uni_train_log_prob[word] = math.log(k_uni_train_prob[word])

perplexity_add_k_0_5_rare = calculate_perplexity(k_uni_train_prob, val_token, k_uni_train_log_prob)
results['Perplexity (Add-k Smoothing k=0.5 with Rare Words as Unknown)'] = perplexity_add_k_0_5_rare
print(f"Perplexity (Add-k Smoothing k=0.5 with Rare Words as Unknown): {perplexity_add_k_0_5_rare}")

# Add-k Smoothing for k=3 with Rare Words as Unknown
k = 3
k_uni_train_prob = defaultdict(int)
for word in new_uni_count:
    k_uni_train_prob[word] = (new_uni_count[word] + k) / (new_uni_word_count + (k * len(new_uni_count)))

k_uni_train_log_prob = defaultdict(int)
for word in k_uni_train_prob:
    k_uni_train_log_prob[word] = math.log(k_uni_train_prob[word])

perplexity_add_k_3_rare = calculate_perplexity(k_uni_train_prob, val_token, k_uni_train_log_prob)
results['Perplexity (Add-k Smoothing k=3 with Rare Words as Unknown)'] = perplexity_add_k_3_rare
print(f"Perplexity (Add-k Smoothing k=3 with Rare Words as Unknown): {perplexity_add_k_3_rare}\n")

Perplexity (Laplace Smoothing with Rare Words as Unknown): 349.0340273743812
Perplexity (Add-k Smoothing k=0.5 with Rare Words as Unknown): 347.33315472616965
Perplexity (Add-k Smoothing k=3 with Rare Words as Unknown): 357.80478479349983



##Final Step - Displaying All Results

In [21]:
print("Final Step: Displaying results for all cases (Laplace, Add-k=0.5, Add-k=3)")
print("-------------------------")
for key, value in results.items():
    print(f"{key}: {value}")

print("\nAll results have been displayed.")


Final Step: Displaying results for all cases (Laplace, Add-k=0.5, Add-k=3)
-------------------------
Perplexity (Training Dataset Without Any Smoothing): 519.796634745634
Perplexity (Laplace Smoothing Without Unknown): 526.2184224551577
Perplexity (Add-k Smoothing k=0.5 without Unknown): 383.25196453239204
Perplexity (Add-k Smoothing k=3 without Unknown): 413.50606773548816
Perplexity (Laplace Smoothing with Unknown): 539.9572567347413
Perplexity (Add-k Smoothing k=0.5 with Unknown): 585.8421256872612
Perplexity (Add-k Smoothing k=3 with Unknown): 558.862953194595
Perplexity (Laplace Smoothing with Rare Words as Unknown): 349.0340273743812
Perplexity (Add-k Smoothing k=0.5 with Rare Words as Unknown): 347.33315472616965
Perplexity (Add-k Smoothing k=3 with Rare Words as Unknown): 357.80478479349983

All results have been displayed.
