In [2]:
%pip install contractions

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import contractions
import string
import re
from collections import defaultdict

# Additional function to manually handle certain contractions not expanded by the library
def manual_contractions_fix(text):
    # Replace 've, 'm, and others that are missed
    text = text.replace("'ve", " have")
    text = text.replace("'m", " am")
    text = text.replace("'ll", " will")
    text = text.replace("'d", " would")
    text = text.replace("'re", " are")
    
    return text

def preprocess(text):
    # Expand contractions using the library and manually handle others
    expanded_text = contractions.fix(text)
    expanded_text = manual_contractions_fix(expanded_text)
    
    # Remove numbers
    text_no_numbers = re.sub(r'\d+', '', expanded_text)
    
    # Remove punctuation, but keep useful tokens like periods for sentence boundaries
    translator = str.maketrans('', '', string.punctuation.replace('.', ''))  # Remove punctuation except periods
    cleaned_text = text_no_numbers.translate(translator)
    
    # Lowercase the text and remove extra spaces
    cleaned_text = cleaned_text.lower().strip()
    
    return cleaned_text.split()
# Load and preprocess the training data
def load_data(file_path):
    with open(file_path, 'r') as f:
        return [preprocess(line.strip()) for line in f]

# Unigram and bigram counts
def compute_ngram_counts(reviews):
    unigram_counts = defaultdict(int)
    bigram_counts = defaultdict(lambda: defaultdict(int))
    total_word_count = 0

    for review in reviews:
        total_word_count += len(review)
        
        # Unigram counts
        for word in review:
            unigram_counts[word] += 1

        # Bigram counts
        for i in range(1, len(review)):
            prev_word = review[i - 1]
            curr_word = review[i]
            bigram_counts[prev_word][curr_word] += 1

    return unigram_counts, bigram_counts, total_word_count

# Convert counts to probabilities (unsmoothed unigram and bigram)
def compute_ngram_probabilities(unigram_counts, bigram_counts, total_word_count):
    unigram_probabilities = {word: count / total_word_count for word, count in unigram_counts.items()}
    bigram_probabilities = {word: {next_word: count / unigram_counts[word] for next_word, count in next_words.items()} for word, next_words in bigram_counts.items()}
    
    return unigram_probabilities, bigram_probabilities

# Load the dataset
train_file_path = 'C:/Users/sxs230164/OneDrive - The University of Texas at Dallas/UTD FALL24/NLP/ass1/A1_DATASET/A1_DATASET/train.txt'  # Replace with the actual path
train_reviews = load_data(train_file_path)

# Compute n-gram counts
unigram_counts, bigram_counts, total_word_count = compute_ngram_counts(train_reviews)

# Compute n-gram probabilities
unigram_probabilities, bigram_probabilities = compute_ngram_probabilities(unigram_counts, bigram_counts, total_word_count)

# Output a small sample of unigram and bigram probabilities
print("Unigram Probabilities Sample:", dict(list(unigram_probabilities.items())[:5]))
print("Bigram Probabilities Sample:", {word: next_words for word, next_words in list(bigram_probabilities.items())[:5]})


Unigram Probabilities Sample: {'i': 0.020486264754774747, 'booked': 0.0010284989894399465, 'two': 0.0015307891935850365, 'rooms': 0.002415776696126386, 'four': 0.00023918581149766195}
Bigram Probabilities Sample: {'i': {'booked': 0.012259194395796848, 'was': 0.10799766491535318, 'had': 0.0642148277875073, 'am': 0.04786923525977817, 'spoke': 0.0011675423234092236, 'loved': 0.009340338587273789, 'love': 0.004086398131932283, 'really': 0.005837711617046118, 'have': 0.07764156450671336, 'preferred': 0.0005837711617046118, 'would': 0.08056042031523643, 'can': 0.01284296555750146, 'asked': 0.010507880910683012, 'think': 0.015178050204319907, 'came': 0.002335084646818447, 'noticed': 0.005253940455341506, 'were': 0.0035026269702276708, 'immediately': 0.0011675423234092236, 'called': 0.018680677174547577, 'went': 0.0070052539404553416, 'couple': 0.0005837711617046118, 'took': 0.004670169293636894, 'checked': 0.008172796263864565, 'showed': 0.0005837711617046118, 'did': 0.02626970227670753, 'rec

In [4]:
import string
import re
from collections import defaultdict

def handle_unknown_words(reviews, vocab, threshold=1):
    # Replace words that appear less than `threshold` times with <UNK>
    processed_reviews = []
    for review in reviews:
        new_review = [word if vocab[word] > threshold else '<UNK>' for word in review]
        processed_reviews.append(new_review)
    return processed_reviews

# Laplace (Add-1) smoothing function
def laplace_smoothing(unigram_counts, bigram_counts, vocab_size, alpha=0.7):
    smoothed_bigram_probs = defaultdict(lambda: defaultdict(float))
    for prev_word in bigram_counts:
        for curr_word in bigram_counts[prev_word]:
            smoothed_bigram_probs[prev_word][curr_word] = (
                (bigram_counts[prev_word][curr_word] + alpha) /
                (unigram_counts[prev_word] + alpha * vocab_size)
            )
    return smoothed_bigram_probs

def add_k_smoothing(unigram_counts, bigram_counts, vocab_size, k=0.7):
    smoothed_bigram_probs = defaultdict(dict)
    
    # Use the keys of unigram_counts as the vocabulary
    vocab = unigram_counts.keys()

    for w1 in unigram_counts:
        for w2 in vocab:
            bigram_count = bigram_counts[w1].get(w2, 0)
            smoothed_bigram_probs[w1][w2] = (bigram_count + k) / (unigram_counts[w1] + k * vocab_size)
    
    return smoothed_bigram_probs

def main():
    # Load the dataset and preprocess
    train_file_path = 'C:/Users/sxs230164/OneDrive - The University of Texas at Dallas/UTD FALL24/NLP/ass1/A1_DATASET/A1_DATASET/train.txt'  # Replace with the actual path
    train_reviews = load_data(train_file_path)

    # Compute n-gram counts on the original data
    unigram_counts, bigram_counts, total_word_count = compute_ngram_counts(train_reviews)

    # Handle unknown words
    processed_reviews = handle_unknown_words(train_reviews, unigram_counts)
    
    # Recompute counts for processed reviews (if necessary)
    unigram_counts, bigram_counts, total_word_count = compute_ngram_counts(processed_reviews)

    # Vocabulary size (number of unique words)
    vocab_size = len(unigram_counts)

    # Compute smoothed bigram probabilities using Laplace and Add-k smoothing
    laplace_smoothed_probs = laplace_smoothing(unigram_counts, bigram_counts, vocab_size)
    add_k_smoothed_probs = add_k_smoothing(unigram_counts, bigram_counts, vocab_size, k=0.11)

    # Sample output of smoothed probabilities
    print("Sample Laplace Smoothed Bigram Probabilities:", {word: dict(next_words) for word, next_words in list(laplace_smoothed_probs.items())[:5]})
    print("Sample Add-k Smoothed Bigram Probabilities:", {word: dict(next_words) for word, next_words in list(add_k_smoothed_probs.items())[:5]})
    
    # Optionally print total word count
    print(f"Total word count: {total_word_count}")

if __name__ == "__main__":
    main()


Sample Laplace Smoothed Bigram Probabilities: {'i': {'booked': 0.005727860630856539, 'was': 0.049016761251154804, 'had': 0.029220007918701333, 'am': 0.021829220007918704, 'spoke': 0.0007126831199683253, 'loved': 0.004408077075359641, 'love': 0.002032466675465224, 'really': 0.0028243368087633625, 'have': 0.035291012273987066, 'preferred': 0.00044872640886894547, 'would': 0.03661079582948396, 'can': 0.005991817341955919, 'asked': 0.0049359904975584, 'think': 0.0070476441863534375, 'came': 0.0012405965421670847, 'noticed': 0.002560380097663983, 'were': 0.001768509964365844, 'immediately': 0.0007126831199683253, 'called': 0.008631384452949717, 'went': 0.003352250230962122, 'couple': 0.00044872640886894547, 'took': 0.0022964233865646034, 'checked': 0.0038801636531608816, 'showed': 0.00044872640886894547, 'did': 0.012062821697241653, 'recently': 0.001768509964365844, 'stay': 0.002032466675465224, 'got': 0.0073116008974528175, 'stayed': 0.01470238880823545, 'will': 0.01496634551933483, 'menti

In [7]:
import math
from collections import defaultdict

# Filter vocabulary by minimum frequency
def filter_vocabulary(unigram_counts, min_freq=2):
    filtered_vocab = {word for word, count in unigram_counts.items() if count >= min_freq}
    return filtered_vocab

# Function to handle unknown words based on filtered vocabulary
def handle_unknown_words(reviews, filtered_vocab, unknown_token="<UNK>"):
    processed_reviews = []
    for review in reviews:
        processed_review = [word if word in filtered_vocab else unknown_token for word in review]
        processed_reviews.append(processed_review)
    return processed_reviews

# # Function to compute perplexity using the smoothed bigram model
# def calculate_perplexity(validation_reviews, smoothed_bigram_probs, unigram_counts, vocab_size, unknown_token="<UNK>"):
#     N = sum(len(review) for review in validation_reviews)
#     log_prob_sum = 0
    
#     for review in validation_reviews:
#         for i in range(1, len(review)):
#             prev_word = review[i - 1]
#             curr_word = review[i]
            
#             if curr_word not in unigram_counts:
#                 curr_word = unknown_token
#             if prev_word not in unigram_counts:
#                 prev_word = unknown_token

#             # Get smoothed probability, with a small default for unseen bigrams
#             prob = smoothed_bigram_probs[prev_word].get(curr_word, 1 / (unigram_counts[prev_word] + vocab_size))

#             log_prob_sum += math.log(prob)

#     # Calculate perplexity using base e (natural log)
#     perplexity = math.exp(-log_prob_sum / N)
#     return perplexity

# # Function to compute bigram and unigram counts
# def compute_ngram_counts(reviews):
#     unigram_counts = defaultdict(int)
#     bigram_counts = defaultdict(lambda: defaultdict(int))

#     for review in reviews:
#         for i in range(len(review)):
#             unigram_counts[review[i]] += 1
#             if i > 0:
#                 bigram_counts[review[i - 1]][review[i]] += 1

#     total_word_count = sum(unigram_counts.values())  # Total word count in corpus
#     return unigram_counts, bigram_counts, total_word_count

# # Laplace smoothing function
# def laplace_smoothing(unigram_counts, bigram_counts, vocab_size, alpha=1):
#     smoothed_bigram_probs = defaultdict(lambda: defaultdict(float))

#     for prev_word in bigram_counts:
#         for curr_word in bigram_counts[prev_word]:
#             smoothed_bigram_probs[prev_word][curr_word] = (
#                 (bigram_counts[prev_word][curr_word] + alpha) /
#                 (unigram_counts[prev_word] + alpha * vocab_size)
#             )

#     return smoothed_bigram_probs

# # Example usage
# def main():
#     # Assuming previous functions such as load_data and preprocess are defined
#     train_file_path = 'C:/Users/sxs230164/OneDrive - The University of Texas at Dallas/UTD FALL24/NLP/ass1/A1_DATASET/A1_DATASET/train.txt'  # Replace with actual path
#     validation_file_path = 'C:/Users/sxs230164/OneDrive - The University of Texas at Dallas/UTD FALL24/NLP/ass1/A1_DATASET/A1_DATASET/val.txt'  # Replace with actual path
    
#     train_reviews = load_data(train_file_path)
#     validation_reviews = load_data(validation_file_path)

#     # Compute raw unigram and bigram counts from training data
#     unigram_counts, bigram_counts, _ = compute_ngram_counts(train_reviews)

#     # Filter vocabulary to remove rare words (appearing less than `min_freq` times)
#     filtered_vocab = filter_vocabulary(unigram_counts, min_freq=2)
    
#     # Handle unknown words in training and validation sets
#     processed_train_reviews = handle_unknown_words(train_reviews, filtered_vocab)
#     processed_validation_reviews = handle_unknown_words(validation_reviews, filtered_vocab)

#     # Recompute n-gram counts for processed training data
#     unigram_counts, bigram_counts, total_word_count = compute_ngram_counts(processed_train_reviews)
    
#     # Vocabulary size after filtering
#     vocab_size = len(filtered_vocab)

#     # Compute smoothed bigram probabilities using Laplace smoothing
#     laplace_smoothed_probs = laplace_smoothing(unigram_counts, bigram_counts, vocab_size)

#     # Compute perplexity on validation set
#     perplexity = calculate_perplexity(processed_validation_reviews, laplace_smoothed_probs, unigram_counts, vocab_size)

#     print(f"Perplexity on validation set: {perplexity}")

# if __name__ == "__main__":
#     main()


# Laplace smoothing function with a lower alpha
def laplace_smoothing_optimized(unigram_counts, bigram_counts, vocab_size, alpha=0.01):  # Adjusted alpha for better smoothing
    smoothed_bigram_probs = defaultdict(lambda: defaultdict(float))

    for prev_word in bigram_counts:
        for curr_word in bigram_counts[prev_word]:
            smoothed_bigram_probs[prev_word][curr_word] = (
                (bigram_counts[prev_word][curr_word] + alpha) /
                (unigram_counts[prev_word] + alpha * vocab_size)
            )

    return smoothed_bigram_probs

# Interpolation function (bigram + unigram combination)
def interpolate_smoothing(unigram_counts, bigram_counts, vocab_size, lambda1=0.8, lambda2=0.2, alpha=0.01):
    smoothed_bigram_probs = defaultdict(lambda: defaultdict(float))

    for prev_word in bigram_counts:
        for curr_word in bigram_counts[prev_word]:
            bigram_prob = (bigram_counts[prev_word][curr_word] + alpha) / (unigram_counts[prev_word] + alpha * vocab_size)
            unigram_prob = (unigram_counts[curr_word] + alpha) / (sum(unigram_counts.values()) + alpha * vocab_size)
            
            # Interpolation between bigram and unigram probabilities
            smoothed_bigram_probs[prev_word][curr_word] = lambda1 * bigram_prob + lambda2 * unigram_prob

    return smoothed_bigram_probs

# Adjusted perplexity calculation with interpolation
def calculate_perplexity_interpolated(validation_reviews, smoothed_bigram_probs, unigram_counts, vocab_size, unknown_token="<UNK>"):
    N = sum(len(review) for review in validation_reviews)
    log_prob_sum = 0

    for review in validation_reviews:
        for i in range(1, len(review)):
            prev_word = review[i - 1]
            curr_word = review[i]

            if curr_word not in unigram_counts:
                curr_word = unknown_token
            if prev_word not in unigram_counts:
                prev_word = unknown_token

            # Use interpolated smoothing probability
            prob = smoothed_bigram_probs[prev_word].get(curr_word, 1 / (unigram_counts[prev_word] + vocab_size))
            log_prob_sum += math.log(prob)
            log_prob_sum_base2 = log_prob_sum 

    perplexity = 2 ** (-log_prob_sum_base2 / N)
    return perplexity

# Example usage
def main():
    # Assuming previous functions such as load_data and preprocess are defined
    train_file_path = 'C:/Users/sxs230164/OneDrive - The University of Texas at Dallas/UTD FALL24/NLP/ass1/A1_DATASET/A1_DATASET/train.txt'
    validation_file_path = 'C:/Users/sxs230164/OneDrive - The University of Texas at Dallas/UTD FALL24/NLP/ass1/A1_DATASET/A1_DATASET/val.txt'
    
    train_reviews = load_data(train_file_path)
    validation_reviews = load_data(validation_file_path)

    # Compute raw unigram and bigram counts
    unigram_counts, bigram_counts, _ = compute_ngram_counts(train_reviews)

    # Filter vocabulary (increase min_freq to 3 for more aggressive filtering)
    filtered_vocab = filter_vocabulary(unigram_counts, min_freq=3)
    
    # Handle unknown words in training and validation sets
    processed_train_reviews = handle_unknown_words(train_reviews, filtered_vocab)
    processed_validation_reviews = handle_unknown_words(validation_reviews, filtered_vocab)

    # Recompute n-gram counts for processed training data
    unigram_counts, bigram_counts, total_word_count = compute_ngram_counts(processed_train_reviews)
    
    # Vocabulary size after filtering
    vocab_size = len(filtered_vocab)

    # Compute smoothed bigram probabilities using optimized Laplace smoothing
    laplace_smoothed_probs = laplace_smoothing_optimized(unigram_counts, bigram_counts, vocab_size)

    # Compute interpolated smoothing probabilities
    interpolated_smoothed_probs = interpolate_smoothing(unigram_counts, bigram_counts, vocab_size)

    # Compute perplexity using interpolation
    perplexity = calculate_perplexity_interpolated(processed_validation_reviews, interpolated_smoothed_probs, unigram_counts, vocab_size)

    print(f"Perplexity on validation set: {perplexity}")

if __name__ == "__main__":
    main()


Perplexity on validation set: 23.379855133277093
