In [1]:
import re
from collections import defaultdict, Counter
import math
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import nltk
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [2]:
# Download necessary NLTK resources
# 'stopwords' backs the stopwords.words('english') call made in main().
# 'wordnet' is presumably the data WordNetLemmatizer reads; 'omw-1.4' appears to be
# its companion multilingual wordnet package -- TODO confirm both are still required.
# Each call is a no-op when the package is already up to date (see the log below).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruthw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruthw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ruthw\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Preprocessing function with stopword removal and lemmatization
def preprocess_review(text, stop_words, lemmatizer):
    """Turn one raw review string into a list of lemmatized, stopword-free tokens.

    Args:
        text: Raw text to normalise.
        stop_words: Container supporting ``in``; tokens found in it are dropped.
            Membership is tested on the lower-cased, un-lemmatized token.
        lemmatizer: Object exposing ``lemmatize(token)``; applied to every
            token that survives stopword filtering.

    Returns:
        list: Lemmatized tokens in their original order.
    """
    # Any character that is neither a word character nor whitespace becomes a
    # space, so "couldn't" splits into "couldn" and "t".
    depunctuated = re.sub(r'[^\w\s]', ' ', text.lower())

    lemmas = []
    for word in depunctuated.split():
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [4]:
# Function to read corpus
def read_corpus(filename, stop_words, lemmatizer):
    """Read a UTF-8 text file and return all of its preprocessed tokens as one flat list.

    Every line is passed through ``preprocess_review`` and the resulting tokens
    are concatenated in file order; line boundaries are not kept.

    Args:
        filename: Path of the text file to read.
        stop_words: Forwarded to ``preprocess_review``.
        lemmatizer: Forwarded to ``preprocess_review``.

    Returns:
        list: Tokens of the whole file.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    return [
        token
        for raw_line in raw_lines
        for token in preprocess_review(raw_line, stop_words, lemmatizer)
    ]

In [5]:
# Function to count n-grams
def count_ngrams(tokens, n):
    """Count every contiguous run of ``n`` tokens.

    Args:
        tokens: Sliceable sequence of tokens.
        n: Length of each n-gram.

    Returns:
        defaultdict: Maps each n-gram (a tuple of ``n`` tokens) to the number
        of times it occurs. Missing keys read as 0.
    """
    tally = defaultdict(int)
    window_starts = range(len(tokens) - n + 1)
    for window in (tuple(tokens[start:start + n]) for start in window_starts):
        tally[window] += 1
    return tally

In [6]:
# Function to calculate unsmoothed probabilities
def calculate_probabilities(ngram_counts, unigram_counts=None):
    """Compute unsmoothed (maximum-likelihood) probabilities from raw counts.

    Args:
        ngram_counts: Mapping from n-gram tuple to count.
        unigram_counts: Optional mapping from 1-tuple to count. When it is
            truthy, each n-gram count is divided by the count of the n-gram's
            first token. When it is ``None`` or empty, each count is divided by
            the sum of all counts in ``ngram_counts`` instead.

    Returns:
        dict: Maps each n-gram to its probability.
    """
    if not unigram_counts:
        # Relative-frequency case: normalise by the total number of observations.
        grand_total = sum(ngram_counts.values())
        return {gram: freq / grand_total for gram, freq in ngram_counts.items()}

    conditional = {}
    for gram, freq in ngram_counts.items():
        history = (gram[0],)  # unigram key of the n-gram's first token
        conditional[gram] = freq / unigram_counts[history]
    return conditional

In [7]:
# Function to handle unknown words by frequency thresholding.
# NOTE: despite the "bpe" in its name, no Byte Pair Encoding is performed here.
def handle_unknown_words_bpe(tokens, min_freq=5, vocab=None):
    """Replace out-of-vocabulary tokens with the ``"<UNK>"`` placeholder.

    Args:
        tokens: Sequence of tokens to filter.
        min_freq: Minimum number of occurrences in ``tokens`` for a word to be
            kept. Only used when ``vocab`` is not supplied.
        vocab: Optional container of known words. When given, it is used as-is
            and ``min_freq`` is ignored. This lets a vocabulary derived from
            training data be applied to held-out data, so both agree on which
            words are known.

    Returns:
        list: A new list the same length as ``tokens`` in which every token
        outside the vocabulary is replaced by ``"<UNK>"``.
    """
    if vocab is None:
        word_freq = Counter(tokens)
        vocab = {word for word, count in word_freq.items() if count >= min_freq}
    return [word if word in vocab else "<UNK>" for word in tokens]

# Function to replace unknown words with nearest neighbor using Word2Vec
def handle_unknown_words_nearest_neighbor(tokens, model):
    """Map tokens outside the model's vocabulary to a substitute word.

    Tokens listed in ``model.wv.index_to_key`` pass through unchanged. Any
    other token is handed to ``find_nearest_neighbor``; a falsy result from
    that call is replaced by ``"<UNK>"``.

    NOTE(review): ``find_nearest_neighbor`` returns ``'<UNK>'`` whenever
    ``model.wv`` raises ``KeyError`` for the token. That looks like it would be
    the case for every token reaching it from here, which would make this a
    plain ``<UNK>`` substitution -- confirm against the model in use.

    Args:
        tokens: Iterable of tokens.
        model: Object exposing ``wv.index_to_key`` (e.g. the Word2Vec model
            trained in ``main``).

    Returns:
        list: One output token per input token, in order.
    """
    in_vocabulary = set(model.wv.index_to_key)

    def resolve(token):
        if token in in_vocabulary:
            return token
        return find_nearest_neighbor(token, in_vocabulary, model) or "<UNK>"

    return [resolve(token) for token in tokens]

In [8]:
# Function to find nearest neighbor using Word2Vec
def find_nearest_neighbor(word, known_vocab, model):
    """Return the word in ``known_vocab`` whose embedding is closest to ``word``'s.

    Closeness is cosine similarity between the vectors obtained from
    ``model.wv[...]``. The query word itself is never returned as its own
    neighbour. All candidates are scored in one vectorised numpy operation
    rather than one library call per candidate.

    Args:
        word: Query token.
        known_vocab: Iterable of candidate words.
        model: Object whose ``wv`` attribute supports ``wv[token]`` and raises
            ``KeyError`` for tokens it has no vector for.

    Returns:
        str: The most similar candidate, or ``'<UNK>'`` when ``word`` (or any
        candidate) has no vector in the model, or when there is no candidate
        other than ``word`` itself.
    """
    try:
        word_vector = np.asarray(model.wv[word], dtype=float)
        # Exclude the query: it would always win with similarity 1.
        candidates = [known_word for known_word in known_vocab if known_word != word]
        if not candidates:
            return '<UNK>'
        candidate_matrix = np.array([model.wv[known_word] for known_word in candidates], dtype=float)
    except KeyError:
        # If the word is not in the model's vocabulary there is nothing to compare against.
        return '<UNK>'

    norms = np.linalg.norm(candidate_matrix, axis=1) * np.linalg.norm(word_vector)
    # A zero-length vector would divide by zero; treat its similarity as 0 instead.
    norms[norms == 0] = 1.0
    similarities = candidate_matrix @ word_vector / norms
    return candidates[int(np.argmax(similarities))]

In [9]:
def apply_laplace_smoothing(ngram_counts, unigram_counts, vocab_size, n):
    """Compute add-one (Laplace) smoothed probabilities for the n-grams in ``ngram_counts``.

    Only n-grams present in ``ngram_counts`` receive an entry; no probability
    is produced for n-grams that were never observed.

    Args:
        ngram_counts: Mapping from n-gram tuple to count.
        unigram_counts: Mapping from 1-tuple to count. For ``n == 2`` it is
            indexed by the bigram's first token; for ``n == 1`` its values are
            summed to obtain the total token count.
        vocab_size: Value added to every denominator.
        n: N-gram order; 1 and 2 are supported.

    Returns:
        dict: Maps each n-gram to its smoothed probability. Empty when ``n``
        is neither 1 nor 2.
    """
    if n == 2:
        return {
            ngram: (count + 1) / (unigram_counts[(ngram[0],)] + vocab_size)
            for ngram, count in ngram_counts.items()
        }
    if n == 1:
        # The total is the same for every unigram, so compute it once
        # instead of once per entry.
        denominator = sum(unigram_counts.values()) + vocab_size
        return {ngram: (count + 1) / denominator for ngram, count in ngram_counts.items()}
    return {}

# Function to apply add-k smoothing
def apply_add_k_smoothing(ngram_counts, unigram_counts, vocab_size, n, k=0.5):
    """Compute add-k smoothed probabilities for the n-grams in ``ngram_counts``.

    Only n-grams present in ``ngram_counts`` receive an entry; no probability
    is produced for n-grams that were never observed.

    Args:
        ngram_counts: Mapping from n-gram tuple to count.
        unigram_counts: Mapping from 1-tuple to count. For ``n == 2`` it is
            indexed by the bigram's first token; for ``n == 1`` its values are
            summed to obtain the total token count.
        vocab_size: Multiplied by ``k`` and added to every denominator.
        n: N-gram order; 1 and 2 are supported.
        k: Pseudo-count added to every numerator.

    Returns:
        dict: Maps each n-gram to its smoothed probability. Empty when ``n``
        is neither 1 nor 2.
    """
    if n == 2:
        return {
            ngram: (count + k) / (unigram_counts[(ngram[0],)] + k * vocab_size)
            for ngram, count in ngram_counts.items()
        }
    if n == 1:
        # The total is the same for every unigram, so compute it once
        # instead of once per entry.
        denominator = sum(unigram_counts.values()) + k * vocab_size
        return {ngram: (count + k) / denominator for ngram, count in ngram_counts.items()}
    return {}

In [10]:
# Function to calculate perplexity
def calculate_perplexity(tokens, ngram_probs, n, unk_prob=1e-8):
    """Compute the perplexity of ``tokens`` under an n-gram probability table.

    Perplexity is ``exp(-(1/N) * sum(log p(ngram)))`` over the ``N`` n-grams
    of ``tokens``.

    Args:
        tokens: Sliceable sequence of tokens to evaluate.
        ngram_probs: Mapping from n-gram tuple to probability.
        n: N-gram order.
        unk_prob: Probability used for n-grams missing from ``ngram_probs``,
            and as a floor for entries that are not strictly positive (their
            logarithm is undefined). Must be positive.

    Returns:
        float: The perplexity.

    Raises:
        ValueError: If ``tokens`` holds fewer than ``n`` tokens (there is no
            n-gram to score) or if ``unk_prob`` is not positive.
    """
    if unk_prob <= 0:
        raise ValueError("unk_prob must be positive")

    N = len(tokens) - n + 1
    if N <= 0:
        raise ValueError(
            f"need at least {n} tokens to compute {n}-gram perplexity, got {len(tokens)}"
        )

    log_sum = 0.0
    for i in range(N):
        prob = ngram_probs.get(tuple(tokens[i:i + n]), unk_prob)
        if prob <= 0:
            prob = unk_prob
        log_sum += math.log(prob)
    return math.exp(-log_sum / N)


In [16]:
# Main function
def _print_top_ngrams(heading, scores, as_probability=False, limit=10):
    """Print ``heading`` followed by the ``limit`` highest-valued n-grams in ``scores``."""
    print(heading)
    for ngram, value in Counter(scores).most_common(limit):
        if as_probability:
            print(f"{ngram}: {value:.4f}")
        else:
            print(f"{ngram}: {value}")


def main():
    """Run the full n-gram language-model experiment and print its results.

    Steps:
      1. Read and preprocess the training and validation files.
      2. Train a Word2Vec model on the training tokens.
      3. Print the top unigram/bigram counts and unsmoothed probabilities of
         the raw training tokens.
      4. For each unknown-word strategy ("BPE", "Nearest Neighbor"), print the
         same statistics again and report validation perplexity of a bigram
         model under Laplace and add-k (k=0.5) smoothing.
    """
    # Load stopwords and lemmatizer
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Load and preprocess dataset
    # NOTE(review): these are absolute, machine-specific paths; the notebook
    # will not run elsewhere without editing them.
    train_file = "C:/Users/ruthw/Desktop/Uni/NPL/A1_DATASET/A1_DATASET/train.txt"
    validation_file = "C:/Users/ruthw/Desktop/Uni/NPL/A1_DATASET/A1_DATASET/val.txt"

    train_tokens = read_corpus(train_file, stop_words, lemmatizer)
    validation_tokens = read_corpus(validation_file, stop_words, lemmatizer)

    # Train Word2Vec model (the whole training corpus is passed as a single sentence)
    word2vec_model = Word2Vec([train_tokens], vector_size=100, window=5, min_count=1, workers=4)

    # Statistics of the raw training tokens, before any unknown-word handling
    unigram_counts_before = count_ngrams(train_tokens, 1)
    bigram_counts_before = count_ngrams(train_tokens, 2)

    _print_top_ngrams("\nTop 10 Unigram Frequencies Before Handling Unknown Words:", unigram_counts_before)
    _print_top_ngrams("\nTop 10 Bigram Frequencies Before Handling Unknown Words:", bigram_counts_before)

    unigram_probs_before = calculate_probabilities(unigram_counts_before)
    bigram_probs_before = calculate_probabilities(bigram_counts_before, unigram_counts_before)

    _print_top_ngrams("\nTop 10 Unigram Probabilities Before Handling Unknown Words:",
                      unigram_probs_before, as_probability=True)
    _print_top_ngrams("\nTop 10 Bigram Probabilities Before Handling Unknown Words:",
                      bigram_probs_before, as_probability=True)

    # Handle unknown words by frequency thresholding (labelled "BPE" in the output).
    train_tokens_bpe = handle_unknown_words_bpe(train_tokens)
    # The validation set must be mapped with the vocabulary that survived in
    # the TRAINING data. Thresholding it on its own frequencies would make the
    # two sets disagree about which words are known.
    train_vocab_bpe = set(train_tokens_bpe)
    validation_tokens_bpe = [
        token if token in train_vocab_bpe else "<UNK>" for token in validation_tokens
    ]

    # Handle unknown words using Nearest Neighbor Replacement
    train_tokens_nn = handle_unknown_words_nearest_neighbor(train_tokens, word2vec_model)
    validation_tokens_nn = handle_unknown_words_nearest_neighbor(validation_tokens, word2vec_model)

    # (label, training tokens, validation tokens) for each unknown-word strategy
    experiments = [
        ("BPE", train_tokens_bpe, validation_tokens_bpe),
        ("Nearest Neighbor", train_tokens_nn, validation_tokens_nn),
    ]
    smoothing_methods = ["Laplace", "Add-k"]

    for method, train_tokens_method, validation_tokens_method in experiments:
        # Unsmoothed n-gram counts
        unigram_counts_method = count_ngrams(train_tokens_method, 1)
        bigram_counts_method = count_ngrams(train_tokens_method, 2)

        # Unsmoothed probabilities after handling unknown words
        unigram_probs_method = calculate_probabilities(unigram_counts_method)
        bigram_probs_method = calculate_probabilities(bigram_counts_method, unigram_counts_method)

        _print_top_ngrams(f"\nTop 10 Unigram Probabilities After Handling Unknown Words using {method}:",
                          unigram_probs_method, as_probability=True)
        _print_top_ngrams(f"\nTop 10 Bigram Probabilities After Handling Unknown Words using {method}:",
                          bigram_probs_method, as_probability=True)

        _print_top_ngrams(f"\nTop 10 Unigram Frequencies After Handling Unknown Words using {method}:",
                          unigram_counts_method)
        _print_top_ngrams(f"\nTop 10 Bigram Frequencies After Handling Unknown Words using {method}:",
                          bigram_counts_method)

        # The number of distinct unigrams serves as the vocabulary size for smoothing.
        vocab_size = len(unigram_counts_method)

        for smoothing in smoothing_methods:
            if smoothing == "Laplace":
                bigram_probs_smoothed = apply_laplace_smoothing(
                    bigram_counts_method, unigram_counts_method, vocab_size, 2)
            elif smoothing == "Add-k":
                bigram_probs_smoothed = apply_add_k_smoothing(
                    bigram_counts_method, unigram_counts_method, vocab_size, 2, k=0.5)

            # Calculate perplexity on the validation tokens
            perplexity = calculate_perplexity(validation_tokens_method, bigram_probs_smoothed, 2)
            print(f"\nPerplexity of {method} bigram model with {smoothing} smoothing: {perplexity:.4f}")

# Run the experiment only when this code is executed directly (__name__ is
# '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()



Top 10 Unigram Frequencies Before Handling Unknown Words:
('hotel',): 1146
('room',): 1140
('stay',): 417
('great',): 356
('n',): 339
('chicago',): 329
('would',): 327
('night',): 294
('staff',): 270
('service',): 266

Top 10 Bigram Frequencies Before Handling Unknown Words:
('front', 'desk'): 103
('room', 'service'): 66
('hotel', 'room'): 43
('michigan', 'ave'): 36
('stay', 'hotel'): 36
('could', 'n'): 33
('bed', 'comfortable'): 32
('recommend', 'hotel'): 32
('great', 'location'): 31
('room', 'clean'): 31

Top 10 Unigram Probabilities Before Handling Unknown Words:
('hotel',): 0.0287
('room',): 0.0285
('stay',): 0.0104
('great',): 0.0089
('n',): 0.0085
('chicago',): 0.0082
('would',): 0.0082
('night',): 0.0074
('staff',): 0.0068
('service',): 0.0067

Top 10 Bigram Probabilities Before Handling Unknown Words:
('honoring', 'request'): 1.0000
('16th', 'floor'): 1.0000
('constitute', 'placing'): 1.0000
('placing', 'someone'): 1.0000
('justify', 'decide'): 1.0000
('decide', 'stay'): 1.000