In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import os
import numpy as np

In [10]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [11]:
# Read and pre-process training dataset

def standardize_punctuations(text):
    replacement_rules = {'“': '"', '”': '"', '’': "'", '--': ','}
    for symbol, replacement in replacement_rules.items():
        text = text.replace(symbol, replacement)
    return text

def remove_stop_words(tokenized_sentence):
    filtered_sentence = []
    for word in tokenized_sentence:
        if word not in stop_words:
            filtered_sentence.append(word)
    return filtered_sentence

def get_data(input_path) :
     with open(input_path) as read_handle:
        text = read_handle.read()
        text = text.lower()
        text = standardize_punctuations(text)
        tokenized_sentences = sent_tokenize(text)
        return tokenized_sentences
     
def get_data_split(input_path) :
    tokenized_sentences = get_data(input_path)
    total_lines = len(tokenized_sentences)
    train_set = int(0.85 * total_lines)
    return tokenized_sentences[:train_set], tokenized_sentences[train_set:]
     
def tokenize_text(sentence_data, tokenized_path):
    with open(tokenized_path, 'w') as write_handle:
        for sentence in sentence_data:
            sentence = re.sub(r'[?!:.;,#@-`()]', '', sentence)
            tokenized_sentence = word_tokenize(sentence)
            tokenized_sentence = remove_stop_words(tokenized_sentence)
            lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_sentence]
            if lemmatized_words:
                lemmatized_words = lemmatized_words +['[END]']
                write_handle.write(','.join(lemmatized_words))
                write_handle.write('\n')

def tokenize_text_unk(sentence_data, tokenized_path, vocab):
    with open(tokenized_path, 'w') as write_handle:
        for sentence in sentence_data:
            sentence = re.sub(r'[?!:.;,#@-`()]', '', sentence)
            tokenized_sentence = word_tokenize(sentence)
            tokenized_sentence = remove_stop_words(tokenized_sentence)
            lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_sentence]
            tokenized_words = ["<UNK>" if word not in vocab else word for word in lemmatized_words]
            
            if tokenized_words:
                tokenized_words = tokenized_words + ['[END]']
                write_handle.write(','.join(tokenized_words))
                write_handle.write('\n')

def tokenize_with_threshold(sentence_data, tokenized_path, frequencies, threshold):
    with open(tokenized_path, 'w') as write_handle:
        for sentence in sentence_data:
            sentence = re.sub(r'[?!:.;,#@-`()]', '', sentence)
            tokenized_sentence = word_tokenize(sentence)
            tokenized_sentence = remove_stop_words(tokenized_sentence)
            lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_sentence]
            tokenized_words = [word if frequencies[word] >= threshold else "<UNK>" for word in lemmatized_words]
            if tokenized_words:
                tokenized_words = tokenized_words + ['[END]']
                write_handle.write(','.join(tokenized_words))
                write_handle.write('\n')
    vocab = set(word for word, freq in frequencies.items() if freq >= threshold)
    vocab.add("<UNK>")
    return vocab

In [12]:
def get_ngram_counts(data_file_path):
   bigramCounts = {}
   unigramCounts = {}
   total_words = 0
   
   with open(data_file_path) as read_handle:
      lines = read_handle.readlines()
      for line in lines:
         data = line.split(',')
         n = len(data)
         total_words += n
         for i in range(n):
            if i+1 < n:
               if (data[i], data[i+1]) in bigramCounts:
                  bigramCounts[(data[i], data[i + 1])] += 1
               else:
                  bigramCounts[(data[i], data[i + 1])] = 1

            if data[i] in unigramCounts:
               unigramCounts[data[i]] += 1
            else:
               unigramCounts[data[i]] = 1
   return unigramCounts, bigramCounts, total_words


In [13]:
def get_bigram_probability(unigram_counts, bigram_counts, output_file_path):
    bigram_prob = {}
    list_of_bigrams = bigram_counts.keys()
    with open(output_file_path, 'w') as write_handle:
        for bigram in list_of_bigrams:
            bigram_prob[bigram] = (bigram_counts[bigram])/(unigram_counts[bigram[0]])
            write_handle.write("P{} = {}".format(bigram, bigram_prob[bigram]))
            write_handle.write("\n")
    return bigram_prob

def get_unigram_probability(unigram_counts, total_train_words, output_file_path):
    unigram_prob = {}
    listOfUnigrams = unigram_counts.keys()
    with open(output_file_path, 'w') as write_handle:
        for unigram in listOfUnigrams:
            unigram_prob[unigram] = (unigram_counts[unigram])/total_train_words
            write_handle.write("P({}) = {}".format(unigram, unigram_prob[unigram]))
            write_handle.write("\n")
    return unigram_prob

In [14]:
def get_perplexity_unigram(unigrams_train, total_train_words, test_file_path, prob_file_path):
    total_words = 0
    unigramProb = get_unigram_probability(unigrams_train, total_train_words, prob_file_path)
    with open(test_file_path) as read_handle:
        lines = read_handle.readlines()
        sum_log = 0

        for line in lines:
            words = line.split(',')
            total_words += len(words)
            for word in words:
                pword = 0
                if word in unigramProb:
                    pword = unigramProb[word]
                sum_log += np.log(pword)
        
        sum_log = sum_log/total_words
        print("Perplexity for unigram : ", np.exp2(-1*sum_log))

def get_perplexity_bigram(bigrams_train, unigrams_train, test_file_path, prob_file_path):
    total_words = 0
    bigramProb = get_bigram_probability(unigrams_train, bigrams_train, prob_file_path)
    
    with open(test_file_path) as read_handle:
        lines = read_handle.readlines()
        sum_log = 0

        for line in lines:
            words = line.split(',')
            total_words += len(words)
            for i in range(len(words)-1):
                pbigram = 0
                if (words[i], words[i+1]) in bigramProb:
                    pbigram = bigramProb[(words[i], words[i+1])]
                sum_log += np.log(pbigram)
        
        sum_log = sum_log/total_words
        print("Perplexity for bigram : ", np.exp2(-1*sum_log))

In [16]:
# Part 1 - Unsmoothed ngrams

input_file_path = os.path.join(os.getcwd(), "A1_DATASET/train.txt")
tokenized_file_path = os.path.join(os.getcwd(), "A1_DATASET/tokenized_train.txt")
input_data  = get_data(input_file_path)
tokenize_text(input_data, tokenized_file_path)

unigrams_train, bigrams_train, total_train_words = get_ngram_counts(tokenized_file_path)

validation_file_path = os.path.join(os.getcwd(), "A1_DATASET/val.txt")
tokenized_val_file_path = os.path.join(os.getcwd(), "A1_DATASET/tokenized_val.txt")
val_data = get_data(validation_file_path)
tokenize_text(val_data, tokenized_val_file_path)

unigram_prob_file_path = os.path.join(os.getcwd(), "A1_DATASET/unsmoothed_unigram_prob.txt")
bigram_prob_file_path = os.path.join(os.getcwd(), "A1_DATASET/unsmoothed_bigram_prob.txt")
get_perplexity_unigram(unigrams_train, total_train_words, tokenized_val_file_path, unigram_prob_file_path)
get_perplexity_bigram(bigrams_train, unigrams_train, tokenized_val_file_path, bigram_prob_file_path)

Perplexity for unigram :  inf
Perplexity for bigram :  inf


  sum_log += np.log(pword)
  sum_log += np.log(pbigram)


In [17]:
# Part 2 - Unknown words by frequency >= method

tokenized_unk_file_path = os.path.join(os.getcwd(), "A1_DATASET/tokenized_unk_train.txt")
tokenized_unk_val_file_path = os.path.join(os.getcwd(), "A1_DATASET/tokenized_unk_val.txt")

tokenize_text(input_data, tokenized_unk_file_path)
unigrams_train,_,_ = get_ngram_counts(tokenized_unk_file_path)
vocab_unk = tokenize_with_threshold(input_data, tokenized_unk_file_path, unigrams_train, 5)

unigrams_unk_train, bigrams_unk_train, total_words = get_ngram_counts(tokenized_unk_file_path)

tokenize_text_unk(input_data, tokenized_unk_val_file_path, vocab_unk)

unigram_unk_prob_file_path = os.path.join(os.getcwd(), "A1_DATASET/unk_unigram_prob.txt") 
bigram_unk_prob_file_path = os.path.join(os.getcwd(), "A1_DATASET/unk_bigram_prob.txt") 
get_perplexity_unigram(unigrams_unk_train, total_words, tokenized_unk_val_file_path, unigram_unk_prob_file_path)
get_perplexity_bigram(bigrams_unk_train, unigrams_unk_train, tokenized_unk_val_file_path, bigram_unk_prob_file_path)

Perplexity for unigram :  45.470191681100054
Perplexity for bigram :  7.521551415738418
