In [2]:
import nltk
nltk.download('punkt')

import pandas as pd
from nltk.tokenize import word_tokenize
from csv import QUOTE_NONE

# Read the SST data
sst_data = pd.read_csv("SST-2/train.tsv", sep="\t")

#Take 100 rows in validation set
validation_set = sst_data[:100]

#Take 100 rows in test set
test_set = sst_data[100:200]

#put remaining rows in training set
training_set = sst_data[200:]

# initialize the positive and negative count
positive_count = (training_set['label'] == 1).sum()
negative_count = (training_set['label'] == 0).sum()

#length of training set
training_length = len(training_set)

#getting prior probability for positive and negative classes
prior_probab_positive = positive_count / training_length
prior_probab_negative = negative_count / training_length

#printing prior probability of both classes
print(f"The prior probability of positive class is {prior_probab_positive}")
print(f"The prior probability of negative class is {prior_probab_negative}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The prior probability of positive class is 0.5579681007907787
The prior probability of negative class is 0.44203189920922126


In [3]:
# A function to tokenized sentence and padding with start and end symbols
def sentence_tokenizer(sentence):
    tokenize_sentence = sentence.split()
    tokenize_sentence.insert(0, '<s>')
    tokenize_sentence.append('</s>')

    return tokenize_sentence

#tokenized the entire training dataset
tokenized_training_set = training_set['sentence'].apply(sentence_tokenizer)

#getting the first line from tokenized training set and printing it
first_line_trainingset = tokenized_training_set.iloc[0]
print(f"The tokenized first sentence is: {first_line_trainingset}")

#Getting a list of tokenized sentences in the training set
tokenizedlist_trainingset = tokenized_training_set.tolist()

total_tokens = []

#getting the unique words in total_tokens list
for i in tokenizedlist_trainingset:
    for j in i:
        total_tokens.append(j)

vocabulary_set = set(total_tokens)


vocabulary_size = len(vocabulary_set)                                       #getting the count of the unique words

print(f"The vocabulary size of training set is {vocabulary_size}")          #printing the count of unique words

The tokenized first sentence is: ['<s>', 'told', 'in', 'scattered', 'fashion', '</s>']
The vocabulary size of training set is 14813


In [4]:
# Function to count the biagram frequencies
def count_biagram_frequencies(tokenized_sequences):
    bigram_counts = {}                                                                  #Initializing a dictionary

    for tokens in tokenized_sequences:
        for i in range(len(tokens)-1):  
            first_word = tokens[i]                                                       # getting the first word
            second_word = tokens[i+1]                                                    # getting the second word

            bigram = (first_word, second_word)                                              
            if bigram not in bigram_counts:                                              # Getting the bigram counts
                bigram_counts[bigram]=0
            bigram_counts[bigram]+=1
    return bigram_counts


# Setting the bigram counts value in bigram_counts variable
bigram_counts = count_biagram_frequencies(tokenizedlist_trainingset)

#printing frequency of bigram '<s>' and 'the'
print(f"Frequency of bigram ('<s>', 'the') = {bigram_counts.get(('<s>', 'the'))}")

Frequency of bigram ('<s>', 'the') = 4450


In [5]:
import math

def smoothed_biagram_probab(wm, wm_1, bigram_counts, alpha, vocabulary_size):

    # count of biagram(wm, wm-1)
    count_bigram = bigram_counts.get((wm_1, wm), 0)

    #count of unigram(wm-1)
    count_unigram = 0
    for i, j in bigram_counts.items():
        if i[0] == wm_1:
            count_unigram +=j

    # To get smoothed probability using alpha smoothing formula
    smoothed_probability = (count_bigram + alpha) / (count_unigram + alpha * vocabulary_size)

    # TO get the negative log probability
    calc_negative_log_probab = math.log(smoothed_probability)

    return calc_negative_log_probab


word = 'award'
previous_word = 'academy'
# passing the parameters with alpha as 0.001 to the smoothed function to get the negative log probability
negative_log_probab1 = smoothed_biagram_probab(word, previous_word, bigram_counts, 0.001, 14813)

# passing the parameters with alpha as 0.5 to the smoothed function to get the negative log probability
negative_log_probab2 = smoothed_biagram_probab(word, previous_word, bigram_counts, 0.5, 14813)

# printing the negative log probability with alpha 0.001
print(f"The log probability with alpha= 0.001 is  {negative_log_probab1}")

# printing the negative log probability with alpha 0.5
print(f"The log probability with alpha= 0.5 is  {negative_log_probab2}")



The log probability with alpha= 0.001 is  -1.0250904304166832
The log probability with alpha= 0.5 is  -6.172912066128204


In [6]:
# Function for getting the log probability of a sentence
def sentence_log_probab(sentence, bigram_counts, alpha, vocabulary_size):
    # Splitting the sentence into words
    words_of_sentence = sentence.split()

    #Initializing the log probability    
    log_probability = 0.0

    # Looping through the words in sentence
    for i in range(1, len(words_of_sentence)):
        #Calculating negative log probability
        negative_log_probab = smoothed_biagram_probab(words_of_sentence[i], words_of_sentence[i-1], bigram_counts, alpha, vocabulary_size)
        log_probability+= negative_log_probab

    return log_probability

# Initializing first sentence, getting its log probability and printing it.
first_sentence = "this was a really great movie but it was a little too long."
log_probab1 = sentence_log_probab(first_sentence, bigram_counts, 0.1, 14813)
print(f"The log probability of first sentence is {log_probab1}")

# Initializing second sentence, getting its log probability and printing it.
second_sentence = "long too little a was it but movie great really a was this."
log_probab2 = sentence_log_probab(second_sentence, bigram_counts, 0.1, 14813)
print(f"The log probability of second sentence is {log_probab2}")

The log probability of first sentence is -69.91754047795261
The log probability of second sentence is -113.38139766388028


In [7]:
# All the alpha values
all_alpha_values = [0.001, 0.01, 0.1]

# initializing best log chances variable
best_log_chances = 0

# setting selected alpha as 0 initially
selected_alpha = 0

# looping through all the alpha values
for alpha in all_alpha_values:
    approximate_log_likelihood = 0

    # Applying log probability function to each sentence in the validation set
    for i in validation_set['sentence']:
        approximate_log_likelihood += sentence_log_probab(i, bigram_counts, alpha, vocabulary_size)

    # Printing the approximate log likelihood for current alpha
    print(f"The log likelihood estimate for alpha as {alpha} is {approximate_log_likelihood}")

    # Check which alpha gives the highest result
    if best_log_chances == 0 or approximate_log_likelihood > best_log_chances:
        best_log_chances = approximate_log_likelihood
        selected_alpha = alpha

# Printing the best alpha
print(f"\nThe best alpha is {selected_alpha}")


The log likelihood estimate for alpha as 0.001 is -3493.4581923786996
The log likelihood estimate for alpha as 0.01 is -3924.884625422819
The log likelihood estimate for alpha as 0.1 is -4795.08551812413

The best alpha is 0.001
