In [None]:

import math
import os.path
import sys
import random
from operator import itemgetter
from collections import defaultdict
#----------------------------------------
#  Data input
#----------------------------------------

# Read a text file into a corpus (list of sentences (which in turn are lists of words))
# (taken from nested section of HW0)
def readFileToCorpus(f):
    """Read a one-sentence-per-line text file into a corpus.

    Each line is split on whitespace into a list of words, and the lists are
    collected in file order. A progress message is written to stderr after
    every 1000 lines.

    Args:
        f: Path of the text file to read.

    Returns:
        A list of sentences, each sentence being a list of word strings.
        A blank line yields an empty list.

    If ``f`` is not an existing file, an error message is printed and the
    script is terminated with ``sys.exit()``.
    """
    if not os.path.isfile(f):
        #ideally we would throw an exception here, but this will suffice
        print("Error: corpus file ", f, " does not exist")
        sys.exit() # exit the script
    #endif

    print("Reading file ", f)
    corpus = [] # this will become a list of sentences
    # the with-block guarantees the file is closed, even if reading fails
    with open(f, "r") as file:
        for i, line in enumerate(file, 1):
            # split the line into a list of words and store it as one sentence
            corpus.append(line.split())
            if i % 1000 == 0:
                sys.stderr.write("Reading sentence " + str(i) + "\n")
            #endif
        #endfor
    return corpus
#enddef


# Preprocess the corpus
def preprocess(corpus):
    """Prepare a training corpus for language modelling, in place.

    Every word that occurs fewer than two times in the whole corpus is
    replaced by the UNK token; afterwards each sentence is wrapped in the
    start and end tokens.

    Args:
        corpus: List of sentences (lists of words). The sentences are
            modified in place.

    Returns:
        The same corpus object that was passed in.
    """
    #count how often every word occurs in the corpus
    freqDict = defaultdict(int)
    for sen in corpus:
        for word in sen:
            freqDict[word] += 1
        #endfor
    #endfor

    #replace rare words with unk
    #(nothing is printed per word: on a real corpus that floods the output)
    for sen in corpus:
        for i, word in enumerate(sen):
            if freqDict[word] < 2:
                sen[i] = UNK
            #endif
        #endfor
    #endfor

    #bookend the sentences with start and end tokens
    for sen in corpus:
        sen.insert(0, start)
        sen.append(end)
    #endfor

    return corpus
#enddef

def preprocessTest(vocab, corpus):
    """Prepare a test corpus with respect to a training vocabulary, in place.

    Words that are not contained in ``vocab`` are replaced by the UNK token,
    then every sentence is wrapped in the start and end tokens.

    Args:
        vocab: Container of known words; only membership (``in``) is used.
        corpus: List of sentences (lists of words), modified in place.

    Returns:
        The same corpus object that was passed in.
    """
    #first pass: map every out-of-vocabulary word to the unknown token
    for sentence in corpus:
        sentence[:] = [token if token in vocab else UNK for token in sentence]
    #endfor

    #second pass: add the sentence boundary markers
    for sentence in corpus:
        sentence.insert(0, start)
        sentence.append(end)
    #endfor

    return corpus
#enddef

# Special tokens used by the preprocessing routines and the language models:
#   UNK   - stands in for unknown / rare words
#   start - marks the beginning of a sentence
#   end   - marks the end of a sentence
UNK, start, end = "UNK", "<s>", "</s>"


#--------------------------------------------------------------
# Language models and data structures
#--------------------------------------------------------------

# Parent class for the three language models you need to implement
class LanguageModel:
    """Common interface of the language models in this file.

    Subclasses override ``generateSentence`` and ``getSentenceProbability``;
    ``getCorpusPerplexity`` and ``generateSentencesToFile`` are implemented
    here on top of those two methods.
    """

    # Initialize and train the model (ie, estimate the model's underlying probability
    # distribution from the training corpus)
    def __init__(self, corpus):
        print("""Your task is to implement four kinds of n-gram language models:
      a) an (unsmoothed) unigram model (UnigramModel)
      b) a unigram model smoothed using Laplace smoothing (SmoothedUnigramModel)
      c) an unsmoothed bigram model (BigramModel)
      d) a bigram model smoothed using linear interpolation smoothing (SmoothedBigramModelInt)
      """)
    #enddef

    # Generate a sentence by drawing words according to the
    # model's probability distribution
    # Note: think about how to set the length of the sentence
    #in a principled way
    def generateSentence(self):
        print("Implement the generateSentence method in each subclass")
        return "mary had a little lamb ."
    #enddef

    # Given a sentence (sen), return the probability of
    # that sentence under the model
    def getSentenceProbability(self, sen):
        print("Implement the getSentenceProbability method in each subclass")
        return 0.0
    #enddef

    def getCorpusPerplexity(self, corpus):
        """Return the perplexity of ``corpus`` under this model.

        The perplexity is ``exp(-(sum of log sentence probabilities) / N)``,
        where the sentence probabilities come from ``getSentenceProbability``
        and N is the number of predicted tokens. Sentences are expected to
        begin with the start token, which is given rather than predicted, so
        each sentence contributes ``len(sen) - 1`` tokens to N.

        Args:
            corpus: List of sentences (lists of words).

        Returns:
            The perplexity as a float. ``float("inf")`` is returned as soon
            as a sentence has probability 0 (this also happens if the product
            of word probabilities underflows for a very long sentence).
            0.0 is returned when the corpus contains no predicted tokens.
        """
        log_prob = 0.0
        word_count = 0
        for sen in corpus:
            word_count += max(len(sen) - 1, 0)
            sen_prob = self.getSentenceProbability(sen)
            if sen_prob <= 0.0:
                # log(0) is undefined; an impossible sentence means infinite perplexity
                return float("inf")
            #endif
            log_prob += math.log(sen_prob)
        #endfor
        if word_count == 0:
            return 0.0
        #endif
        try:
            return math.exp(-log_prob / word_count)
        except OverflowError:
            # the average negative log probability is too large for a float
            return float("inf")
    #enddef

    def generateSentencesToFile(self, numberOfSentences, filename):
        """Generate sentences and write them, with their probability, to a file.

        One line is written per sentence: the sentence probability, a space,
        and the words of the sentence joined by spaces.

        Args:
            numberOfSentences: How many sentences to generate.
            filename: Path of the output file; it is created or truncated.
        """
        # the with-block flushes and closes the file once all lines are written
        with open(filename, 'w+') as filePointer:
            for i in range(0, numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)

                stringGenerated = str(prob) + " " + " ".join(sen)
                print(stringGenerated, end="\n", file=filePointer)
            #endfor
    #enddef
#endclass

# Unigram language model
class UnigramModel(LanguageModel):
    """Unsmoothed unigram model: every word is drawn independently."""

    def __init__(self, corpus):
        # distribution over all training words (built by UnigramDist)
        self.unigram_dist = UnigramDist(corpus)
    #enddef

    def generateSentence(self):
        """Draw words until the end token appears.

        Returns:
            The sentence as a list of words, beginning with the start token
            and finishing with the end token, so that its probability also
            accounts for the decision to stop.
        """
        sentence = [start]
        while True:
            word = self.unigram_dist.draw()
            if word is None:
                # guard against draw() yielding no word; simply draw again
                continue
            #endif
            sentence.append(word)
            if word == end:
                return sentence
            #endif
    #enddef

    def getSentenceProbability(self, sen):
        """Return the product of the unigram probabilities of the words in ``sen``.

        The start token is skipped: it is a fixed marker that the model never
        generates, so it must not contribute a factor (its unigram
        probability would force every bookended sentence to 0).
        """
        probability = 1.0
        for word in sen:
            if word == start:
                continue
            #endif
            probability *= self.unigram_dist.prob(word)
        #endfor
        return probability
    #enddef
#endclass

class SmoothedUnigramModel(LanguageModel):
    """Unigram model with Laplace (add-one) smoothing.

    P(w) = (count(w) + 1) / (N + V), where N is the number of counted
    training tokens and V the number of distinct words that can be generated.
    """

    def __init__(self, corpus):
        self.unigram_dist = UnigramDist(corpus)
        # distinct generatable words in first-seen order; the start token is
        # excluded because it is never generated, which keeps the smoothed
        # probabilities of the vocabulary summing to 1
        self.vocab = list(dict.fromkeys(
            word for sen in corpus for word in sen if word != start))
        self.vocab_size = len(self.vocab)
    #enddef

    def _smoothedProb(self, word):
        """Return the add-one smoothed probability of a single word."""
        denominator = self.unigram_dist.total + self.vocab_size
        if denominator == 0:
            return 0.0
        #endif
        # .get() avoids inserting unseen words into the count table
        return (self.unigram_dist.counts.get(word, 0.0) + 1) / denominator
    #enddef

    def _draw(self):
        """Draw one vocabulary word according to the smoothed distribution."""
        counts = self.unigram_dist.counts
        threshold = random.random() * (self.unigram_dist.total + self.vocab_size)
        word = None
        for word in self.vocab:
            threshold -= counts.get(word, 0.0) + 1
            if threshold <= 0.0:
                break
            #endif
        #endfor
        # if rounding left a remainder, the last vocabulary word is returned
        return word
    #enddef

    def generateSentence(self):
        """Draw words from the smoothed distribution until the end token.

        Returns:
            The sentence as a list of words, beginning with the start token
            and finishing with the end token. With an empty vocabulary only
            the start token is returned.
        """
        sentence = [start]
        while self.vocab:
            word = self._draw()
            sentence.append(word)
            if word == end:
                break
            #endif
        return sentence
    #enddef

    def getSentenceProbability(self, sen):
        """Return the product of the smoothed probabilities of the words in ``sen``.

        The start token is skipped because the model never generates it.
        """
        probability = 1.0
        for word in sen:
            if word == start:
                continue
            #endif
            probability *= self._smoothedProb(word)
        #endfor
        return probability
    #enddef
#endclass


class BigramModel(LanguageModel):
    """Unsmoothed bigram model: each word depends on the word before it."""

    def __init__(self, corpus):
        # context word -> distribution of the words observed right after it;
        # the factory builds an EMPTY distribution (UnigramDist needs a corpus
        # argument, so the bare class cannot serve as the factory)
        self.bigram_dist = defaultdict(lambda: UnigramDist([]))
        self.train(corpus)  # Pass the corpus to the train method
    #enddef

    def train(self, corpus):
        """Add the bigram counts of ``corpus`` to the model.

        Args:
            corpus: List of sentences (lists of words).
        """
        for sen in corpus:
            for context, word in zip(sen, sen[1:]):
                # train() expects a corpus, i.e. a list of sentences, hence
                # the nested list; a flat [word] would be read as a sentence
                # whose "words" are the characters of the string
                self.bigram_dist[context].train([[word]])
            #endfor
        #endfor
    #enddef

    def generateSentence(self):
        """Draw each next word given the previous one until the end token.

        Returns:
            The sentence as a list of words, beginning with the start token
            and finishing with the end token. If the current word has no
            recorded continuation, the sentence is closed with the end token.
        """
        sentence = [start]
        while sentence[-1] != end:
            # .get() keeps lookups from creating empty distributions
            dist = self.bigram_dist.get(sentence[-1])
            word = dist.draw() if dist is not None else None
            sentence.append(end if word is None else word)
        return sentence
    #enddef

    def getSentenceProbability(self, sen):
        """Return the product of P(word | previous word) over ``sen``.

        A context that was never seen in training makes the sentence
        impossible under this unsmoothed model, so 0.0 is returned.
        """
        probability = 1.0
        for context, word in zip(sen, sen[1:]):
            dist = self.bigram_dist.get(context)
            if dist is None or dist.total == 0:
                return 0.0
            #endif
            probability *= dist.prob(word)
        #endfor
        return probability
    #enddef
#endclass

class SmoothedBigramModelKN(LanguageModel):
    """Bigram model smoothed by linear interpolation with the unigram model.

    P(w | prev) = (1 - lambda) * P_bigram(w | prev) + lambda * P_unigram(w)
    """

    def __init__(self, corpus, lambda_factor=0.5):
        """Train the bigram and unigram distributions on ``corpus``.

        Args:
            corpus: List of sentences (lists of words).
            lambda_factor: Weight of the unigram distribution in the
                interpolation; the bigram distribution gets the remainder.
        """
        self.lambda_factor = lambda_factor
        # back-off distribution used in the interpolation
        self.unigram_dist = UnigramDist(corpus)
        # context word -> distribution of the words observed right after it;
        # the factory builds an EMPTY distribution (UnigramDist needs a corpus
        # argument, so the bare class cannot serve as the factory)
        self.bigram_dist = defaultdict(lambda: UnigramDist([]))
        self.vocab_size = len(set(word for sen in corpus for word in sen))
        for sen in corpus:
            for context, word in zip(sen, sen[1:]):
                # train() expects a list of sentences, hence the nested list;
                # a flat [word] would be counted character by character
                self.bigram_dist[context].train([[word]])
            #endfor
        #endfor
    #enddef

    @staticmethod
    def _prob(dist, word):
        """Return dist.prob(word), or 0.0 for a missing or empty distribution."""
        if dist is None or dist.total == 0:
            return 0.0
        #endif
        return dist.prob(word)
    #enddef

    def generateSentence(self):
        """Sample a sentence from the interpolated distribution.

        Each word is drawn from the unigram distribution with probability
        ``lambda_factor`` and from the bigram distribution of the previous
        word otherwise, which is a draw from the interpolated mixture. When
        the previous word has no bigram distribution, the unigram
        distribution is used.

        Returns:
            The sentence as a list of words, beginning with the start token
            and finishing with the end token. If no word can be drawn, the
            sentence is closed with the end token.
        """
        sentence = [start]
        while sentence[-1] != end:
            dist = self.bigram_dist.get(sentence[-1])
            if dist is None or random.random() < self.lambda_factor:
                dist = self.unigram_dist
            #endif
            word = dist.draw()
            sentence.append(end if word is None else word)
        return sentence
    #enddef

    def getSentenceProbability(self, sen):
        """Return the product of the interpolated bigram probabilities of ``sen``."""
        probability = 1.0
        for context, word in zip(sen, sen[1:]):
            # .get() keeps lookups from creating empty distributions
            bigram_prob = self._prob(self.bigram_dist.get(context), word)
            unigram_prob = self._prob(self.unigram_dist, word)
            probability *= ((1 - self.lambda_factor) * bigram_prob +
                            self.lambda_factor * unigram_prob)
        #endfor
        return probability
    #enddef
#endclass

# Add this method to the UnigramDist class
def getCorpusPerplexity(self, corpus):
    """Return the per-word perplexity of ``corpus`` under a word distribution.

    Written in method form: ``self`` is any object with a ``prob(word)``
    method that returns the probability of a single word.

    Start tokens are skipped, since they mark the sentence boundary and are
    not predicted words.

    Args:
        corpus: List of sentences (lists of words).

    Returns:
        ``exp(-(sum of log word probabilities) / number of words)``.
        ``float("inf")`` is returned as soon as a word has probability 0,
        and 0.0 when the corpus contains no words to score.
    """
    log_likelihood = 0.0
    total_words = 0
    for sen in corpus:
        for word in sen:
            if word == start:
                continue
            word_prob = self.prob(word)
            if word_prob <= 0.0:
                # log(0) is undefined; an impossible word means infinite perplexity
                return float("inf")
            total_words += 1
            log_likelihood += math.log(word_prob)
    if total_words == 0:
        return 0.0
    perplexity = math.exp(-log_likelihood / total_words)
    return perplexity
#enddef
#UnigramDist.getCorpusPerplexity = getCorpusPerplexity


# Sample class for a unsmoothed unigram probability distribution
# Note:
#       Feel free to use/re-use/modify this class as necessary for your
#       own code (e.g. converting to log probabilities after training).
#       This class is intended to help you get started
#       with your implementation of the language models above.
class UnigramDist:
    """Unsmoothed probability distribution over words, based on raw counts."""

    def __init__(self, corpus):
        # word -> number of times it was observed (start tokens are not counted)
        self.counts = defaultdict(float)
        # total number of counted tokens
        self.total = 0.0
        self.train(corpus)
    #enddef

    # Add observed counts from corpus to the distribution
    # (corpus is a list of sentences, each a list of words)
    def train(self, corpus):
        for sen in corpus:
            for word in sen:
                if word == start:
                    continue
                self.counts[word] += 1.0
                self.total += 1.0
            #endfor
        #endfor
    #enddef

    # Returns the probability of word in the distribution
    # (0.0 for an unseen word or an empty distribution)
    def prob(self, word):
        if self.total == 0.0:
            return 0.0
        #endif
        # .get() keeps lookups of unseen words from adding them to the table
        return self.counts.get(word, 0.0) / self.total
    #enddef

    # Generate a single random word according to the distribution
    # (returns None only when nothing has been counted yet)
    def draw(self):
        rand = random.random()
        last_word = None
        for word, count in self.counts.items():
            if count <= 0.0:
                # zero-count entries carry no probability mass
                continue
            #endif
            last_word = word
            rand -= count / self.total
            if rand <= 0.0:
                return word
            #endif
        #endfor
        # floating-point rounding can leave a tiny remainder after the last
        # word; fall back to that word instead of returning nothing
        return last_word
    #enddef

    def getCorpusPerplexity(self, corpus):
        """Return the per-word perplexity of ``corpus`` under this distribution.

        Start tokens are skipped, as they are in ``train``.

        Args:
            corpus: List of sentences (lists of words).

        Returns:
            ``exp(-(sum of log word probabilities) / number of words)``.
            ``float("inf")`` is returned as soon as a word has probability 0,
            and 0.0 when the corpus contains no words to score.
        """
        log_likelihood = 0.0
        total_words = 0
        for sen in corpus:
            for word in sen:
                if word == start:
                    continue
                #endif
                word_prob = self.prob(word)
                if word_prob <= 0.0:
                    # log(0) is undefined; an impossible word means infinite perplexity
                    return float("inf")
                #endif
                total_words += 1
                log_likelihood += math.log(word_prob)
            #endfor
        #endfor
        if total_words == 0:
            return 0.0
        #endif
        return math.exp(-log_likelihood / total_words)
    #enddef
#endclass

#-------------------------------------------
# The main routine
#-------------------------------------------
if __name__ == "__main__":
    #read your corpora
    trainCorpus = readFileToCorpus('train.txt')
    trainCorpus = preprocess(trainCorpus)

    # The test corpora must go through the same pipeline as the training
    # data: words unseen in training become UNK and every sentence gets its
    # start/end tokens. The vocabulary is taken from the preprocessed
    # training corpus.
    vocab = {word for sen in trainCorpus for word in sen}

    posTestCorpus = readFileToCorpus('pos_test.txt')
    posTestCorpus = preprocessTest(vocab, posTestCorpus)
    negTestCorpus = readFileToCorpus('neg_test.txt')
    negTestCorpus = preprocessTest(vocab, negTestCorpus)

    # Instantiate models
    unigram_model = UnigramModel(trainCorpus)
    smoothed_unigram_model = SmoothedUnigramModel(trainCorpus)
    bigram_model = BigramModel(trainCorpus)
    smoothed_bigram_model = SmoothedBigramModelKN(trainCorpus)

    # Generate sentences and write to files
    unigram_model.generateSentencesToFile(20, 'unigram_output.txt')
    smoothed_unigram_model.generateSentencesToFile(20, 'smooth_unigram_output.txt')
    bigram_model.generateSentencesToFile(20, 'bigram_output.txt')
    smoothed_bigram_model.generateSentencesToFile(20, 'smooth_bigram_kn_output.txt')

    # Calculate perplexity for test corpora
    unigram_perplexity_pos = unigram_model.unigram_dist.getCorpusPerplexity(posTestCorpus)
    unigram_perplexity_neg = unigram_model.unigram_dist.getCorpusPerplexity(negTestCorpus)

    # NOTE(review): these two values are computed from the model's underlying
    # UnigramDist, i.e. from raw counts without the add-one smoothing --
    # confirm whether the smoothed probabilities should be used instead.
    smoothed_unigram_perplexity_pos = smoothed_unigram_model.unigram_dist.getCorpusPerplexity(posTestCorpus)
    smoothed_unigram_perplexity_neg = smoothed_unigram_model.unigram_dist.getCorpusPerplexity(negTestCorpus)

    bigram_perplexity_pos = bigram_model.getCorpusPerplexity(posTestCorpus)
    bigram_perplexity_neg = bigram_model.getCorpusPerplexity(negTestCorpus)

    smoothed_bigram_perplexity_pos = smoothed_bigram_model.getCorpusPerplexity(posTestCorpus)
    smoothed_bigram_perplexity_neg = smoothed_bigram_model.getCorpusPerplexity(negTestCorpus)

    # Answering the questions
    print("1. The length of generated sentences with the unigram model is controlled by the random generation of words, and it stops when the end-of-sentence marker </s> is generated. The bigram model generates sentences based on the likelihood of the next word given the previous one.")
    print("2. The models might assign different probabilities to sentences due to their different underlying probability distributions. Smoothing techniques can also impact the probabilities assigned to unseen or rare events.")
    print("3. Generating additional sentences is subjective, but you can compare the diversity and coherence of sentences from both models. Smoothed bigram models are generally expected to produce more realistic sentences.")
    print("4. Compare the perplexity values for each test corpus across models. Higher perplexity indicates a less accurate model for the corpus.")
    print("Unigram Perplexity (Pos Test):", unigram_perplexity_pos)
    print("Unigram Perplexity (Neg Test):", unigram_perplexity_neg)
    print("Smoothed Unigram Perplexity (Pos Test):", smoothed_unigram_perplexity_pos)
    print("Smoothed Unigram Perplexity (Neg Test):", smoothed_unigram_perplexity_neg)
    print("Bigram Perplexity (Pos Test):", bigram_perplexity_pos)
    print("Bigram Perplexity (Neg Test):", bigram_perplexity_neg)
    print("Smoothed Bigram Perplexity (Pos Test):", smoothed_bigram_perplexity_pos)
    print("Smoothed Bigram Perplexity (Neg Test):", smoothed_bigram_perplexity_neg)



Reading file  train.txt


Reading sentence 1000
Reading sentence 2000
Reading sentence 3000
Reading sentence 4000
Reading sentence 5000
Reading sentence 6000
Reading sentence 7000
Reading sentence 8000
Reading sentence 9000
Reading sentence 10000
Reading sentence 11000
Reading sentence 12000
Reading sentence 13000
Reading sentence 14000
Reading sentence 15000
Reading sentence 16000
Reading sentence 17000
Reading sentence 18000
Reading sentence 19000
Reading sentence 20000
Reading sentence 21000
Reading sentence 22000
Reading sentence 23000
Reading sentence 24000
Reading sentence 25000
Reading sentence 26000
Reading sentence 27000
Reading sentence 28000
Reading sentence 29000
Reading sentence 30000


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
)
5516
and
18041
"
7713
the
37617
shoveller
1
(
5489
william
97
h
23
.
30743
macy
15
)
5516
.
30743
sure
234
they
1943
were
558
enterataining
1
,
38769
but
4079
their
1542
acts
33
grew
20
old
353
fast
61
.
30743
that
7085
is
12771
until
226
they
1943
aquire
1
"
7713
invisible
11
boy
156
"
7713
(
5489
kel
2
mitchell
35
)
5516
,
38769
and
18041
"
7713
the
37617
bowler
7
"
7713
(
5489
janeane
8
garafalo
2
)
5516
and
18041
"
7713
mr
159
.
30743
splein
2
"
7713
(
5489
paul
88
reubens
2
)
5516
,
38769
2
158
of
16878
which
1606
rescue
43
the
37617
film
4441
from
2434
becoming
53
a
18437
disastorous
1
mess
27
.
30743
thankfully
44
,
38769
the
37617
original
313
3
79
heroes
43
become
292
amusing
57
,
38769
with
5345
some
1324
support
48
of
16878
reuben
3
and
18041
garfalo
1
on
3325
screen
309
.
30743
the
37617
whole
206
premise
94
is
12771
rather
320
ridiculous
21
,
38769
but
4079
packs
9
a
18437
few
448
punches
6
to
14987
keep
17

Reading sentence 1000
Reading sentence 1000
