In [1]:
import os
import sys
from collections import defaultdict
import math
import random

# Constants
UNK = "UNK"
START = "<s>"
END = "</s>"

# Read file into corpus
def readFileToCorpus(f):
    if os.path.isfile(f):
        with open(f, "r") as file:
            corpus = [[START] + line.strip().split() + [END] for line in file]
        return corpus
    else:
        print(f"Error: corpus file {f} does not exist")
        sys.exit()

# Preprocess the corpus
def preprocess(corpus):
    freqDict = defaultdict(int)
    for sen in corpus:
        for word in sen:
            freqDict[word] += 1

    for sen in corpus:
        for i in range(len(sen)):
            if freqDict[sen[i]] < 2:
                sen[i] = UNK
    return corpus

# Preprocess test corpus
def preprocessTest(vocab, corpus):
    for sen in corpus:
        for i in range(len(sen)):
            if sen[i] not in vocab:
                sen[i] = UNK
    return corpus

def generateSentencesToFile(model, filename, num_sentences=20):
    with open(filename, "w") as file:
        for _ in range(num_sentences):
            sentence = model.generateSentence()
            file.write(sentence + "\n")

# Language Model Base Class
class LanguageModel:
    def generateSentence(self):
        raise NotImplementedError

    def getSentenceProbability(self, sen):
        raise NotImplementedError

    def getCorpusPerplexity(self, corpus):
        log_prob = 0
        total_words = 0
        for sen in corpus:
            prob = self.getSentenceProbability(sen)
            if prob > 0:
                log_prob += math.log(prob)
                total_words += len(sen) - 1
        return math.exp(-log_prob / total_words) if total_words > 0 else float('inf')

# Unigram Model
class UnigramModel(LanguageModel):
    def __init__(self, corpus):
        self.counts = defaultdict(int)
        self.total = 0
        for sen in corpus:
            for word in sen:
                self.counts[word] += 1
                self.total += 1

    def getSentenceProbability(self, sen):
        prob = 1.0
        for word in sen[1:]:
            prob *= self.counts[word] / self.total
        return prob
    
    def generateSentence(self):
        sentence = [START]
        while sentence[-1] != END:
            next_word = random.choices(list(self.counts.keys()), weights=self.counts.values())[0]
            sentence.append(next_word)
        return " ".join(sentence[1:-1])  # Remove <s> and </s>

# Smoothed Unigram Model
class SmoothedUnigramModel(UnigramModel):
    def getSentenceProbability(self, sen):
        vocab_size = len(self.counts)
        prob = 1.0
        for word in sen[1:]:
            prob *= (self.counts[word] + 1) / (self.total + vocab_size)
        return prob
    
    def generateSentence(self):
        sentence = [START]
        while sentence[-1] != END:
            next_word = random.choices(list(self.counts.keys()), weights=self.counts.values())[0]
            sentence.append(next_word)
        return " ".join(sentence[1:-1])  # Remove <s> and </s>

# Bigram Model
class BigramModel(LanguageModel):
    def __init__(self, corpus):
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        for sen in corpus:
            for i in range(len(sen) - 1):
                self.unigram_counts[sen[i]] += 1
                self.bigram_counts[sen[i]][sen[i+1]] += 1

    def getSentenceProbability(self, sen):
        prob = 1.0
        for i in range(len(sen) - 1):
            if self.unigram_counts[sen[i]] > 0:
                prob *= self.bigram_counts[sen[i]][sen[i+1]] / self.unigram_counts[sen[i]]
            else:
                return 0.0
        return prob
    
    def generateSentence(self):
        sentence = [START]
        while sentence[-1] != END:
            prev_word = sentence[-1]
            if prev_word in self.bigram_counts:
                next_word = random.choices(
                    list(self.bigram_counts[prev_word].keys()),
                    weights=self.bigram_counts[prev_word].values()
                )[0]
            else:
                next_word = END  # If no bigram exists, end the sentence
            sentence.append(next_word)
        return " ".join(sentence[1:-1])  # Remove <s> and </s>

# Smoothed Bigram Model
class SmoothedBigramModel(BigramModel):
    def getSentenceProbability(self, sen):
        vocab_size = len(self.unigram_counts)
        prob = 1.0
        for i in range(len(sen) - 1):
            prob *= (self.bigram_counts[sen[i]][sen[i+1]] + 1) / (self.unigram_counts[sen[i]] + vocab_size)
        return prob
    
    def generateSentence(self):
        sentence = [START]
        while sentence[-1] != END:
            prev_word = sentence[-1]
            if prev_word in self.bigram_counts:
                next_word = random.choices(
                    list(self.bigram_counts[prev_word].keys()),
                    weights=self.bigram_counts[prev_word].values()
                )[0]
            else:
                next_word = END  # If no bigram exists, end the sentence
            sentence.append(next_word)
        return " ".join(sentence[1:-1])  # Remove <s> and </s>

# Main execution
if __name__ == "__main__":
    trainCorpus = preprocess(readFileToCorpus('train.txt'))
    vocab = set(word for sen in trainCorpus for word in sen)
    posTestCorpus = preprocessTest(vocab, readFileToCorpus('pos_test.txt'))
    negTestCorpus = preprocessTest(vocab, readFileToCorpus('neg_test.txt'))
    
    models = {
        "Unigram": UnigramModel(trainCorpus),
        "Smoothed Unigram": SmoothedUnigramModel(trainCorpus),
        "Bigram": BigramModel(trainCorpus),
        "Smoothed Bigram": SmoothedBigramModel(trainCorpus)
    }
    
    for name, model in models.items():
        print(f"{name} Model Perplexity:")
        print(f"Positive Test: {model.getCorpusPerplexity(posTestCorpus)}")
        print(f"Negative Test: {model.getCorpusPerplexity(negTestCorpus)}\n")

    unigram = UnigramModel(trainCorpus)
    smooth_unigram = SmoothedUnigramModel(trainCorpus)
    bigram = BigramModel(trainCorpus)
    smooth_bigram = SmoothedBigramModel(trainCorpus)

    generateSentencesToFile(unigram, "unigram_output.txt")
    generateSentencesToFile(smooth_unigram, "smooth_unigram_output.txt")
    generateSentencesToFile(bigram, "bigram_output.txt")
    generateSentencesToFile(smooth_bigram, "smooth_bigram_kn_output.txt")


Unigram Model Perplexity:
Positive Test: 653.8657692515221
Negative Test: 636.8014658788888

Smoothed Unigram Model Perplexity:
Positive Test: 656.2273466083091
Negative Test: 639.4011766176395

Bigram Model Perplexity:
Positive Test: 47.80126358140742
Negative Test: 49.02762851428299

Smoothed Bigram Model Perplexity:
Positive Test: 1417.7376675989808
Negative Test: 1436.161940495924




#### 1.
The sentence length is typically determined by the probability of encountering an end-of-sentence token (`</s>`) in unigrams. Unigrams consider each word independently without context. Bigrams determine the next word based on the previous word, leading to more structured sentences. This control sentences length more naturally as the likelihood of `</s>` depends on the preceding word and not randomly.

#### 2.
Yes because:
- The unigram model considers independence between words leading to less structured sentences.
- The bigram model assumes context influencing word selection resulting in more realistic sentences.
- The smoothed bigram model adjusts probabilities to handle unseen word pairs altering likelihood of sentences.

#### 3.
The bigram model generally produces more coherent sentences than the unigram model because of word order and context. The smoothed bigram model tends to introduce more randomness due to smoothing leading to unnatural word pairings.

#### 4.
- **Unigram Model Perplexity:**
  - Positive Test: **653.87**
  - Negative Test: **636.80**

- **Smoothed Unigram Model Perplexity:**
  - Positive Test: **656.23**
  - Negative Test: **639.40**

- **Bigram Model Perplexity:**
  - Positive Test: **47.80**
  - Negative Test: **49.03**

- **Smoothed Bigram Model Perplexity:**
  - Positive Test: **1417.74**
  - Negative Test: **1436.16**

For the unigram and smoothed unigram models, the positive test corpus has a slightly higher perplexity. This suggests that the word distribution in the positive test set differs more from the training set. For the bigram and smoothed bigram models, the negative test corpus has a slightly higher perplexity than the positive test corpus. This suggests that smoothing introduces a greater degree of uncertainty, making it less effective in predicting test data compared to the regular bigram model.
