In [1]:
# Imorting necessary libraries
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings('ignore')

# Task 1:-

In [2]:
from collections import defaultdict
from typing import List

# Our Tokenizer Class
class Tokenizer:
    # Class Constructor to declare and initialize variables
    def __init__(self):
        self.vocab = set()  # Vocabulary, unique tokens in the corpus
        self.vocab.add("$")
        self.frequency_corpus = defaultdict(int)  # Unigram counts, frequency of each unique token
        self.merge_rules = []    # All the merge rules learned and to be appended in order as they are learned 

    # a function that learns the vocabulary
    def learn_vocabulary(self, corpus: List[str], num_merges: int):
        # Initialize vocabulary with characters and their frequencies
        
        # Construct a frequency corpus i.e. Unigram counts of unique tokens
        for sentence in corpus:
            for word in sentence.split():
                modified_word = ""
                for char in word:
                    self.vocab.add(char)
                    modified_word = modified_word + char + "-"
                modified_word = modified_word + "$"
                self.frequency_corpus[modified_word] += 1

        # Learn the merge rules
        for merge in range(num_merges):
            pair_frequency = defaultdict(int)     # This stores the character pair frequency for each merge rule iteration 
            
            # Generating pair frequencies
            for word in self.frequency_corpus:
                symbols = word.split("-")
                for i in range(len(symbols) - 1):
                    pair_frequency[(symbols[i], symbols[i + 1])] += 1

            if not pair_frequency:
                break

            # Get the character pair frequency with maximum count
            best_pair = max(pair_frequency, key=pair_frequency.get)
            self.vocab.add(best_pair[0]+best_pair[1])
            
            # Add that pair with maximum frequency in the merge rules if it doesn't exist
            if best_pair not in self.merge_rules:
                self.merge_rules.append(best_pair)

            # Update the frequency corpus for next iteration
            new_frequency_corpus = defaultdict(int)
            for word in self.frequency_corpus:
                new_word = word.replace(best_pair[0]+"-"+best_pair[1], best_pair[0]+best_pair[1])
                new_frequency_corpus[new_word] = self.frequency_corpus[word]
            self.frequency_corpus = new_frequency_corpus
        
        ### WRITING MERGE RULES TO TEXT FILE
        # Clear past contents of the file
        try:
            with open("merge_rules.txt", "w") as file:
                pass
        except Exception as e:
            print(f"Error clearing file: {e}")
            
        # Write all merge_rules to merge_rules.txt for submittables
        for rule in self.merge_rules:
            r = ",".join(rule)
            try:
                with open("merge_rules.txt", "a") as file:
                    file.write(r + "\n")
            except Exception as e:
                print(f"Error writing merge rule to merge_rules.txt: {e}")
            
        
        ### WRITING VOCABULARY TO TEXT FILE
        # Clear past contents of the file
        try:
            with open("vocab.txt", "w") as file:
                pass
        except Exception as e:
            print(f"Error clearing file: {e}")
            
        # Write all tokens found to tokens.txt for submittables
        tokens = list(self.vocab)
        tokens = sorted(tokens, key=lambda x: (len(x), x))
        for token in tokens:
            try:
                with open("vocab.txt", "a") as file:
                    file.write(token + "\n")
            except Exception as e:
                print(f"Error writing token to vocab.txt: {e}")
        
        
    # A fucntion that tokenizes the text based on merge rules
    def tokenize(self, sample: str) -> List[str]:
        actual_tokens = []
        
        # A set to store intermediate tokens
        intermediate_tokens = set()
        
        # Split the snetence into list of words
        sample_list = sample.split()

        for word in sample_list:
            word = word+"$"
            wordpart_list = [] # To keep track of individual characters
            for letter in word:
                wordpart_list.append(letter)
                intermediate_tokens.add(letter)

            # For each rule in merge_rules combine the corresonding pairs of characters in the word
            for rule in self.merge_rules:
                i=0
                while i<len(wordpart_list)-1:
                    if rule[0]+rule[1] == wordpart_list[i]+wordpart_list[i+1]:
                        intermediate_tokens.add(wordpart_list[i]+wordpart_list[i+1])
                        wordpart_list[i] = wordpart_list[i]+wordpart_list[i+1]
                        del wordpart_list[i+1]
                    i = i+1
            # Add the final merged tokens to actual tokens list
            for subpart in wordpart_list:
                actual_tokens.append(subpart)
        print("All found tokens: ", intermediate_tokens)
        
        ### WRITING SAMPLE TOKENS TO TEXT FILE
            
        # Write all tokens found to tokenized_samples.txt for submittables
        try:
            with open("tokenized_samples.txt", "a") as file:
                file.write(",".join(actual_tokens) + "\n")
        except Exception as e:
            print(f"Error writing tokens of sammple to tokenized_samples.txt: {e}") 
        
        
        ### WRITING ALL FOUND TOKENS TO TEXT FILE
            
        # Write all tokens found to tokens.txt for submittables
        tokens = list(intermediate_tokens)
        tokens = sorted(tokens, key=lambda x: (len(x), x))
        for token in tokens:
            try:
                with open("tokens.txt", "a") as file:
                    file.write(token + "\n")
            except Exception as e:
                print(f"Error writing token to tokens.txt: {e}")
             
        print("Tokenized sample: ", actual_tokens)
        return actual_tokens
                


# # Example Usage
# corpus = ["hello", "world", "he", "she", "hers"]
# tokenizer = Tokenizer()
# tokenizer.learn_vocabulary(corpus, num_merges=10)
# sample_text = "helloworld"
# tokens = tokenizer.tokenize(sample_text)
# print(tokens)

In [3]:
# Read corpus from a text file
# corpus_file_path = r"C:\Users\shree\OneDrive\Desktop\IIITD Sem 4\NLP\A1\corpus.txt"
with open("corpus.txt", 'r', encoding='utf-8') as file:
    corpus = file.read().splitlines()

tokenizer = Tokenizer()
tokenizer.learn_vocabulary(corpus, 10)

In [4]:
tokenizer.merge_rules

[('s', '$'),
 ('i', 'n'),
 ('e', '$'),
 ('d', '$'),
 ('e', 'r'),
 ('e', 'd$'),
 ('in', 'g'),
 ('y', '$'),
 ('ing', '$'),
 ('o', 'n')]

In [5]:
tokenizer.vocab

{'$',
 'a',
 'b',
 'c',
 'd',
 'd$',
 'e',
 'e$',
 'ed$',
 'er',
 'f',
 'g',
 'h',
 'i',
 'in',
 'ing',
 'ing$',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'on',
 'p',
 'q',
 'r',
 's',
 's$',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'y$',
 'z'}

In [6]:
tokenizer.tokenize("i love nlp ")

All found tokens:  {'i', 'o', 'p', 'e$', '$', 'n', 'l', 'e', 'v'}
Tokenized sample:  ['i', '$', 'l', 'o', 'v', 'e$', 'n', 'l', 'p', '$']


['i', '$', 'l', 'o', 'v', 'e$', 'n', 'l', 'p', '$']

In [7]:
tokenizer.frequency_corpus

defaultdict(int,
            {'i-$': 3789,
             's-t-a-n-d$': 10,
             'h-er-e$': 37,
             'f-e-e-l-$': 1637,
             'e-m-p-t-y$': 9,
             'a-$': 916,
             'c-l-a-s-s$': 13,
             'p-o-s-t-$': 14,
             'c-o-u-n-t-$': 4,
             'l-in-k-$': 2,
             'h-r-e-f-$': 25,
             'h-t-t-p-$': 30,
             'm-o-o-s-h-i-l-u-$': 1,
             'l-i-t-er-a-l-l-y$': 4,
             'j-u-s-t-$': 226,
             't-e-x-t-$': 3,
             't-y-c-h-e-l-l-e$': 1,
             't-o-$': 1340,
             's-e-e$': 45,
             'i-f-$': 134,
             's-h-e$': 72,
             'w-a-n-t-s$': 7,
             'h-a-n-g-$': 1,
             'o-u-t-$': 113,
             'b-e-c-a-u-s-e$': 183,
             'r-e-a-d-ing$': 13,
             'w-h-a-t-$': 125,
             'w-r-o-t-e$': 4,
             'a-b-o-u-t-$': 298,
             'm-y$': 642,
             'n-on-e-x-i-s-t-e-n-t-$': 1,
             's-o-c-i-a-l-$': 7,


# TASK 2
### Task 2: Subtask 1&2&3: Implement Bigram Langage Model and Smoothing methods in BigramLM class

In [3]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
     ---------------------------------------- 8.4/8.4 MB 14.5 MB/s eta 0:00:00
Collecting filelock (from transformers)
  Downloading filelock-3.13.1-py3-none-any.whl (11 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.3-py3-none-any.whl (330 kB)
     ------------------------------------- 330.1/330.1 kB 21.3 MB/s eta 0:00:00
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.1-cp311-none-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 27.8 MB/s eta 0:00:00
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.2-cp311-none-win_amd64.whl (269 kB)
     ------------------------------------- 269.6/269.6 kB 17.3 MB/s eta 0:00:00
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.19.3->tr


[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Using code from utils.py directly

In [51]:
from transformers import pipeline

classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True,)

def emotion_scores(sample): 
    emotion=classifier(sample)
    return emotion[0]

Some layers from the model checkpoint at bhadresh-savani/distilbert-base-uncased-emotion were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at bhadresh-savani/distilbert-base-uncased-emotion and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Checkign the function emotion_scores() from utils.py
emotion_scores("cheerful")

[{'label': 'sadness', 'score': 0.0013014927972108126},
 {'label': 'joy', 'score': 0.9962917566299438},
 {'label': 'love', 'score': 0.0008716191514395177},
 {'label': 'anger', 'score': 0.0008998109260573983},
 {'label': 'fear', 'score': 0.00034155105822719634},
 {'label': 'surprise', 'score': 0.0002938536345027387}]

### Task 2: SubTask 1 & 2 & 3
### All BigramLM() class methods, Laplace and KneserNey Implementation, and Emotion Incorporation in Standard Bigram Probabilities

In [90]:
import numpy as np
import pandas as pd
from collections import defaultdict
import pickle
import warnings
warnings.filterwarnings('ignore')

class BigramLM:
    def __init__(self):
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        self.bigramDictionary = defaultdict(int)
        self.bigramProbabilities = defaultdict(int)
        self.laplaceProbabilities = defaultdict(int)
        self.kneserneyProbabilities = defaultdict(int)
        self.vocab = set()
        self.discount = 0.75               # Kneser Ney Discount Factor
        self.probability_comparison_frame = pd.DataFrame()
        self.probability_comparison_frame_available = pd.DataFrame()
        self.bigram_emotion_dictionary = defaultdict(list)

    # Task2: Subtask1: method to learn the model
    def learn_model(self, dataset):
        for sentence in dataset:
            tokens = sentence
            tokens.append('$')  # Adding End of Sentence marker
            
            # Construct a bigram and unigram counts dictionary for further reference
            for i in range(len(tokens)):
                prev_token = tokens[i - 1]
                current_token = tokens[i]

                self.bigram_counts[prev_token][current_token] += 1
                self.unigram_counts[prev_token] += 1
                self.vocab.add(prev_token)
                self.bigramDictionary[(prev_token, current_token)] += 1

        self.calculate_probability()
        # self.add_emotion_component_to_bigram_probabilities() # Commenting as already created and stored using pickle
        self.import_stored_pickle_file()
#         self.build_laplace_and_kneserNey_probabilities()
    
    def build_laplace_and_kneserNey_probabilities(self):
        for bigram in self.bigramDictionary:
            self.laplaceProbabilities[bigram] = self.laplace_smoothing_probability(bigram)
            self.kneserneyProbabilities[bigram] = self.kneserney_smoothing_probability(bigram)
            
    # Function to calculate the emotion-oriented bigram probability
    def emotion_bigram_probability(self, bigram, emotion, beta):
        # Get emotion scores for the current word
        # emotion_scores_wi = emotion_scores(bigram[0]+bigram[1])

        # Calculate the emotion component (beta) based on the desired emotion (e.g., joy)
        # emotion_factor = emotion_scores_wi[emotion] if emotion in emotion_scores_wi else 0.0
        
        emotion_factor = 0.5

        for d in self.bigram_emotion_dictionary[bigram][1]:
            if d['label'] == emotion:
                emotion_factor = d['score']
                break

        # Calculate the emotion-oriented bigram probability
        emotion_oriented_bigram_probability = (self.bigramProbabilities[bigram]) + (beta * emotion_factor)
        return emotion_oriented_bigram_probability
    
    # Function to construct a sepearte dictionary with bigram_probabilty scores and corresponding emotion scores of each bigram
    # and later saving it to pickle for future reference. this function will only be called once and later saved pickle file will be used 
    def add_emotion_component_to_bigram_probabilities(self):
        for bigram in self.bigramProbabilities:
            self.bigram_emotion_dictionary[bigram] = [self.bigramProbabilities[bigram], emotion_scores(bigram[0]+" "+bigram[1])]
        # Save the dictionary to a file using pickle
        with open("bigram_emotion_dictionary.pkl", 'wb') as file:
            pickle.dump(self.bigram_emotion_dictionary, file)
     
    # Function to import previously saved pickle file, in previous defined function
    def import_stored_pickle_file(self):
        # Load the dictionary from the saved file
        with open("bigram_emotion_dictionary.pkl", 'rb') as file:
            self.bigram_emotion_dictionary = pickle.load(file)
      
    # Generate the most probable next token given a specific emotion
    def generate_next_token_by_emotion(self, prev_token, emotion, beta=0.1):
        if prev_token in self.bigram_counts:
            next_tokens = list(self.bigram_counts[prev_token].keys())
            probabilities = np.array([self.emotion_bigram_probability((prev_token, token), emotion, beta) for token in next_tokens])
            
            probabilities = probabilities/probabilities.sum()
            next_token = np.random.choice(next_tokens, p=probabilities)
            return next_token

        return None

    # Function for generating samples corresponding to a specific emotion (e.g., joy)
    def generate_emotion_oriented_samples(self, emotion, beta=0.1, num_samples=5, start_token="$"):
        if len(start_token.split()) ==0:
            start_token = "$"    
        start_token = start_token.split()[-1]
        
        if num_samples>300:
            print("Can generate only upto 300 samples")
            
#         if num_samples<=50:
#             iterations = num_samples*6
#         else:
#             iterations = 300

        iterations = num_samples
        
        # First store all the generated sentences in list
        generated_sentences = []
        
        # For each iteration, generate 1 sentence
        for _ in range(iterations):
            sample_list = []
            prev_token = start_token
            for i in range(10):
                next_token = self.generate_next_token_by_emotion(prev_token, emotion, beta)
                print(prev_token, end=" ")
                sample_list.append(prev_token)
                prev_token = next_token
            print()
            generated_sentences.append(" ".join(sample_list))
            
#         top_k = self.get_top_sentences_by_key(generated_sentences, emotion, num_samples)

        try:
            with open(f"{emotion}.txt", 'w') as file:
                pass  # Doing nothing inside the block, just opening in write mode to clear content
        except Exception as e:
            print(f"Error clearing file: {e}")
        # Save the generated sentences to corresponding text files
        for element in generated_sentences:
            print(element)
            try:
                # Open the file in append mode
                with open(f"{emotion}.txt", 'a') as file:
                    # Append the string to the end of the file
                    file.write(element + '\n')
            except Exception as e:
                print(f"Error writing sample to file: {e}")      
        print()
    
    # Rank top 50 sentences by their specific emotion scores and return  
    def get_top_sentences_by_key(self, input_sentences, emotion, k):
        # Apply the black-box function to each sentence and create a list of dictionaries
        sample_level_scores = dict()
        for sentence in input_sentences:
            escores = emotion_scores(sentence)
            escore = 0.5 
            for d in escores:
                if d['label']==emotion:
                    escore = d['score']
                    break
            sample_level_scores[sentence] = escore    
        sorted_items = sorted(sample_level_scores.items(), key=lambda x: x[1], reverse=True)  
        top_keys = [key for key, value in sorted_items[:k]]
        return top_keys

    # Generate the bigram dictionary     
    def calculate_probability(self):
        for bigram in self.bigramDictionary:
            self.bigramProbabilities[bigram] = (self.bigramDictionary[bigram]) / (self.unigram_counts[bigram[0]])
    
    # Task 2 : SubTask 2: Implement laplace smoothing
    def laplace_smoothing_probability(self, bigram):
        prefix_count = self.unigram_counts[bigram[0]]
        if bigram not in self.bigramDictionary:
            return ((1)/(prefix_count+len(self.vocab))) 
        
        bigram_count = self.bigramDictionary[bigram]
        return ((bigram_count+1)/(prefix_count+len(self.vocab)))      
        
    # Task 2 : SubTask 2: Implement Kneser Ney smoothing
    def kneserney_smoothing_probability(self, bigram):
        prefix_count = self.unigram_counts[bigram[0]]
        bigram_count = 0
        if bigram in self.bigramDictionary:
            bigram_count = self.bigramDictionary[bigram]
        
        bigram_types_with_suffix = len([x for x in self.bigramDictionary if x[1]==bigram[1]]) # fixed suffix, variable prefix(wi-1)
        bigram_types_with_prefix = len([x for x in self.bigramDictionary if x[0]==bigram[0]]) # fixed prefix, variable suffix(wi)
        total_bigram_types = len(self.bigramDictionary)
        
        discounted_prob = max(bigram_count - self.discount, 0) / prefix_count
        alpha_parameter = (self.discount / prefix_count) * bigram_types_with_prefix
        pcontinuation = bigram_types_with_suffix / total_bigram_types
        
        return discounted_prob + alpha_parameter * pcontinuation
           
    # Generate next token by standard bigram probability 
    def generate_next_token(self, prev_token):
        if prev_token in self.bigram_counts:
            next_tokens = list(self.bigram_counts[prev_token].keys())
            probabilities = [self.bigramProbabilities[(prev_token, token)] for token in next_tokens]
            next_token = np.random.choice(next_tokens, p=probabilities)
            return next_token

        return None

    # Generate next token by Laplace probability
    def generate_next_token_using_laplace(self, prev_token):
        next_tokens = [token for token in self.unigram_counts]
        
        probabilities = np.array([self.laplace_smoothing_probability((prev_token, token)) for token in next_tokens])
        
        next_token = np.random.choice(next_tokens, p=probabilities)
        return next_token

    # Generate next token by Kneser Ney probability
    def generate_next_token_using_kneserney(self, prev_token):
        next_tokens = [token for token in self.unigram_counts]
        
        probabilities = np.array([self.kneserney_smoothing_probability((prev_token, token)) for token in next_tokens])
        next_token = np.random.choice(next_tokens, p=probabilities)
        return next_token
        
    # Generate given number of sentences by standard bigram porbabilty
    def generate_sentences_standard_bigram(self, num_samples, start_token="$"):
        if len(start_token.split()) ==0:
            start_token = "$"
            
        start_token = start_token.split()[-1]
        for x in range(num_samples):
            prev_token = start_token
            for i in range(10):
                next_token = self.generate_next_token(prev_token)
                print(prev_token, end=" ")
                prev_token = next_token
            print()
        print()
        
    # Generate given number of sentences by Laplace porbabilty
    def generate_sentences_laplace(self, num_samples, start_token="$"):
        if len(start_token.split()) ==0:
            start_token = "$"
            
        start_token = start_token.split()[-1]
        for x in range(num_samples):
            prev_token = start_token
            for i in range(10):
                next_token = self.generate_next_token_using_laplace(prev_token)
                print(prev_token, end=" ")
                prev_token = next_token
            print()
        print()
        
    # Generate given number of sentences by KneserNey porbabilty        
    def generate_sentences_kneserney(self, num_samples, start_token="$"):
        if len(start_token.split()) ==0:
            start_token = "$"
            
        start_token = start_token.split()[-1]
        for x in range(num_samples):
            prev_token = start_token
            for i in range(10):
                next_token = self.generate_next_token_using_kneserney(prev_token)
                print(prev_token, end=" ")
                prev_token = next_token
            print()
        print()
       
    # Task2: Subtask2 : Function to compare and return the Laplace and kneserNey probabilities
    def compare_probabilities(self, prev_token="$"):
        if len(prev_token.split()) ==0:
            prev_token = "$"
            
        prev_token = prev_token.split()[-1]
        if prev_token in self.bigram_counts:
            available_next_tokens = list(self.bigram_counts[prev_token].keys())
            probabilities_bigram = [self.bigramProbabilities[(prev_token, token)] for token in available_next_tokens]
            
            all_tokens = np.array([token for token in self.unigram_counts])
            probabilities_laplace = np.array([self.laplace_smoothing_probability((prev_token, token)) for token in self.unigram_counts])
            probabilities_kneserney = np.array([self.kneserney_smoothing_probability((prev_token, token)) for token in self.unigram_counts])

            # Set display options for float formatting
            pd.set_option('display.float_format', '{:.8f}'.format)
            # Create Probability DataFrame
            self.probability_comparison_frame = pd.DataFrame({
                'Token': all_tokens,
                'Laplace Probability': probabilities_laplace,
                'KneserNey Probability': probabilities_kneserney
            })
            
            self.probability_comparison_frame_available = self.probability_comparison_frame[self.probability_comparison_frame["Token"].isin(available_next_tokens)]
            self.probability_comparison_frame_available["Bigram Probability"] = probabilities_bigram
            print("Probability Comparison of All Tokens")
            print(self.probability_comparison_frame)
            
            print("\nProbability Comparison of Next Available Tokens in Bigram Dictionary")
            print(self.probability_comparison_frame_available)
            return

        return None

with open("corpus.txt", "r", encoding="utf-8") as file:
    corpus = [line.strip().split() for line in file] 

bigram_model = BigramLM()
bigram_model.learn_model(corpus)

## Task2 : Subtask 1: Generate some sentences

In [9]:
# generate_sentences_standard_bigram(number_of_samples, start_token->optional)
bigram_model.generate_sentences_standard_bigram(10)

$ i feel ecstatic and regard to a particularly mad 
$ i am earning again this triangle $ i feel 
$ im in the fierce looking for hussein whom i 
$ i m going to make some sort of my 
$ i could better on me more confident a purchase 
$ i feel the perfect fit all the fierce feels 
$ i like $ i get this time in high 
$ im really nice treat $ im not achieved and 
$ im suppose its so im feeling like this story 
$ i feel very loved $ i am a way 



In [10]:
# generate_sentences_laplace(number_of_samples, start_token->optional)
bigram_model.generate_sentences_laplace(10)

$ nude advice tend waking seek whats holy gosh sharing 
$ apocalypses chocolating downright oblige disbelieving various accustomed outraged stranger 
$ unusual if impostor roads demon generational aa shower hold 
$ buffed frizz trials forest recently increasingly goes space sale 
$ i can fandoms zenos putting warfighter distinct comfort disgusting 
$ i feel facebook devoted burgeoning wimpy feel all numbing 
$ i don southern midori serving fell news seen wondering 
$ hart greater every depending child arun renowned desperate close 
$ unimportant checked fangirls artisan army exhausted facebook tasks uncoiling 
$ vibe securities homis pine radiator vunerable bringing inspiring retrospect 



In [31]:
# generate_sentences_kneserney(number_of_samples, start_token->optional)
bigram_model.generate_sentences_kneserney(10)

$ i loki song considering mistake as the culprit death 
$ i just whats cleaning you know what i can 
$ i m about the down to others watched the 
$ i really dramatically for minds $ i have to 
$ i felt at my to sit it bounces lately 
$ i can way it table s to right now 
$ i just miss the universe am desperately i started 
$ i feel so im feeling a bit to do 
$ i am $ i am feeling rather than i 
$ i have found this isnt concerns to not idea 



## Task2 : Subtask 2: Compare Laplace and KneserNey

In [58]:
bigram_model.compare_probabilities()

Probability Comparison of All Tokens
           Token  Laplace Probability  KneserNey Probability
0              $           0.00012771             0.00023928
1              i           0.26934866             0.87810280
2          stand           0.00012771             0.00000156
3           here           0.00012771             0.00000623
4           feel           0.00012771             0.00002181
...          ...                  ...                    ...
5425      google           0.00012771             0.00000019
5426  stellarium           0.00012771             0.00000019
5427       theyd           0.00012771             0.00000019
5428       peter           0.00012771             0.00000019
5429      robbed           0.00012771             0.00000019

[5430 rows x 3 columns]

Probability Comparison of Next Available Tokens in Bigram Dictionary
        Token  Laplace Probability  KneserNey Probability  Bigram Probability
1           i           0.26934866             0.87810280 

## Argument:

As we can see, their is a huge difference in Laplace probabilities and original Bigram probabilities. This is because Laplace smoothing steals a large amount of probabilities from non-zero counts of tokens to distribute it into tokens with zero occurrences. Thus in Laplace smoothing, the reconstructed counts of non-zero tokens could change largely sometimes by a factor of 10 from their original counts. For example, token "i" has a Laplace probability of 0.269 compared to its original probability of 0.87 i.e. a 1/3rd of difference from original probability while KneserNey has somewhat managed to maintain the probability in proportion with original probability.
Thus Laplace haven't worked in our Bigram model well and doesn't work well in general for n-grams.

On the other hand, Kneser-Ney has somewhat maintained the probabilities with original Bigram probabilities compared to Laplace.

Thus KneserNey smoothing is better than Laplace in this case

## Task2 : Subtask 3: Output

In [11]:
# Checkign the bigram emotion dictionary that we build
bigram_model.bigram_emotion_dictionary

defaultdict(list,
            {('$', 'i'): [0.8783333333333333,
              [{'label': 'sadness', 'score': 0.10423965752124786},
               {'label': 'joy', 'score': 0.22589899599552155},
               {'label': 'love', 'score': 0.009581828489899635},
               {'label': 'anger', 'score': 0.5985594391822815},
               {'label': 'fear', 'score': 0.05375984311103821},
               {'label': 'surprise', 'score': 0.007960204035043716}]],
             ('i', 'stand'): [0.0005278437582475588,
              [{'label': 'sadness', 'score': 0.03286624699831009},
               {'label': 'joy', 'score': 0.6205080151557922},
               {'label': 'love', 'score': 0.005959461908787489},
               {'label': 'anger', 'score': 0.3174057602882385},
               {'label': 'fear', 'score': 0.02032058872282505},
               {'label': 'surprise', 'score': 0.002939873840659857}]],
             ('stand', 'here'): [0.1,
              [{'label': 'sadness', 'score': 0.01706150360

# Task 2: Subtask 4: Extrinsic Evaluation
### Part A: Generateing 50 samples for each emotion and save them in text files simultaneously

In [99]:
# Initially define how many emotions we have
emotions = ["sadness", "joy", "love", "anger", "fear", "surprise"]

# For each probable emotion, call the generate_emotion_oriented_samples(emotion, beta, num_of_samples) method from class
# BigramLM() to generate 50 samples for each emotion and save them in .txt files
for emotion_ in emotions:
    print("\nSamples for ", emotion_)
    bigram_model.generate_emotion_oriented_samples(emotion_, 0.75, 50)


Samples for  sadness
$ a dark getting a retired wedding photographer $ i 
$ occured while $ during a synonym for an apology 
$ i often go out $ during the need drugs 
$ i suffer i is fake people can put upon 
$ ill also now unsuccessful $ i lack of color 
$ i split up on overcoming the loss of pathetic 
$ in stores because as worthless person $ i miss 
$ when the sentence in finance as time of all 
$ im usually even after my future $ i left 
$ ill also ached $ id an unwelcome in wide 
$ occured while $ ill get disturbed that suffer the 
$ on its inadequate or forget about rabbits i start 
$ ive never being hurt you always am left me 
$ i blanked a class the whitley family $ i 
$ i view when i exhausted all unloved $ i 
$ im aching for netflix as worthless person as ugly 
$ this pregnancy $ ill screw over reacting over mom 
$ ill get on the minority in new $ on 
$ on each and at peace love eating fresh new 
$ ill laugh because as time to sleep i exhausted 
$ i forgot my toes and start 

$ i clung onto them $ i shared and art 
$ i wished would be loving kindness allow us at 
$ i liked doing something lovely yarn and caring who 
$ i love and all hot radiator $ i desire 
$ i wished would love me $ i liked how 
$ occured while everyone else was so devoted my love 
$ i need to people in love of designer shoes 
$ i feel beloved your lovely now im craving i 
$ im craving addiction to kiss me $ i liked 
$ i thank him to whether youre feeling that passionate 
$ i admire her tender about supporting other and since 
$ im still getting credit equals getting caught my lovely 
$ i admire her close enough $ i was hot 
$ i admire her tender little tender smile amused smile 
$ ive learned a gentle rippling through loving simple shape 
$ i liked the delicate and resistant for caring who 
$ i admire her close enough about anime fandom in 
$ ill desperately fond memories $ i empathize with the 
$ i love the loyal reading viewing the cross cynical 
$ i empathize with you believe there $ i

$ the surface giving them meditating with how distraught he 
$ i wondered why i floated in outfit $ i 
$ i so many amazing chapter in zambia $ i 
$ ive been needing quite unclear on my skin i 
$ i found herself in oregon $ i must always 
$ i saw a room to droop and wound up 
$ im constantly paranoid that our progress $ i grabbed 
$ during most important to reflect on their feelings of 
$ im around boys this final one girls in zombie 
$ id know f t felt $ in washington so 
$ i visited the a plane to effect real life 
$ i woke up my footing and actually doing something 
$ im around these because arun has somehow not curl 
$ i turned out fearful anxiety is slowly killing me 
$ i stopped feeling shaken as online who is that 
$ i sensed he get confused by to dubstep when 
$ i almost flat $ i recall being married i 
$ i wondered would cause of ideas how desperately curious 
$ when someone did laps and went down when conversing 
$ i floated in stores because if im down when 
$ i caught my thi

# SVC TFIDF
### Part B and Part C

In [100]:
emotion_data = {}
for emotion in emotions:            
    with open(f"{emotion}.txt", "r", encoding="utf-8") as file:
        emotion_data[f"{emotion}_samples"] = [line.strip() for line in file]

In [101]:
emotion_data

{'sadness_samples': ['$ a dark getting a retired wedding photographer $ i',
  '$ occured while $ during a synonym for an apology',
  '$ i often go out $ during the need drugs',
  '$ i suffer i is fake people can put upon',
  '$ ill also now unsuccessful $ i lack of color',
  '$ i split up on overcoming the loss of pathetic',
  '$ in stores because as worthless person $ i miss',
  '$ when the sentence in finance as time of all',
  '$ im usually even after my future $ i left',
  '$ ill also ached $ id an unwelcome in wide',
  '$ occured while $ ill get disturbed that suffer the',
  '$ on its inadequate or forget about rabbits i start',
  '$ ive never being hurt you always am left me',
  '$ i blanked a class the whitley family $ i',
  '$ i view when i exhausted all unloved $ i',
  '$ im aching for netflix as worthless person as ugly',
  '$ this pregnancy $ ill screw over reacting over mom',
  '$ ill get on the minority in new $ on',
  '$ on each and at peace love eating fresh new',
  '$ i

In [102]:
# Combining Data
generated_samples = []
generated_labels = []


for emotion in emotion_data:
    for emotion_sample in emotion_data[emotion]:             
        generated_samples.append(emotion_sample)
        generated_labels.append(emotion.split("_")[0])
    

In [103]:
# Check the generated samples retreieved from each of <emotion>.txt files
generated_samples

['$ a dark getting a retired wedding photographer $ i',
 '$ occured while $ during a synonym for an apology',
 '$ i often go out $ during the need drugs',
 '$ i suffer i is fake people can put upon',
 '$ ill also now unsuccessful $ i lack of color',
 '$ i split up on overcoming the loss of pathetic',
 '$ in stores because as worthless person $ i miss',
 '$ when the sentence in finance as time of all',
 '$ im usually even after my future $ i left',
 '$ ill also ached $ id an unwelcome in wide',
 '$ occured while $ ill get disturbed that suffer the',
 '$ on its inadequate or forget about rabbits i start',
 '$ ive never being hurt you always am left me',
 '$ i blanked a class the whitley family $ i',
 '$ i view when i exhausted all unloved $ i',
 '$ im aching for netflix as worthless person as ugly',
 '$ this pregnancy $ ill screw over reacting over mom',
 '$ ill get on the minority in new $ on',
 '$ on each and at peace love eating fresh new',
 '$ ill laugh because as time to sleep i exh

In [104]:
# Check the corresponding generated labels
generated_labels

['sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',
 'joy',


In [105]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Load the original corpus and labels
with open('corpus.txt', 'r', encoding='utf-8') as file:
    corpus = file.read().splitlines()

with open('labels.txt', 'r', encoding='utf-8') as file:
    labels = file.read().splitlines()

# Assuming 'generated_samples' and 'generated_labels' are lists containing the generated samples and labels
# X_train, X_test, y_train, y_test = train_test_split(corpus + generated_samples, labels + generated_labels, test_size=0.2, random_state=42)


# Define the pipeline with TF-IDF vectorizer and SVC classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC())
])

# # Define parameter grid for Grid Search
# param_grid = {
#     'tfidf__max_features': [5000, 10000, 20000],
#     'svc__C': [0.1, 1, 10],
#     'svc__kernel': ['linear', 'rbf']
# }

# Define parameter grid for Grid Search
param_grid = {
    'tfidf__max_features': [5000, 10000, 20000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Adjust n-gram range
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto', 0.1, 1.0],  # Adjust gamma
    'svc__class_weight': [None, 'balanced']  # Adjust class weight
}

# Perform Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=10, n_jobs=-1)
# Train
X_train = corpus
y_train = labels
X_test = generated_samples
y_test = generated_labels

grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validated Score:", grid_search.best_score_)

Best Parameters: {'svc__C': 10, 'svc__class_weight': None, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}
Best Cross-Validated Score: 0.765


In [106]:


# Use the best model to predict on the test set
y_pred_by_model = grid_search.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred_by_model)
classification_rep = classification_report(y_test, y_pred_by_model)

print("\nAccuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)


Accuracy: 0.5733333333333334

Classification Report:
               precision    recall  f1-score   support

       anger       0.47      0.30      0.37        50
        fear       0.63      0.54      0.58        50
         joy       0.36      0.40      0.38        50
        love       0.86      0.84      0.85        50
     sadness       0.42      0.72      0.53        50
    surprise       0.91      0.64      0.75        50

    accuracy                           0.57       300
   macro avg       0.61      0.57      0.58       300
weighted avg       0.61      0.57      0.58       300



### Task 2: Report Submittables 

In [72]:
bigram_model.bigramProbabilities

defaultdict(int,
            {('$', 'i'): 0.8783333333333333,
             ('i', 'stand'): 0.0005278437582475588,
             ('stand', 'here'): 0.1,
             ('here', 'i'): 0.16216216216216217,
             ('i', 'feel'): 0.2684085510688836,
             ('feel', 'empty'): 0.0012217470983506415,
             ('empty', 'a'): 0.1111111111111111,
             ('a', 'class'): 0.001091703056768559,
             ('class', 'post'): 0.07692307692307693,
             ('post', 'count'): 0.07142857142857142,
             ('count', 'link'): 0.25,
             ('link', 'href'): 0.5,
             ('href', 'http'): 1.0,
             ('http', 'mooshilu'): 0.03333333333333333,
             ('mooshilu', '$'): 1.0,
             ('i', 'literally'): 0.0002639218791237794,
             ('literally', 'just'): 0.25,
             ('just', 'text'): 0.004424778761061947,
             ('text', 'tychelle'): 0.3333333333333333,
             ('tychelle', 'to'): 1.0,
             ('to', 'see'): 0.01119402985074

In [74]:
print("\nTop 5 Bigrams before Smoothing: ")
top_keys = sorted(bigram_model.bigramProbabilities, key=lambda k: bigram_model.bigramProbabilities[k], reverse=True)[:5]
for x in top_keys:
    print(x, bigram_model.bigramProbabilities[x])

print("\nTop 5 Bigrams after Laplace Smoothing: ")
top_keys = sorted(bigram_model.laplaceProbabilities, key=lambda k: bigram_model.laplaceProbabilities[k], reverse=True)[:5]
for x in top_keys:
    print(x, bigram_model.laplaceProbabilities[x])

print("\nTop 5 Bigrams after Kneser-Ney Smoothing: ")
top_keys = sorted(bigram_model.kneserneyProbabilities, key=lambda k: bigram_model.kneserneyProbabilities[k], reverse=True)[:5]
for x in top_keys:
    print(x, bigram_model.kneserneyProbabilities[x])



Top 5 Bigrams before Smoothing: 
('href', 'http') 1.0
('mooshilu', '$') 1.0
('tychelle', 'to') 1.0
('hang', 'out') 1.0
('nonexistent', 'social') 1.0

Top 5 Bigrams after Laplace Smoothing: 
('$', 'i') 0.2693486590038314
('i', 'feel') 0.11042412409155006
('feel', 'like') 0.035092684307343996
('i', 'am') 0.03189066059225513
('$', 'im') 0.027203065134099615

Top 5 Bigrams after Kneser-Ney Smoothing: 
('don', 't') 0.970358782691682
('href', 'http') 0.9700023363576185
('didn', 't') 0.9583657827447011
('sort', 'of') 0.9565093900961343
('supposed', 'to') 0.9183832405280168
