In [24]:
import numpy as np
from collections import defaultdict

class BigramLM:
    """Bigram language model estimated from raw co-occurrence counts.

    The token ``'$'`` is used as the sentence-boundary marker: it is the
    context of the first token of every sentence and the token that follows
    the last one. Probabilities are plain maximum-likelihood estimates
    (bigram count divided by the count of the context token); no smoothing
    is applied.
    """

    def __init__(self):
        # bigram_counts[prev][cur] -> how many times `cur` followed `prev`.
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        # unigram_counts[prev] -> how many times `prev` was seen as a context.
        self.unigram_counts = defaultdict(int)
        # bigramDictionary[(prev, cur)] -> same counts as bigram_counts, flat.
        self.bigramDictionary = defaultdict(int)
        # bigramProbabilities[(prev, cur)] -> count(prev, cur) / count(prev).
        self.bigramProbabilities = defaultdict(int)

    def learn_model(self, dataset):
        """Accumulate bigram counts from *dataset* and refresh probabilities.

        Args:
            dataset: Iterable of sentences, each an iterable of tokens.
                The sentences are read only; they are never modified.

        Counts are added to whatever the model has already seen, so the
        method may be called more than once. An empty sentence contributes
        a single ``('$', '$')`` bigram.
        """
        for sentence in dataset:
            # Build a fresh list instead of appending to the caller's
            # sentence, so the input data is left untouched and training
            # twice on the same corpus does not stack extra '$' markers.
            tokens = list(sentence) + ['$']

            # The end-of-sentence marker doubles as the start-of-sentence
            # context for the first token.
            prev_token = '$'
            for current_token in tokens:
                self.bigram_counts[prev_token][current_token] += 1
                self.unigram_counts[prev_token] += 1
                self.bigramDictionary[(prev_token, current_token)] += 1
                prev_token = current_token
        self.calculate_probability()

    def calculate_probability(self):
        """Recompute ``bigramProbabilities`` from the current counts."""
        for bigram, count in self.bigramDictionary.items():
            # Every counted bigram also incremented the unigram count of its
            # context token, so the denominator is never zero here.
            self.bigramProbabilities[bigram] = count / self.unigram_counts[bigram[0]]

    def generate_next_token(self, prev_token):
        """Sample a token to follow *prev_token*.

        Args:
            prev_token: The context token.

        Returns:
            A token drawn according to the learned bigram probabilities, or
            ``None`` when *prev_token* has no recorded followers.
        """
        # .get() avoids inserting an empty entry into the defaultdict.
        followers = self.bigram_counts.get(prev_token)
        if not followers:
            return None

        next_tokens = list(followers)
        probabilities = [self.bigramProbabilities[(prev_token, token)] for token in next_tokens]

        # Sample an index rather than the token itself so the stored token
        # object is returned as-is instead of a NumPy scalar copy of it.
        index = np.random.choice(len(next_tokens), p=probabilities)
        return next_tokens[index]

# Load the training corpus: one sentence per line, tokens separated by
# whitespace. split() with no arguments already ignores leading and trailing
# whitespace, and turns a blank line into an empty token list.
with open("corpus.txt", "r", encoding="utf-8") as file:
    corpus = [line.split() for line in file]

bigram_model = BigramLM()
bigram_model.learn_model(corpus)

# Print up to 10 tokens, starting from the sentence-boundary marker and
# sampling each following token from the model.
prev_token = "$"
for _ in range(10):
    print(prev_token, end=" ")
    next_token = bigram_model.generate_next_token(prev_token)
    if next_token is None:
        # The model has no continuation for this token (for example when the
        # corpus file is empty); stop rather than printing the text "None".
        break
    prev_token = next_token

$ i feel about their progeny who should be stubborn 