In [140]:
from functools import reduce

In [49]:
# Toy corpus: one sentence per string, words separated by single spaces.
corpus = [
    "I am Subhranil", 
    "Subhranil I am",
    "I do not like green eggs and ham"
]

In [81]:
def augment_sentence(corpus):
    """
    Wrap every sentence of ``corpus`` in ``<s>`` / ``</s>`` boundary markers.

    Returns a new list, in the same order as the input, where each sentence
    is prefixed with ``"<s> "`` and suffixed with ``" </s>"``.
    """
    augmented = []
    for sent in corpus:
        augmented.append(f"<s> {sent} </s>")
    return augmented

In [82]:
# Add the <s> / </s> sentence markers, then display the augmented corpus.
aug_corpus = augment_sentence(corpus)
aug_corpus

['<s> I am Subhranil </s>',
 '<s> Subhranil I am </s>',
 '<s> I do not like green eggs and ham </s>']

In [136]:
def get_unigrams(aug_corpus):
    """
    Flatten the augmented corpus into one list of tokens (unigrams).

    Parameters
    ----------
    aug_corpus : iterable of str
        Sentences whose tokens are separated by single spaces.

    Returns
    -------
    list of str
        Every token of every sentence, in corpus order. Duplicates are kept,
        so ``list.count`` on the result gives a token's frequency. An empty
        corpus yields an empty list.
    """
    # A flattening comprehension replaces ``reduce(lambda x, y: x + y, ...)``,
    # which raised TypeError on an empty corpus (no initial value) and
    # re-copied the accumulated list once per sentence.
    return [token for sent in aug_corpus for token in sent.split(" ")]

In [137]:
# Flat token list for the whole corpus; p() uses its counts as the denominator.
unigrams = get_unigrams(aug_corpus)
print(unigrams)

['<s>', 'I', 'am', 'Subhranil', '</s>', '<s>', 'Subhranil', 'I', 'am', '</s>', '<s>', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '</s>']


In [151]:
def get_bigrams(aug_corpus):
    """
    Collect every adjacent token pair (bigram) in the augmented corpus.

    Pairs are formed inside each sentence only, so no bigram joins the last
    token of one sentence to the first token of the next.

    Parameters
    ----------
    aug_corpus : iterable of str
        Sentences whose tokens are separated by single spaces.

    Returns
    -------
    list of tuple
        ``(previous_token, next_token)`` pairs in corpus order, duplicates
        kept. Sentences with fewer than two tokens contribute nothing.
    """
    bigrams = []
    for sent in aug_corpus:
        words = sent.split(" ")
        # zip stops at the shorter input, so each token is paired with its
        # successor and the final token starts no pair.
        bigrams.extend(zip(words, words[1:]))
    return bigrams

In [152]:
# All within-sentence adjacent token pairs; p() uses their counts as the numerator.
bigrams = get_bigrams(aug_corpus)
print(bigrams)

[('<s>', 'I'), ('I', 'am'), ('am', 'Subhranil'), ('Subhranil', '</s>'), ('<s>', 'Subhranil'), ('Subhranil', 'I'), ('I', 'am'), ('am', '</s>'), ('<s>', 'I'), ('I', 'do'), ('do', 'not'), ('not', 'like'), ('like', 'green'), ('green', 'eggs'), ('eggs', 'and'), ('and', 'ham'), ('ham', '</s>')]


In [147]:
def p(w1, w2):
    """
    Maximum-likelihood bigram probability P(w1 | w2).

    Mind the argument order: ``w2`` is the *preceding* word (the context) and
    ``w1`` is the word that follows it, so the estimate is
    ``count((w2, w1)) / count(w2)``.

    Reads the notebook-level ``bigrams`` and ``unigrams`` lists.

    Returns
    -------
    float
        The relative frequency, or ``0.0`` when ``w2`` never occurs in
        ``unigrams`` (previously this raised ZeroDivisionError).
    """
    context_count = unigrams.count(w2)
    if context_count == 0:
        # Unseen context: no evidence for any continuation.
        return 0.0
    return bigrams.count((w2, w1)) / context_count

In [148]:
# P(</s> | Subhranil): how often "Subhranil" is the last word of a sentence.
p('</s>', 'Subhranil')

0.5

In [149]:
# P(Subhranil | am): how often "am" is immediately followed by "Subhranil".
p('Subhranil', 'am')

0.5

# Convert into a `class`

In [158]:
class NGram:
    """
    N-Gram Model

    Builds unigram and bigram lists from a corpus and exposes the
    maximum-likelihood bigram estimate through ``mle``.

    Attributes
    ----------
    corpus : the sentences passed to the constructor.
    aug_corpus : list of str, each sentence wrapped in ``<s>`` / ``</s>``.
    unigrams : list of str, every token of ``aug_corpus`` in order.
    bigrams : list of tuple, every within-sentence adjacent token pair.
    """

    def __init__(self, corpus):
        """
        Store the corpus and precompute its augmented form, unigrams and bigrams
        """
        self.corpus = corpus
        self.aug_corpus = self._augment_corpus()
        self.unigrams = self._get_unigrams()
        self.bigrams = self._get_bigrams()

    def _augment_corpus(self):
        """
        Adding <s> and </s> to each sentence of the corpus
        """
        return [f"<s> {sent} </s>" for sent in self.corpus]

    def _get_unigrams(self):
        """
        Get Unigrams from the augmented corpus

        Returns a flat list of tokens (split on single spaces), duplicates
        kept. An empty corpus yields an empty list.
        """
        # A flattening comprehension instead of reduce(): reduce without an
        # initial value raises TypeError when the corpus is empty.
        return [token for sent in self.aug_corpus for token in sent.split(" ")]

    def _get_bigrams(self):
        """
        Get Bigrams from the augmented corpus

        Returns a list of (previous_token, next_token) tuples formed inside
        each sentence; no pair spans two sentences.
        """
        bigrams = []
        # Must iterate this instance's corpus; reading a notebook-level
        # `aug_corpus` would ignore the corpus given to the constructor.
        for sent in self.aug_corpus:
            words = sent.split(" ")
            bigrams.extend(zip(words, words[1:]))
        return bigrams

    def mle(self, w1, w2):
        """
        Calculate Maximum likelihood estimation of P(w1 | w2)

        ``w2`` is the preceding word (context) and ``w1`` the word following
        it: count((w2, w1)) / count(w2). Returns 0.0 when ``w2`` does not
        occur in the corpus instead of dividing by zero.
        """
        context_count = self.unigrams.count(w2)
        if context_count == 0:
            return 0.0
        return self.bigrams.count((w2, w1)) / context_count

In [155]:
# Build the model from the raw corpus; __init__ augments it and extracts the n-grams.
n_gram = NGram(corpus)

In [160]:
# P(I | <s>): fraction of sentence starts that are immediately followed by "I".
n_gram.mle("I", "<s>")

0.6666666666666666