In [58]:
import argparse
from itertools import product
import math
import nltk
import os
from pathlib import Path

# Sentence-start marker. The trailing space is intentional: add_sentence_tokens
# repeats this string and places it directly in front of the sentence text.
SOS = "<s> "
# Sentence-end marker appended (after a space) to every sentence.
EOS = "</s>"
# Placeholder that replace_singletons substitutes for tokens seen only once.
UNK = "<UNK>"

In [59]:
def add_sentence_tokens(sentences, n):
    """Wrap every sentence in start-of-sentence and end-of-sentence markers.

    The start marker SOS is repeated n-1 times when n > 1 and once otherwise;
    a single EOS marker is appended after a separating space.

    Args:
        sentences: iterable of sentences to wrap.
        n: n-gram order that determines how many start markers are added.

    Returns:
        list of strings, one per input sentence.
    """
    repeats = n - 1 if n > 1 else 1
    prefix = SOS * repeats

    wrapped = []
    for sentence in sentences:
        wrapped.append('{}{} {}'.format(prefix, sentence, EOS))
    return wrapped


In [60]:
def replace_singletons(tokens):
    """Replace every token that occurs exactly once with the UNK placeholder.

    Args:
        tokens: sequence of tokens (it is traversed twice: once to count,
            once to build the result).

    Returns:
        list of the same length in which tokens with a count of 1 (or less)
        have been swapped for UNK.
    """
    counts = nltk.FreqDist(tokens)

    replaced = []
    for token in tokens:
        if counts[token] > 1:
            replaced.append(token)
        else:
            replaced.append(UNK)
    return replaced

In [61]:
def preprocess(sentences, n):
    """Turn raw sentences into the flat token list a model is built from.

    Each sentence is wrapped in start/end markers by add_sentence_tokens, the
    wrapped sentences are joined and split into tokens on whitespace, and
    tokens that occur only once are replaced via replace_singletons.

    Args:
        sentences: iterable of sentence strings.
        n: n-gram order, forwarded to add_sentence_tokens.

    Returns:
        list of token strings.
    """
    sentences = add_sentence_tokens(sentences, n)
    # split() without an argument collapses runs of whitespace, so blank
    # lines and doubled spaces no longer yield empty-string tokens that
    # would otherwise be counted as a vocabulary word (split(' ') did that).
    tokens = ' '.join(sentences).split()
    tokens = replace_singletons(tokens)
    return tokens

In [62]:
def load_data(data_dir):
    """Read the training and test corpora from a directory.

    Expects the files 'train.txt' and 'test.txt' inside data_dir, both
    decoded as UTF-8.

    Args:
        data_dir: path of the directory holding the two files.

    Returns:
        tuple (train, test): two lists with one whitespace-stripped string
        per line of the respective file.
    """
    def read_stripped_lines(filename):
        # One entry per line, leading/trailing whitespace removed.
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as handle:
            return [line.strip() for line in handle]

    train = read_stripped_lines('train.txt')
    test = read_stripped_lines('test.txt')
    return train, test



In [63]:
class LanguageModel(object):
    """N-gram language model with additive (Laplace / add-k) smoothing.

    The model is a dict mapping every n-gram tuple seen in the training
    tokens to a probability. Unigram models (n == 1) use plain relative
    frequencies; higher orders use

        (count(ngram) + k) / (count(context) + k * V)

    where k is `laplace`, `context` is the n-gram without its last token and
    V is the vocabulary size.

    Args:
        train_data: iterable of training sentences, handed to preprocess().
        n: order of the model (1 = unigram, 2 = bigram, ...).
        laplace: additive smoothing constant k (default 1).
    """

    def __init__(self, train_data, n, laplace=1):
        self.n = n
        self.laplace = laplace
        self.tokens = preprocess(train_data, n)
        self.vocab = nltk.FreqDist(self.tokens)
        # Counts of the (n-1)-gram contexts; filled in by _smooth() for n > 1
        # and used to score n-grams that have no entry in the model.
        self._context_counts = None
        self.model = self._create_model()
        # All keep(1)/mask(0) patterns for an n-gram, ordered from
        # "keep every token" down to "mask every token".
        self.masks = list(reversed(list(product((0, 1), repeat=n))))

    def _smooth(self):
        """Build the smoothed n-gram -> probability table for n > 1.

        Returns:
            dict mapping each n-gram tuple seen in training to its add-k
            smoothed conditional probability.
        """
        vocab_size = len(self.vocab)

        n_grams = nltk.ngrams(self.tokens, self.n)
        n_vocab = nltk.FreqDist(n_grams)

        m_grams = nltk.ngrams(self.tokens, self.n-1)
        m_vocab = nltk.FreqDist(m_grams)
        self._context_counts = m_vocab

        def smoothed_count(n_gram, n_count):
            m_gram = n_gram[:-1]
            m_count = m_vocab[m_gram]
            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)

        return {n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items()}

    def _create_model(self):
        """Create the probability table.

        Unigram models use unsmoothed relative frequencies keyed by 1-tuples;
        every other order delegates to _smooth().
        """
        if self.n == 1:
            num_tokens = len(self.tokens)
            return {(unigram,): count / num_tokens for unigram, count in self.vocab.items()}
        else:
            return self._smooth()

    def _convert_oov(self, ngram):
        """Map an n-gram onto a key of the model by masking tokens as "<UNK>".

        The masks are tried in the order stored in self.masks (from leaving
        all tokens untouched to replacing all of them) and the first variant
        that is a key of the model is returned.

        Args:
            ngram: tuple of tokens, or a single token string.

        Returns:
            the matching n-gram tuple, or None when no variant is in the model.
        """
        def mask(ngram, bitmask): return tuple(
            (token if flag == 1 else "<UNK>" for token, flag in zip(ngram, bitmask)))

        ngram = (ngram,) if isinstance(ngram, str) else ngram
        for bitmask in self.masks:
            possible_known = mask(ngram, bitmask)
            if possible_known in self.model:
                return possible_known
        return None

    def _unseen_probability(self, ngram):
        """Probability of an n-gram that has no (masked) entry in the model.

        For n > 1 this is the add-k estimate with an n-gram count of zero:
        k / (count(context) + k * V). The unigram model is unsmoothed, so an
        unseen unigram gets probability 0.
        """
        context_counts = getattr(self, '_context_counts', None)
        if self.n == 1 or context_counts is None:
            return 0.0
        denominator = context_counts[tuple(ngram[:-1])] + self.laplace * len(self.vocab)
        return self.laplace / denominator if denominator > 0 else 0.0

    def perplexity(self, test_data):
        """Compute the perplexity of the model on test sentences.

        The test sentences go through preprocess() with this model's order;
        each resulting n-gram is mapped onto a model key with _convert_oov().
        N-grams for which no key exists are scored with the smoothed
        probability of an unseen n-gram instead of raising KeyError.

        Args:
            test_data: iterable of test sentences.

        Returns:
            exp(-1/N * sum(log p)) with N the number of test tokens, or
            math.inf if some n-gram has probability 0.

        Raises:
            ZeroDivisionError: if preprocessing yields no tokens at all.
        """
        test_tokens = preprocess(test_data, self.n)
        test_ngrams = nltk.ngrams(test_tokens, self.n)
        N = len(test_tokens)

        log_prob_sum = 0
        for ngram in test_ngrams:
            known = self._convert_oov(ngram)
            if known is not None:
                probability = self.model[known]
            else:
                probability = self._unseen_probability(ngram)
            if probability <= 0:
                # A zero-probability event makes the perplexity unbounded.
                return math.inf
            log_prob_sum += math.log(probability)

        return math.exp((-1/N) * log_prob_sum)

    def _best_candidate(self, prev, i, without=None):
        """Pick the next token given the preceding context.

        Args:
            prev: tuple with the previous n-1 tokens (empty for unigrams).
            i: rank of the candidate to take when the context is empty or
                ends in "<s>", so that successive sentences start
                differently; clamped to the last available candidate.
            without: tokens that must not be chosen ("<UNK>" is always
                excluded).

        Returns:
            (token, probability); ("</s>", 1) when no candidate remains.
        """
        blacklist = {"<UNK>"}
        if without:
            blacklist.update(without)

        candidates = [(ngram[-1], prob) for ngram, prob in self.model.items()
                      if ngram[:-1] == prev and ngram[-1] not in blacklist]
        if not candidates:
            return ("</s>", 1)

        # Stable sort: ties keep the model's insertion order.
        candidates.sort(key=lambda candidate: candidate[1], reverse=True)
        if prev != () and prev[-1] != "<s>":
            return candidates[0]
        # Clamp so that asking for more sentences than there are start
        # candidates does not raise IndexError.
        return candidates[min(i, len(candidates) - 1)]

    def generate_sentences(self, num, min_len=12, max_len=24):
        """Generate sentences by repeatedly taking the best next token.

        Tokens already used in the sentence are never repeated, and "</s>"
        is not allowed before the sentence (start markers included) has
        min_len tokens. Once the sentence reaches max_len tokens it is closed
        with "</s>".

        Args:
            num: number of sentences to generate.
            min_len: minimum token count before "</s>" may be chosen.
            max_len: token count at which the sentence is force-terminated.

        Yields:
            (sentence, score) where sentence is the space-joined token list
            and score is -1 / log(P), P being the product of the chosen
            tokens' probabilities (math.inf when P == 1).
        """
        for i in range(num):
            sent = ["<s>"] * max(1, self.n-1)
            # Sum logs instead of multiplying probabilities: avoids underflow
            # to 0 on long sentences.
            log_prob = 0.0
            while sent[-1] != "</s>":
                prev = () if self.n == 1 else tuple(sent[-(self.n-1):])
                blacklist = sent + (["</s>"] if len(sent) < min_len else [])
                next_token, next_prob = self._best_candidate(
                    prev, i, without=blacklist)
                sent.append(next_token)
                log_prob += math.log(next_prob)

                # Force termination, but do not add a second end marker when
                # the model just produced one itself.
                if len(sent) >= max_len and sent[-1] != "</s>":
                    sent.append("</s>")

            # log_prob == 0 means P == 1; -1/log(P) tends to +inf there.
            score = -1/log_prob if log_prob != 0 else math.inf
            yield ' '.join(sent), score


In [64]:
# 1-gram (unigram) experiment: train the model, print 10 generated sentences
# and report the perplexity on the test set.
# Load and prepare train/test data
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path = r"C:\Users\Bot\PycharmProjects\NLP\data1"
train, test = load_data(data_path)

print("{}-gram model...".format(1))
lm = LanguageModel(train, 1)
print("Vocabulary size: {}".format(len(lm.vocab)))

print("Generating sentences...")
# generate_sentences yields (sentence, score); the second value is the
# -1/log(probability) score, not a raw probability.
for sentence, prob in lm.generate_sentences(10):
    print("{} ({:.5f})".format(sentence, prob))

perplexity = lm.perplexity(test)
print("Model perplexity: {:.3f}".format(perplexity))
print("")


1-gram model...
Vocabulary size: 11646
Generating sentences...
<s> the to of and a that in I is we you </s> (0.02094)
<s> to of and a that in I is we you And the it this was for are have with on -- they about </s> (0.00934)
<s> of and a that in I is we you And it to this was for are have with on -- they about be </s> (0.00914)
<s> and a that in I is we you And it this of was for are have with on -- they about be my </s> (0.00899)
<s> a that in I is we you And it this was and for are have with on -- they about be my not </s> (0.00884)
<s> that in I is we you And it this was for a are have with on -- they about be my not as </s> (0.00871)
<s> in I is we you And it this was for are that have with on -- they about be my not as can </s> (0.00858)
<s> I is we you And it this was for are have in with on -- they about be my not as can at </s> (0.00847)
<s> is we you And it this was for are have with I on -- they about be my not as can at our </s> (0.00837)
<s> we you And it this was for are ha

In [65]:
# 2-gram (bigram) experiment with the default smoothing constant (laplace=1):
# train the model, print 10 generated sentences and report test perplexity.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path = r"C:\Users\Bot\PycharmProjects\NLP\data1"
train, test = load_data(data_path)

print("{}-gram model...".format(2))
lm = LanguageModel(train, 2)
print("Vocabulary size: {}".format(len(lm.vocab)))

print("Generating sentences...")
# generate_sentences yields (sentence, score); the second value is the
# -1/log(probability) score, not a raw probability.
for sentence, prob in lm.generate_sentences(10):
    print("{} ({:.5f})".format(sentence, prob))

perplexity = lm.perplexity(test)
print("Model perplexity: {:.3f}".format(perplexity))
print("")

2-gram model...
Vocabulary size: 11646
Generating sentences...
<s> And I was a lot of the world is that we can be able to do you </s> (0.01314)
<s> So I was a lot of the world is that we can be able to do you </s> (0.01291)
<s> I was a lot of the world is that we can be able to do you </s> (0.01372)
<s> But I was a lot of the world is that we can be able to do you </s> (0.01279)
<s> We have to the world is a lot of our own country </s> (0.01584)
<s> The first time I was a lot of the world is that we can be able to do you </s> (0.01105)
<s> " And I was a lot of the world is that we can be able to do you </s> (0.01215)
<s> It was a lot of the world is that we can be able to do you </s> (0.01338)
<s> It's a lot of the world is that we can be able to do you </s> (0.01409)
<s> They were the world is a lot of our own country </s> (0.01580)
Model perplexity: 385.890



In [66]:
# 3-gram (trigram) experiment with the default smoothing constant (laplace=1):
# train the model, print 10 generated sentences and report test perplexity.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path = r"C:\Users\Bot\PycharmProjects\NLP\data1"
train, test = load_data(data_path)

print("{}-gram model...".format(3))
lm = LanguageModel(train, 3)
print("Vocabulary size: {}".format(len(lm.vocab)))

print("Generating sentences...")
# generate_sentences yields (sentence, score); the second value is the
# -1/log(probability) score, not a raw probability.
for sentence, prob in lm.generate_sentences(10):
    print("{} ({:.5f})".format(sentence, prob))

perplexity = lm.perplexity(test)
print("Model perplexity: {:.3f}".format(perplexity))
print("")

3-gram model...
Vocabulary size: 11646
Generating sentences...
<s> <s> And I think we need to do with the same </s> (0.01530)
<s> <s> So I was a little bit of an entire year </s> (0.01387)
<s> <s> I was a little bit of an entire year </s> (0.01738)
<s> <s> But I think we need to do with the same </s> (0.01469)
<s> <s> We have to be a better world for themselves and posting them under the surface of planets like Mars </s> (0.00659)
<s> <s> The first time in a world where the sea level </s> (0.01305)
<s> <s> " And I think we need to do with the same </s> (0.01347)
<s> <s> It was a little bit of an entire year </s> (0.01703)
<s> <s> It's a very long time and energy to actually pay attention </s> (0.01103)
<s> <s> They have a lot of people who are in the world </s> (0.01347)
Model perplexity: 760.479



In [57]:
# 2-gram (bigram) experiment with a Laplace smoothing constant of 0.0001
# instead of the default 1.
# NOTE(review): the recorded output of this cell carries execution count 57,
# i.e. it was produced before the definition cells (58-63) were last run;
# re-run the notebook top to bottom to confirm the numbers.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path = r"C:\Users\Bot\PycharmProjects\NLP\data1"
train, test = load_data(data_path)

print("{}-gram model...".format(2))
lm = LanguageModel(train, 2, laplace=0.0001)
# Baseline for comparison (default laplace=1): lm = LanguageModel(train, 2)
print("Vocabulary size: {}".format(len(lm.vocab)))

print("Generating sentences...")
# generate_sentences yields (sentence, score); the second value is the
# -1/log(probability) score, not a raw probability.
for sentence, prob in lm.generate_sentences(10):
    print("{} ({:.5f})".format(sentence, prob))

perplexity = lm.perplexity(test)
print("Model perplexity: {:.3f}".format(perplexity))
print("")

2-gram model...
Vocabulary size: 11646
Generating sentences...
<s> And I was a lot of the world is that we can be able to do you </s> (0.02363)
<s> So I was a lot of the world is that we can be able to do you </s> (0.02327)
<s> I was a lot of the world is that we can be able to do you </s> (0.02448)
<s> But I was a lot of the world is that we can be able to do you </s> (0.02297)
<s> We have to the world is a lot of our own country </s> (0.02895)
<s> The first time I was a lot of the world is that we can be able to do you </s> (0.02047)
<s> " And I was a lot of the world is that we can be able to do you </s> (0.02201)
<s> It was a lot of the world is that we can be able to do you </s> (0.02453)
<s> It's a lot of the world is that we can be able to do you </s> (0.02571)
<s> They were the world is a lot of our own country </s> (0.02923)
Model perplexity: 37.851

