# Language Model Implementation  
Reference:  
https://www.kaggle.com/alvations/n-gram-language-model-with-nltk/notebook

In [1]:
from nltk.util import bigrams
from nltk.util import ngrams

In [6]:
# Toy corpus: two sentences, each already split into single-letter tokens.
text = [
    ['a', 'b', 'c'],
    ['a', 'c', 'd', 'c', 'e', 'f'],
]

In [7]:
# test for bigram
list(bigrams(text[0]))

[('a', 'b'), ('b', 'c')]

In [8]:
# test for trigram
list(ngrams(text[1], n = 3))

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f')]

In [9]:
# Pad a sequence with start (<s>) and end (</s>) symbols.
# `n` is the n-gram order: for 2-grams each side is padded once,
# for 3-grams twice, and so on.
from nltk.util import pad_sequence
list(pad_sequence(
    text[0],
    n=2,
    pad_left=True, left_pad_symbol='<s>',
    pad_right=True, right_pad_symbol='</s>',
))

['<s>', 'a', 'b', 'c', '</s>']

In [10]:
# padding sentence for bi-gram
padded_sent = list(pad_sequence(text[0],
                                pad_left = True, left_pad_symbol = '<s>',
                                pad_right = True, right_pad_symbol = '</s>',
                                n = 2))
list(ngrams(padded_sent, n = 2))

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

In [11]:
# padding sentence for tri-gram
padded_sent = list(pad_sequence(text[0],
                                pad_left = True, left_pad_symbol = '<s>',
                                pad_right = True, right_pad_symbol = '</s>',
                                n = 3))
list(ngrams(padded_sent, n = 3))

[('<s>', '<s>', 'a'),
 ('<s>', 'a', 'b'),
 ('a', 'b', 'c'),
 ('b', 'c', '</s>'),
 ('c', '</s>', '</s>')]

In [12]:
# for convenience, we use the function pad_both_ends instead of pad_sequence
from nltk.lm.preprocessing import pad_both_ends
list(pad_both_ends(text[0], n = 2))

['<s>', 'a', 'b', 'c', '</s>']

In [13]:
# NLTK provides `everygrams` to produce all n-grams up to a maximum order (unigrams, bigrams, trigrams, etc.).
from nltk.util import everygrams
padded_bigrams = list(pad_both_ends(text[0], n = 2))
list(everygrams(padded_bigrams, max_len=2))

[('<s>',),
 ('a',),
 ('b',),
 ('c',),
 ('</s>',),
 ('<s>', 'a'),
 ('a', 'b'),
 ('b', 'c'),
 ('c', '</s>')]

In [14]:
# To build the vocabulary, we pad every sentence and flatten them all into one stream of words.
from nltk.lm.preprocessing import flatten
list(flatten(pad_both_ends(sent, n = 2) for sent in text))

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [15]:
# do the above thing with a pipeline
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)
print(list(train))
print(list(vocab))

[<generator object everygrams at 0x000001EB619222C8>, <generator object everygrams at 0x000001EB61922548>]
['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']


In [16]:
# So as to avoid re-creating the text in memory, both train and vocab are lazy iterators.
# They are evaluated on demand at training time.
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)
for ngramlize_sent in training_ngrams:
    print(list(ngramlize_sent))
    print()
print('#############')
list(padded_sentences)

[('<s>',), ('a',), ('b',), ('c',), ('</s>',), ('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

[('<s>',), ('a',), ('c',), ('d',), ('c',), ('e',), ('f',), ('</s>',), ('<s>', 'a'), ('a', 'c'), ('c', 'd'), ('d', 'c'), ('c', 'e'), ('e', 'f'), ('f', '</s>')]

#############


['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

## Let's get some real data and tokenize it

In [17]:
# Prefer the default NLTK tokenizers; fall back to dependency-free ones if they fail.
try:
    from nltk import word_tokenize, sent_tokenize
    # Smoke test: the import can succeed while the call still fails on some
    # machines because of setup issues, so exercise both tokenizers once.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except Exception:
    # Deliberately broad best-effort fallback (typically ImportError, or
    # LookupError when the tokenizer data has not been downloaded), but
    # unlike a bare `except:` it lets KeyboardInterrupt/SystemExit through.
    import re
    from nltk.tokenize import ToktokTokenizer
    # Naive regex sentence splitter.
    # reference https://stackoverflow.com/a/25736515/610569
    sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
    # use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

In [20]:
# Sanity-check the tokenizer on a sample sentence. `word_tokenize` is a
# function, so it has to be called; it cannot be iterated with `list(...)`.
print(word_tokenize("This is a foobar sentence."))

['This', 'is', 'a', 'foobar', 'sentence', '.']

In [21]:
import os
import requests
import io # codecs

# Text version of https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
# The corpus is cached next to the notebook so it is downloaded only once.
corpus_path = 'language-never-random.txt'
if os.path.isfile(corpus_path):
    # Reuse the local copy from an earlier run.
    with io.open(corpus_path, encoding='utf8') as fin:
        text = fin.read()
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    # Bound the wait so a dead connection cannot hang the notebook, and fail
    # loudly on HTTP errors so an error page is never cached as the corpus.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    text = response.content.decode('utf8')
    with io.open(corpus_path, 'w', encoding='utf8') as fout:
        fout.write(text)

In [5]:
print(text)

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a rela-
tion between two phenomena is demonstrably non-random, does not sup-
port the inference that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis test-
ing has been used, and show how it has often led to unhelpful or mislead-
ing results.
Keywords: 쎲쎲쎲

1. Int

In [22]:
# Split the text into sentences, tokenize each one, and lowercase every token.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]

In [23]:
tokenized_text[0]

['language',
 'is',
 'never',
 ',',
 'ever',
 ',',
 'ever',
 ',',
 'random',
 'adam',
 'kilgarriff',
 'abstract',
 'language',
 'users',
 'never',
 'choose',
 'words',
 'randomly',
 ',',
 'and',
 'language',
 'is',
 'essentially',
 'non-random',
 '.']

In [26]:
print(text[:500])
print(len(tokenized_text))

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish 
155


In [30]:
# preprocess the tokenized text for 3-grams language modelling
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

## Training an N-gram Model

In [27]:
from nltk.lm import MLE
model = MLE(n) # create an (as yet unfitted) 3-gram MLE model; n = 3 was set above

In [28]:
# init the MLE model, creates an empty vocabulary
len(model.vocab)

0

In [31]:
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 1429 items>


In [32]:
len(model.vocab)

1429

In [33]:
print(model.vocab.lookup(tokenized_text[0]))

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')


In [34]:
# If we look up an unseen sentence that is not from the training data,
# the vocabulary automatically replaces out-of-vocabulary words with <UNK>.
print(model.vocab.lookup('language is never random lah .'.split()))

('language', 'is', 'never', 'random', '<UNK>', '.')


## Using the N-gram Language Model

In [35]:
print(model.counts)

<NgramCounter with 3 ngram orders and 18687 ngrams>


In [36]:
# count uni-gram
model.counts['language'] # i.e. Count('language')

25

In [37]:
# count bi-gram
model.counts[['language']]['is'] # i.e. Count('is'|'language')

11

In [38]:
# count tri-gram
model.counts[['language', 'is']]['never'] # i.e. Count('never' | 'language is')

7

In [39]:
# score of unigram
model.score('language') # P('language')

0.003916040100250626

In [40]:
# score of bigram
model.score('is', 'language'.split()) # P('is' | 'language')

0.44

In [41]:
# score of trigram
model.score('never', 'language is'.split()) # P('never' | 'language is')

0.6363636363636364

In [42]:
model.score('<UNK>') == model.score('lah')

True

In [43]:
model.score('<UNK>') == model.score('leh')

True

In [44]:
model.score("<UNK>") == model.score("lor")

True

In [45]:
# To avoid numerical underflow, use the logscore method (it returns the base-2 log of the score).
model.logscore('never', 'language is'.split())

-0.6520766965796932

## Generation using N-gram Language Model

In [46]:
# generate text
print(model.generate(20, random_seed=7))

['ate', 'inferences', 'are', 'drawn.', '2', '.', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']


In [47]:
# generate some human-like text
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """
    Generate a detokenized sentence from an n-gram language model.

    Tokens are drawn with ``model.generate``. Start-of-sentence markers
    (``<s>``) are dropped and generation stops at the first end-of-sentence
    marker (``</s>``), so the result may hold fewer than ``num_words`` words.

    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    :return: The kept tokens joined into a single string by ``detokenize``.
    """
    generated = model.generate(num_words, random_seed=random_seed)
    # When asked for a single word, `generate` hands back a bare string
    # instead of a list; wrap it so we loop over tokens, not characters.
    if isinstance(generated, str):
        generated = [generated]
    content = []
    for token in generated:
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [48]:
generate_sent(model, 20, random_seed=7)

'ate inferences are drawn. 2.'

In [49]:
generate_sent(model, 20, random_seed=1)

'29⫺50. manning, christopher and hinrich schütze 1999 foundations of statistical independence.'

In [50]:
generate_sent(model, 20, random_seed=30)

'information between two rare events, the null hypthesis regarding the two objections, while both valid, are not'

In [51]:
generate_sent(model, 20, random_seed=42)

'not.'

## Saving the model
Python's built-in `pickle` module may fail to serialize the lambda functions stored in the model, so we use the `dill` library as a drop-in replacement to save and load the language model.

In [52]:
import dill as pickle
with open('kilgariff_ngram_model.pkl', 'wb') as fout:
    pickle.dump(model, fout)

In [53]:
# Load the saved model back from disk (`pickle` here is the `dill` alias).
# NOTE: unpickling can execute arbitrary code — only load model files that
# you created yourself or otherwise trust.
with open('kilgariff_ngram_model.pkl', 'rb') as fin:
    model_loaded = pickle.load(fin)

In [54]:
generate_sent(model_loaded, 20, random_seed=42)

'not.'

## Try generating some text with Donald Trump data!!!  
Dataset: https://www.kaggle.com/kingburrito666/better-donald-trump-tweets#Donald-Tweets!.csv

In [55]:
import pandas as pd
df = pd.read_csv('Donald-Tweets!.csv')
df.head()

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,


In [57]:
# Tokenize every tweet; each corpus entry is the token list of one tweet.
trump_corpus = [word_tokenize(tweet) for tweet in df['Tweet_Text']]

In [58]:
# Preprocess the tokenized text for 3-grams language modelling
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, trump_corpus)

In [59]:
from nltk.lm import MLE
trump_model = MLE(n) # Lets train a 3-grams model, previously we set n = 3
trump_model.fit(train_data, padded_sents)

In [60]:
generate_sent(trump_model, num_words=20, random_seed=42)

'do, and almost everyone of my beautiful mother, amazing father, @realDonaldTrump leading GOP in #Wisconsin.'

In [61]:
generate_sent(trump_model, num_words=10, random_seed=0)

'pretty sad situation. Go Jeb! He is heads above'

In [62]:
generate_sent(trump_model, num_words=50, random_seed=10)

'and hard for Hillary vs Bernie but do not like Ted Cruz would speak behind my back, just like our government! Lower taxes! https://t.co/ZwIkqNH2FX'

In [63]:
print(generate_sent(trump_model, num_words=100, random_seed=52))

with @DonaldJTrumpJr &amp; videos on the BORDER tomorrow. Will be in South Bend, Indiana! #Trump2016 #MakeAmericaGreatAgain
