In [1]:
import os
import nltk
import string
import random

from nltk.util import ngrams

In [2]:
# Path of the raw text corpus, relative to the notebook's working directory.
FILE_PATH = 'corpus.txt'

# Seed the standard `random` module once, up front, so that a fresh
# "Restart Kernel -> Run All" regenerates the same sentences. The sentence
# generator in this notebook draws its direction choice from `random.random()`.
# NOTE(review): NLTK's ProbDist.generate() is expected to draw from the same
# module-level generator, so this seed should cover sampling too - confirm.
SEED = 42
random.seed(SEED)

# Tokens dropped during tokenization in addition to the punctuation below.
stopwords = ['', '(', ')', '{', '}', '\\', '--', ':', '-', "'s"]

# ASCII punctuation plus the two-character quote marks `` and '' and a double
# quote. This is one plain string, so `token in punc` is a substring test.
punc = string.punctuation + "``" + "''" + '"'

In [3]:
# Read the whole corpus into a single lowercase string, with every newline
# replaced by a space so the text forms one continuous stream.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default codec (the generated text contains non-ASCII characters).
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    data = f.read().lower().replace('\n',' ')

In [4]:
# Size of the corpus in characters (after lowercasing and newline replacement).
len(data)

3954021

In [5]:
# Sanity check: the corpus is held as one string.
type(data)

str

In [6]:
def tokenized_words(data):
    """Split text into sentences and each sentence into filtered word tokens.

    Args:
        data: The text to tokenize, as a single string.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``;
        each entry is the list of tokens from ``nltk.word_tokenize`` that
        survive filtering, in their original order.

    A token is dropped when it is an element of the module-level ``stopwords``
    list or when it occurs inside the module-level ``punc`` string (``punc`` is
    a plain string, so that second check is a substring test).
    """
    def is_noise(token):
        # Same two checks, in the same order, as the filter condition.
        return token in stopwords or token in punc

    return [
        [token for token in nltk.word_tokenize(sentence) if not is_noise(token)]
        for sentence in nltk.sent_tokenize(data)
    ]

In [7]:
def tokenized_rev_words(data):
    """Tokenize text into sentences whose filtered tokens are in reverse order.

    Args:
        data: The text to tokenize, as a single string.

    Returns:
        A list with one entry per sentence found by ``nltk.sent_tokenize``
        (sentences stay in their original order); each entry holds that
        sentence's kept tokens from last to first.

    Tokens that are elements of the module-level ``stopwords`` list, or that
    occur inside the module-level ``punc`` string, are discarded.
    """
    reversed_sentences = []
    for sentence in nltk.sent_tokenize(data):
        kept = []
        for token in nltk.word_tokenize(sentence):
            if token in stopwords or token in punc:
                continue
            kept.append(token)
        # Slice-reversal yields a new list, last token first.
        reversed_sentences.append(kept[::-1])

    return reversed_sentences

In [8]:
# Tokenize the corpus once. tokenized_rev_words(data) would run the same
# sentence/word tokenization and filtering over the full text a second time and
# only differs by reversing each sentence, so the reversed view is derived
# directly from `sents` instead.
sents = tokenized_words(data)
rev_sents = [sent[::-1] for sent in sents]

In [9]:
# Number of sentences produced by the sentence tokenizer.
len(sents)

40992

In [10]:
# Flatten the per-sentence token lists into one token stream for the forward model.
train_corpus = [word for sent in sents for word in sent]

# The reverse model must be trained on the exact mirror image of the forward
# stream. Each entry of `rev_sents` is already reversed internally, so the
# sentences themselves are walked last-to-first here. Concatenating them in
# forward order would create bigrams that never occur in the text (first word
# of sentence i followed by last word of sentence i+1).
rev_train_corpus = [word for sent in reversed(rev_sents) for word in sent]

In [11]:
# Total number of tokens in the flattened forward training stream.
len(train_corpus)

666665

In [12]:
def ngram_freq_dist(corpus, ngram=1):
    """Build an NLTK frequency distribution over n-grams of a corpus.

    Args:
        corpus: Either a non-empty list of tokens, or a string that is
            tokenized with ``nltk.word_tokenize`` first.
        ngram: Order of the model; 1, 2 and 3 are supported.

    Returns:
        * ``ngram == 1``: an ``nltk.FreqDist`` over the tokens.
        * ``ngram == 2``: an ``nltk.ConditionalFreqDist`` built from adjacent
          token pairs (condition = first token).
        * ``ngram == 3``: an ``nltk.ConditionalFreqDist`` whose conditions are
          ``(first, second)`` tuples and whose samples are the third token.
        * ``None`` when the corpus is neither a non-empty list nor a string
          (after printing ``'Error'``) or when ``ngram`` is unsupported (after
          printing a notice).
    """
    if isinstance(corpus, list) and len(corpus) > 0:
        tokens = corpus
    elif type(corpus) is str:
        tokens = nltk.word_tokenize(corpus)
    else:
        print('Error')
        return None

    if ngram == 1:
        # Plain frequency distribution for unigrams.
        return nltk.FreqDist(tokens)

    if ngram == 2:
        # Each bigram (w1, w2) is read as (condition, sample).
        return nltk.ConditionalFreqDist(nltk.ngrams(tokens, 2))

    if ngram == 3:
        # Re-shape each trigram into ((w1, w2), w3) so the leading word pair
        # becomes the condition.
        conditioned_pairs = [
            ((first, second), third)
            for first, second, third in ngrams(tokens, 3)
        ]
        return nltk.ConditionalFreqDist(conditioned_pairs)

    print('Supported upto trigrams only')
    return None

In [13]:
# Forward bigram model: counts of (word -> next word), turned into
# maximum-likelihood conditional probabilities.
cfd_2gram = ngram_freq_dist(corpus=train_corpus, ngram=2)
cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)

# Reverse bigram model: the same construction over the reversed token stream.
cfd_2gram_rev = ngram_freq_dist(corpus=rev_train_corpus, ngram=2)
cpd_2gram_rev = nltk.ConditionalProbDist(cfd_2gram_rev, nltk.MLEProbDist)

In [14]:
# function to generate random tweets

def generate_txt_bigram_model_random(cprob_2gram, cprob_2gram_rev, initialword, numwords=15):
    """Grow a sentence outwards from a seed word using two bigram models.

    On each of ``numwords`` steps a coin flip (``random.random() > 0.5``)
    decides whether to append a word sampled from the forward model,
    conditioned on the current right-most word, or to prepend a word sampled
    from the reverse model, conditioned on the current left-most word.

    Args:
        cprob_2gram: Mapping from a word to an object with a ``generate()``
            method that samples a following word.
        cprob_2gram_rev: The same kind of mapping, sampling a preceding word.
        initialword: The seed word the sentence is built around.
        numwords: Number of words to add to the seed word.

    Returns:
        The space-joined sentence, or ``None`` (after printing a message) as
        soon as any lookup, sampling or concatenation step raises.
    """
    sentence = initialword
    right_edge = left_edge = initialword

    for _ in range(numwords):
        extend_right = random.random() > 0.5
        try:
            if extend_right:
                right_edge = cprob_2gram[right_edge].generate()
                sentence = sentence + " " + right_edge
            else:
                left_edge = cprob_2gram_rev[left_edge].generate()
                sentence = left_edge + ' ' + sentence
        except Exception:
            # Any failure ends generation; the caller receives None.
            print('Can not generate the sentence')
            return

    return sentence

In [4]:
# Quick check of NLTK's part-of-speech tagger on a hand-written token list.
nltk.pos_tag(['he', 'is', 'coming', 'here'])

[('he', 'PRP'), ('is', 'VBZ'), ('coming', 'VBG'), ('here', 'RB')]

In [19]:
# Generate one sentence around the seed word 'election' with the default length.
# The generator returns None if any generation step fails.
sent = generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev, 'election')

In [20]:
# Re-tokenize the generated sentence and tag each token with its part of speech.
nltk.pos_tag(nltk.word_tokenize(sent))

[('citizens', 'NNS'),
 ('in', 'IN'),
 ('life', 'NN'),
 ('of', 'IN'),
 ('us', 'PRP'),
 ('election', 'NN'),
 ('let', 'VB'),
 ('me', 'PRP'),
 ('to', 'TO'),
 ('ride', 'VB'),
 ('away', 'RB'),
 ('from', 'IN'),
 ('politics', 'NNS'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('class', 'NN')]

In [None]:
# Hand-written phrase-structure templates: each key is a symbol and each value
# lists its alternative expansions as sequences of symbols.
# NOTE(review): nothing in the visible notebook reads this dict - confirm it is still needed.
pos_template_dict = {
    # NOTE(review): these three expansions look like sentence-level rules; the key 'V' may be meant to be 'S' - confirm.
    'V': [['NP', 'VP'], ['Aux', 'NP', 'VP'], ['VP']],
    'NP': [['Det', 'NOM']],
    # NOTE(review): the only expansion of NOM contains NOM again, so expanding it never terminates - confirm whether a ['Noun'] alternative is missing.
    'NOM': [['Noun', 'NOM']],
    'VP': [['Verb'], ['Verb', 'NP']],
}

In [15]:
# Parallel lists, one entry per successfully generated sentence:
#   random_sentences     - the sentence text
#   random_pos_tags      - its part-of-speech tags only
#   random_word_pos_tags - its (word, tag) pairs
random_sentences = []
random_pos_tags = []
random_word_pos_tags = []

# Make up to 200 attempts to generate a sentence randomly
for _ in range(200):
    sent = generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev, 'election', 9)
    if sent is None:
        # The generator returns None when a generation step fails; calling
        # .split() on it would raise AttributeError, so the attempt is skipped.
        continue

    word_pos_tags = nltk.pos_tag(sent.split())
    pos_tags = [tag for _word, tag in word_pos_tags]

    random_word_pos_tags.append(word_pos_tags)
    random_sentences.append(sent)
    random_pos_tags.append(pos_tags)

In [16]:
# Preview the first five generated sentences.
random_sentences[:5]

['stop until election just over the federal funding bill to',
 'clear fracking secretary general election is made a republican politicians',
 "republicans ca today election cycle joseiswriting rt somersworthdems governor o'malley",
 'better than any election day should be it easier to',
 'come and make this election the same instead she spent']

In [17]:
# Preview the part-of-speech tag sequences of the first five sentences.
random_pos_tags[:5]

[['NN', 'IN', 'NN', 'RB', 'IN', 'DT', 'JJ', 'NN', 'NN', 'TO'],
 ['JJ', 'NN', 'NN', 'JJ', 'NN', 'VBZ', 'VBN', 'DT', 'JJ', 'NNS'],
 ['NNS', 'MD', 'NN', 'NN', 'NN', 'VBG', 'JJ', 'NNS', 'VBP', 'NN'],
 ['RBR', 'IN', 'DT', 'NN', 'NN', 'MD', 'VB', 'PRP', 'JJR', 'TO'],
 ['NN', 'CC', 'VB', 'DT', 'NN', 'DT', 'JJ', 'RB', 'PRP', 'VBD']]

In [18]:
# Preview the (word, tag) pairs of the first five sentences.
random_word_pos_tags[:5]

[[('stop', 'NN'),
  ('until', 'IN'),
  ('election', 'NN'),
  ('just', 'RB'),
  ('over', 'IN'),
  ('the', 'DT'),
  ('federal', 'JJ'),
  ('funding', 'NN'),
  ('bill', 'NN'),
  ('to', 'TO')],
 [('clear', 'JJ'),
  ('fracking', 'NN'),
  ('secretary', 'NN'),
  ('general', 'JJ'),
  ('election', 'NN'),
  ('is', 'VBZ'),
  ('made', 'VBN'),
  ('a', 'DT'),
  ('republican', 'JJ'),
  ('politicians', 'NNS')],
 [('republicans', 'NNS'),
  ('ca', 'MD'),
  ('today', 'NN'),
  ('election', 'NN'),
  ('cycle', 'NN'),
  ('joseiswriting', 'VBG'),
  ('rt', 'JJ'),
  ('somersworthdems', 'NNS'),
  ('governor', 'VBP'),
  ("o'malley", 'NN')],
 [('better', 'RBR'),
  ('than', 'IN'),
  ('any', 'DT'),
  ('election', 'NN'),
  ('day', 'NN'),
  ('should', 'MD'),
  ('be', 'VB'),
  ('it', 'PRP'),
  ('easier', 'JJR'),
  ('to', 'TO')],
 [('come', 'NN'),
  ('and', 'CC'),
  ('make', 'VB'),
  ('this', 'DT'),
  ('election', 'NN'),
  ('the', 'DT'),
  ('same', 'JJ'),
  ('instead', 'RB'),
  ('she', 'PRP'),
  ('spent', 'VBD')]]

<h1>Results</h1>

In [19]:
# Print five sentences grown around the seed word 'education', each followed
# by a separator line.
for _ in range(5):
    print(
        generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev, 'education'),
        '----------------------------------------',
        sep='\n',
    )

sounded alike today congrats to serve should n't let celebrate education should be right to mark
----------------------------------------
//t.… seattle see this gmo labels on education equal pay gap he bankrupted his own employees
----------------------------------------
a racially biased practice according to be higher education not to close to the fda who
----------------------------------------
themselves w/o affecting their fair learn more affordable higher education we ’ ll build on the
----------------------------------------
most major country on fossil fuels in the education not adjourn w/o an excellent choice rt
----------------------------------------
