In [1]:
import os
import nltk
import string
import random

from nltk.util import ngrams

In [2]:
# Location of the raw text corpus that the load cell reads.
FILE_PATH = 'corpus.txt'

# Tokens dropped by the tokenizer functions in addition to punctuation.
stopwords = [
    '', '(', ')', '{', '}', '\\', '--', ':', '-', "'s",
]

# Punctuation characters plus the quote tokens "``", "''" and '"'.
# This is a single str, so `word not in punc` is a substring test,
# not an exact-match lookup.
punc = ''.join((string.punctuation, "``", "''", '"'))

In [3]:
# Read the whole corpus into memory as one string.
# NOTE(review): open() is called without an explicit encoding, so decoding
# uses the platform default -- confirm the encoding of the corpus file.
with open(FILE_PATH, 'r') as f:
    data = f.read()

# Normalise: lowercase everything and turn newlines into spaces.
data = data.lower().replace('\n', ' ')

In [4]:
# Size of the normalised corpus, in characters (data is a single str).
len(data)

3954021

In [5]:
# Sanity check: the corpus was loaded as one string.
type(data)

str

In [6]:
def tokenized_words(data):
    """Tokenize text into sentences of filtered word tokens.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with ``nltk.word_tokenize``. A token is dropped
    when it is found in the module-level ``stopwords`` or ``punc``.

    Args:
        data: the text to tokenize.

    Returns:
        A list with one entry per sentence; each entry is the list of kept
        tokens in their original order (possibly empty).
    """
    def is_kept(token):
        # `token in punc` is a substring test when punc is a str.
        return not (token in stopwords or token in punc)

    return [
        [token for token in nltk.word_tokenize(sentence) if is_kept(token)]
        for sentence in nltk.sent_tokenize(data)
    ]

In [7]:
def tokenized_rev_words(data):
    """Tokenize text like ``tokenized_words`` but reverse each sentence.

    Delegates to ``tokenized_words`` so the sentence splitting and the
    stopword/punctuation filtering live in one place and cannot diverge
    between the forward and the reversed variant.

    Args:
        data: the text to tokenize.

    Returns:
        A list with one entry per sentence, in the original sentence order;
        each entry is a new list holding that sentence's kept tokens in
        reverse order.
    """
    return [list(reversed(words)) for words in tokenized_words(data)]

In [8]:
# Tokenize the corpus once. The reversed sentences are derived from that
# result instead of sentence- and word-tokenizing the whole corpus a second
# time; the filtering applied is the same, so the outcome is identical.
sents = tokenized_words(data)
rev_sents = [list(reversed(words)) for words in sents]

In [9]:
# Number of sentences produced by the sentence tokenizer.
len(sents)

40992

In [10]:
# Flatten the per-sentence token lists into one token stream each.
# Sentence boundaries are lost here, so n-grams built from these lists
# also span the last token of one sentence and the first of the next.
train_corpus = [token for sentence in sents for token in sentence]
rev_train_corpus = [token for sentence in rev_sents for token in sentence]

In [11]:
# Total number of tokens in the flattened forward corpus.
len(train_corpus)

666665

In [12]:
def ngram_freq_dist(corpus, ngram=1):
    """Build an NLTK frequency distribution over n-grams of a corpus.

    Args:
        corpus: either a non-empty list of tokens, or a str that is first
            tokenized with ``nltk.word_tokenize``.
        ngram: n-gram order; 1, 2 and 3 are supported.

    Returns:
        * ``ngram == 1``: an ``nltk.FreqDist`` over the tokens.
        * ``ngram == 2``: an ``nltk.ConditionalFreqDist`` whose condition is
          the first word of each bigram and whose sample is the second.
        * ``ngram == 3``: an ``nltk.ConditionalFreqDist`` whose condition is
          the ``(first, second)`` tuple and whose sample is the third word.
        * ``None`` if the corpus is neither a non-empty list nor a str
          (prints 'Error'), or if the order is unsupported (prints a
          message).
    """
    # Resolve the token sequence first; an empty list is rejected as well.
    if type(corpus) is str:
        tokens = nltk.word_tokenize(corpus)
    elif isinstance(corpus, list) and len(corpus) > 0:
        tokens = corpus
    else:
        print('Error')
        return None

    if ngram == 1:
        # Plain frequency distribution for unigrams.
        return nltk.FreqDist(tokens)

    if ngram == 2:
        # Bigrams are already (condition, sample) pairs.
        return nltk.ConditionalFreqDist(nltk.ngrams(tokens, 2))

    if ngram == 3:
        # Re-shape each trigram into ((w1, w2), w3) so the first two words
        # act as the condition.
        conditioned = [
            ((first, second), third)
            for first, second, third in ngrams(tokens, 3)
        ]
        return nltk.ConditionalFreqDist(conditioned)

    print('Supported upto trigrams only')
    return None

In [13]:
# Forward bigram model: counts of the next word given the current word,
# turned into maximum-likelihood probability distributions.
cfd_2gram = ngram_freq_dist(train_corpus, ngram=2)
cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)

# Reverse bigram model: same construction over the reversed sentences, so it
# gives the preceding word given the current word.
cfd_2gram_rev = ngram_freq_dist(rev_train_corpus, ngram=2)
cpd_2gram_rev = nltk.ConditionalProbDist(cfd_2gram_rev, nltk.MLEProbDist)

In [14]:
# function to generate random tweets

def generate_txt_bigram_model_random(cprob_2gram, cprob_2gram_rev, initialword, numwords=15):
    """Grow a sentence outwards from a seed word, one word at a time.

    At every step a coin flip (``random.random() > 0.5``) decides whether a
    word is appended on the right, sampled from ``cprob_2gram`` conditioned
    on the current right-most word, or prepended on the left, sampled from
    ``cprob_2gram_rev`` conditioned on the current left-most word.

    Args:
        cprob_2gram: mapping from a word to an object with ``generate()``
            that samples a following word.
        cprob_2gram_rev: mapping from a word to an object with ``generate()``
            that samples a preceding word.
        initialword: the seed word the sentence is built around.
        numwords: how many words to add to the seed.

    Returns:
        The generated text with words separated by single spaces, or None
        (after printing a message) if any sampling step raises.
    """
    sentence = initialword
    right_edge = initialword
    left_edge = initialword

    for _ in range(numwords):
        extend_right = random.random() > 0.5
        try:
            if extend_right:
                right_edge = cprob_2gram[right_edge].generate()
                sentence = sentence + " " + right_edge
            else:
                left_edge = cprob_2gram_rev[left_edge].generate()
                sentence = left_edge + ' ' + sentence
        except Exception:
            # Best effort: any failure to extend aborts the whole sentence.
            print('Can not generate the sentence')
            return None

    return sentence

In [87]:
random_sentences = []
random_pos_tags = []
random_word_pos_tags = []

# Make 5000 generation attempts, each seeded with 'election' and extended by
# 9 words, and POS-tag every sentence that was generated successfully.
# NOTE(review): no random seed is set here, so the sentences differ from run
# to run -- seed `random` first if reproducible output is needed.
for _ in range(5000):
    sent = generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev, 'election', 9)
    if sent is None:
        # The generator returns None when a sampling step fails; calling
        # .split() on it would raise AttributeError, so skip the attempt.
        continue

    word_pos_tags = nltk.pos_tag(sent.split())
    pos_tags = [tag for _word, tag in word_pos_tags]

    # The three lists stay index-aligned: entry i of each describes the
    # same sentence.
    random_word_pos_tags.append(word_pos_tags)
    random_sentences.append(sent)
    random_pos_tags.append(pos_tags)

In [84]:
'''
RULES:

1. Determiner always comes before a noun.
2. Noun can be followed by another noun phrase.
3. Modals (could, will) can follow nouns.
4. ..

'''

# POS transition template used to filter generated sentences.
# Each key is a POS tag and its value lists the tags that are allowed to come
# immediately after it. The tag strings are compared against the tags that
# nltk.pos_tag assigns to the generated sentences.
# A sentence is accepted only if its first tag is a key of this dict and
# every following tag is listed under the tag before it.
# NOTE(review): rule 1 above speaks of determiners preceding nouns, but 'DT'
# also admits 'VBP' and 'JJ', and 'JJ' may only be followed by 'CC' --
# confirm that these transitions are intended.
pos_template_dict = {
    'NN': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'VBZ', 'NNS'],
    'NNS': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'NN'],
    'NNP': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'NNS'],
    'NNPS': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN'],
    'DT': ['NN', 'NNS', 'NNP', 'NNPS', 'VBP', 'JJ'],
    'JJ': ['CC'],
    'CC': ['NN', 'NNS', 'NNP', 'NNPS'],
    'VB': ['NN', 'DT', 'TO'],
    'VBD': ['NN', 'TO'],
    'VBG': ['IN', 'TO'],
    'VBP': ['VBG', 'RB', 'TO'],
    'VBN': ['RB', 'PRP', 'TO'],
    'VBZ': ['VBN'],
    'MD': ['VB', 'PRP'],
    'IN': ['DT', 'JJ'],
    'RB': ['NN', 'NNS'],
    'PRP': ['MD', 'VBD'],
    'TO': ['VB'],
}

In [85]:
# Keep only the sentences whose POS-tag sequence matches the POS template
def filter_sentences(postags_list, sent_list, template):
    """Select the sentences whose POS-tag sequence matches the template.

    Args:
        postags_list: one POS-tag sequence per sentence.
        sent_list: the sentences, index-aligned with ``postags_list``.
        template: transition template passed on to ``is_pos_tag_match``.

    Returns:
        A new list with the entries of ``sent_list`` whose tag sequence was
        accepted by ``is_pos_tag_match``, in their original order.
    """
    return [
        sent_list[position]
        for position, tags in enumerate(postags_list)
        if is_pos_tag_match(tags, template)
    ]
        
        
def is_pos_tag_match(tag, template):
    """Check a POS-tag sequence against a transition template.

    The sequence matches when its first tag is a key of ``template`` and
    every later tag is listed in ``template`` under the tag directly before
    it.

    Args:
        tag: sequence of POS tags for one sentence.
        template: dict mapping a POS tag to the collection of tags allowed
            to follow it.

    Returns:
        True if the whole sequence is accepted, False otherwise. An empty
        sequence is never a match, and a tag that is not a key of the
        template allows no followers (both cases used to raise).
    """
    if len(tag) == 0:
        # Nothing to validate; previously tag[0] raised IndexError here.
        return False

    current = tag[0]
    if current not in template:
        return False

    for following in tag[1:]:
        # .get() treats a tag without an entry as having no allowed
        # followers instead of raising KeyError.
        if following not in template.get(current, ()):
            return False
        current = following

    return True
    
def print_filtered_sent(filt_sent):
    """Print every item of ``filt_sent`` on its own line.

    Args:
        filt_sent: iterable of sentences to print; nothing is printed when
            it is empty.
    """
    for sentence in filt_sent:
        print(sentence)

<h1>Results</h1>

In [88]:
# Print the generated sentences whose POS-tag sequence passes the template.
print_filtered_sent(filter_sentences(random_pos_tags, random_sentences, pos_template_dict))

than this election day dinner rt nytopinion estoy listo para
official i hope to create an election days excited to
a number of the election day of the rest of
sanders on this election must be a smartphone “ role
s must end election —hillary on the gourmet goat in
in this election day rt mccraylaurie martin luther king jr.
face on the basics of this election join the only
be a reminder of this election cycle than all make
address the people in this election results have to stand
the atrocity in the election day would provide food –
con kaine connects in this election of the fact mainstream
live in this election days grateful to caucus campaign stop
coming to move this election for the michigandems reception education
editorial in the lives for the election in a hillary
