In [1]:
import os
import nltk
import string
import random

from nltk.util import ngrams

In [2]:
FILE_PATH = 'corpus.txt'

stopwords = ['', '(', ')', '{', '}', '\\', '--', ':', '-', "'s"]
punc = string.punctuation + "``" + "''" + '"'

In [3]:
with open(FILE_PATH, 'r') as f:
    data = f.read().lower().replace('\n',' ')

In [4]:
len(data)

3954021

In [5]:
type(data)

str

In [6]:
def tokenized_words(data):
    sents = []
    for sent in nltk.sent_tokenize(data):
        words = [word for word in nltk.word_tokenize(sent) if word not in stopwords and word not in punc]
        sents.append(words)
        
    return sents

In [7]:
def tokenized_rev_words(data):
    sents = []
    for sent in nltk.sent_tokenize(data):
        words = [word for word in nltk.word_tokenize(sent) if word not in stopwords and word not in punc]
        sents.append(list(reversed(words)))
        
    return sents

In [8]:
sents = tokenized_words(data)
rev_sents = tokenized_rev_words(data)

In [9]:
len(sents)

40992

In [10]:
train_corpus = [word for sent in sents for word in sent]
rev_train_corpus = [word for sent in rev_sents for word in sent] 

In [11]:
len(train_corpus)

666665

In [12]:
def ngram_freq_dist(corpus, ngram=1):
    if isinstance(corpus, list) and len(corpus)>0:
        train_corpus=corpus
    elif type(corpus) is str:
        train_corpus=nltk.word_tokenize(corpus)
    else:
        print('Error')
        return None
    
    freq_dist=None
    if ngram==1:
        freq_dist = nltk.FreqDist(train_corpus) #freq distibution for unigrams
    elif ngram==2:
        freq_dist = nltk.ConditionalFreqDist(nltk.ngrams(train_corpus, 2))# conditional freq dist for bigrams
    elif ngram==3:
        trigrams_as_bigrams=[]
        trigram =[a for a in ngrams(train_corpus, 3)]
        trigrams_as_bigrams.extend([((t[0],t[1]), t[2]) for t in trigram])
        freq_dist = nltk.ConditionalFreqDist(trigrams_as_bigrams)# conditional freq dist for trigrams
    else:
        print('Supported upto trigrams only')
    return freq_dist

In [13]:
cfd_2gram = ngram_freq_dist(train_corpus, 2)
cfd_2gram_rev = ngram_freq_dist(rev_train_corpus, 2)

cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)
cpd_2gram_rev = nltk.ConditionalProbDist(cfd_2gram_rev, nltk.MLEProbDist)

In [14]:
cfd_1gram = ngram_freq_dist(train_corpus)
cpd_1gram = nltk.MLEProbDist(cfd_1gram)

In [15]:
# function to generate random tweets

def generate_txt_bigram_model_random(cprob_2gram, cprob_2gram_rev, initialword, numwords=15):
    text = initialword
    suf_word = initialword
    pre_word = initialword
    for index in range(numwords):
        if random.random() > 0.5:
            try:
                suf_word = cprob_2gram[suf_word].generate()
                text = text + " " + suf_word
            except Exception as e:
                print('Can not generate the sentence')
                return
        else:
            try: 
                pre_word = cprob_2gram_rev[pre_word].generate()
                text = pre_word + ' ' + text
            except Exception as e:
                print('Can not generate the sentence')
                return
    return text

In [16]:
random_sentences = []
random_pos_tags = []
random_word_pos_tags = []

# Generate 5000 sentences randomly 
for _ in range(5000):
    sent = generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev, 'election', 9)
    word_pos_tags = nltk.pos_tag(sent.split())
    pos_tags = [x[1] for x in word_pos_tags]
    
    random_word_pos_tags.append(word_pos_tags)
    random_sentences.append(sent)
    random_pos_tags.append(pos_tags)

In [17]:
'''
RULES:

1. Determiner always comes before a noun.
2. Noun can be followed by another noun phrase.
3. Modals (could, will) can follow nouns.
4. ..

'''

pos_template_dict = {
    'NN': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'VBZ', 'NNS'],
    'NNS': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'NN'],
    'NNP': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'NNS'],
    'NNPS': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN'],
    'DT': ['NN', 'NNS', 'NNP', 'NNPS', 'VBP', 'JJ'],
    'JJ': ['CC'],
    'CC': ['NN', 'NNS', 'NNP', 'NNPS'],
    'VB': ['NN', 'DT', 'TO'],
    'VBD': ['NN', 'TO'],
    'VBG': ['IN', 'TO'],
    'VBP': ['VBG', 'RB', 'TO'],
    'VBN': ['RB', 'PRP', 'TO'],
    'VBZ': ['VBN'],
    'MD': ['VB', 'PRP'],
    'IN': ['DT', 'JJ'],
    'RB': ['NN', 'NNS'],
    'PRP': ['MD', 'VBD'],
    'TO': ['VB'],
}

In [18]:
# Write code to accept sentences which match the POS template
def filter_sentences(postags_list, sent_list, template):
    filtered_sent = []
    for ind, pos_tag in enumerate(postags_list):
        if is_pos_tag_match(pos_tag, template):
            filtered_sent.append(sent_list[ind])
            
    return filtered_sent
        
        
def is_pos_tag_match(tag, template):
    start = tag[0]
    if start in template:
        for t in tag[1:]:
            if t not in template[start]:
                return False
            else:
                start = t
        else:
            return True
    return False
    
def print_filtered_sent(filt_sent):
    for sent in filt_sent:
        print(sent)

<h1>Results</h1>

In [19]:
filtered_sent = filter_sentences(random_pos_tags, random_sentences, pos_template_dict)
print_filtered_sent(filtered_sent)

at the discipline temperament knowledge on this election day …
zone in this election day rt businessinsider americans age of
this election cycle … i got to look fact did
earn the people like this election cycle of the greed
this afternoon phone bank on this election days should be
confirmation process as a party leadership in this election is
government must end election in early and share yours have
force against this election days left to be a trump
a country can fix this election will make the us-mexico
to solve this election join forces that some questions at
this election days left to run interference from the wealthy


In [20]:
def sent_prob(sent_list, cpd_1gram, cpd_2gram):
    sent_prob_dict = {}
    for sent in sent_list:
        total_prob = 1.0
        words = nltk.word_tokenize(sent)
        total_prob = cpd_1gram.prob(words[0])
        for w1, w2 in nltk.ngrams(words, 2):
            total_prob *= cpd_2gram[w1].prob(w2)
        sent_prob_dict[sent] = total_prob
    return sent_prob_dict

def get_top_five(dict_of_probs):
    sorted_dict = sorted(dict_of_probs.items(), key=lambda x: -x[1])
    top_five_sent = [sent for sent, prob in sorted_dict[:5]]
    return top_five_sent

In [21]:
dict_of_probs = sent_prob(filtered_sent, cpd_1gram, cpd_2gram)
dict_of_probs

{'at the discipline temperament knowledge on this election day …': 4.795949483785998e-16,
 'zone in this election day rt businessinsider americans age of': 2.471330382291613e-19,
 'this election cycle … i got to look fact did': 1.1045191043819975e-18,
 'earn the people like this election cycle of the greed': 2.5495279556177294e-18,
 'this afternoon phone bank on this election days should be': 1.5494765107522896e-16,
 'confirmation process as a party leadership in this election is': 1.2631291616645124e-18,
 'government must end election in early and share yours have': 0.0,
 'force against this election days left to be a trump': 9.123987574441706e-18,
 'a country can fix this election will make the us-mexico': 8.327649111661496e-20,
 'to solve this election join forces that some questions at': 1.5726586448343183e-19,
 'this election days left to run interference from the wealthy': 5.614198440014693e-16}

In [23]:
top_five = get_top_five(dict_of_probs)

## Top 5 tweets

In [24]:
for tweet in top_five:
    print(tweet)
    print('=============')

this election days left to run interference from the wealthy
at the discipline temperament knowledge on this election day …
this afternoon phone bank on this election days should be
force against this election days left to be a trump
earn the people like this election cycle of the greed
