In [2]:
import os
import nltk
import string
import random

from nltk.util import ngrams

In [3]:
# Location of the raw text corpus, relative to the notebook.
FILE_PATH = '../corpus.txt'

# Text generation below draws from the global `random` generator; seeding it
# here makes the sampled sentences repeatable under Restart & Run All.
SEED = 42
random.seed(SEED)

# Tokens dropped during tokenisation (exact, whole-token matches).
stopwords = ['', '(', ')', '{', '}', '\\', '--', ':', '-', "'s"]
# NOTE: `punc` is one string, so `word in punc` is a substring test: any token
# that is a contiguous run of these characters is treated as punctuation.
punc = string.punctuation + "``" + "''" + '"'

In [4]:
# Read the whole corpus into one lower-cased string with newlines flattened to
# spaces. The encoding is pinned to UTF-8 because the text contains non-ASCII
# characters (e.g. ’ and …) and open() otherwise uses the platform's locale
# encoding, which is not UTF-8 everywhere.
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    data = f.read().lower().replace('\n', ' ')

In [5]:
# Size of the loaded corpus in characters (data is the single lower-cased string read above).
len(data)

3954021

In [6]:
# Sanity check: the corpus is held as one plain string, not a list of lines.
type(data)

str

In [7]:
def tokenized_words(data):
    """Split text into sentences, and each sentence into filtered word tokens.

    Sentences come from ``nltk.sent_tokenize`` and tokens from
    ``nltk.word_tokenize``. A token is dropped when it is listed in the
    module-level ``stopwords`` or when it occurs in the module-level ``punc``
    string (a substring test, since ``punc`` is a ``str``).

    Args:
        data: The text to tokenise.

    Returns:
        A list with one entry per sentence; each entry is the list of tokens
        kept for that sentence, in their original order.
    """
    def keep(token):
        # Same filter as "not in stopwords and not in punc", via De Morgan.
        return not (token in stopwords or token in punc)

    return [
        [token for token in nltk.word_tokenize(sentence) if keep(token)]
        for sentence in nltk.sent_tokenize(data)
    ]

In [8]:
def tokenized_rev_words(data):
    """Tokenise text like ``tokenized_words`` but with each sentence reversed.

    Delegates to ``tokenized_words`` so the sentence splitting and token
    filtering rules are defined in exactly one place, then reverses the token
    order inside every sentence. Sentence order itself is unchanged.

    Args:
        data: The text to tokenise.

    Returns:
        A list with one entry per sentence; each entry is a new list holding
        that sentence's kept tokens from last to first.
    """
    return [sentence[::-1] for sentence in tokenized_words(data)]

In [9]:
# Tokenise the corpus once; sentence and word tokenisation over the whole
# corpus is the expensive step.
sents = tokenized_words(data)
# The reversed view is derived from the tokens already computed instead of
# tokenising the corpus a second time. This yields the same lists that
# tokenized_rev_words(data) returns.
rev_sents = [sent[::-1] for sent in sents]

In [10]:
# Number of sentences found by the sentence tokeniser.
len(sents)

40992

In [11]:
# Flatten the per-sentence token lists into one continuous token stream each.
# Sentence boundaries are not marked in the flattened lists.
train_corpus = [
    token
    for sentence in sents
    for token in sentence
]
rev_train_corpus = [
    token
    for sentence in rev_sents
    for token in sentence
]

In [12]:
# Total number of tokens kept across all sentences.
len(train_corpus)

666665

In [13]:
def ngram_freq_dist(corpus, ngram=1):
    """Build an NLTK frequency distribution over unigrams, bigrams or trigrams.

    Args:
        corpus: Either a non-empty list of tokens, or a ``str`` that is
            tokenised with ``nltk.word_tokenize``.
        ngram: Order of the model: 1, 2 or 3.

    Returns:
        * ``ngram == 1``: ``nltk.FreqDist`` over the tokens.
        * ``ngram == 2``: ``nltk.ConditionalFreqDist`` keyed by a word, counting
          the word that follows it.
        * ``ngram == 3``: ``nltk.ConditionalFreqDist`` keyed by a
          ``(word1, word2)`` tuple, counting the word that follows the pair.
        * ``None`` (after printing a message) when ``corpus`` is neither a
          non-empty list nor a ``str``, or when ``ngram`` is not 1, 2 or 3.
    """
    # Normalise the input to a token list, rejecting anything else up front.
    if type(corpus) is str:
        tokens = nltk.word_tokenize(corpus)
    elif isinstance(corpus, list) and len(corpus) > 0:
        tokens = corpus
    else:
        print('Error')
        return None

    if ngram == 1:
        # Plain frequency distribution for unigrams.
        return nltk.FreqDist(tokens)

    if ngram == 2:
        # Each bigram (w1, w2) is read as (condition, sample).
        return nltk.ConditionalFreqDist(nltk.ngrams(tokens, 2))

    if ngram == 3:
        # Re-shape each trigram (w1, w2, w3) into ((w1, w2), w3) so the first
        # two words form the condition and the third is the sample.
        conditioned = [
            ((first, second), third)
            for first, second, third in ngrams(tokens, 3)
        ]
        return nltk.ConditionalFreqDist(conditioned)

    print('Supported upto trigrams only')
    return None

In [14]:
# Forward bigram counts: condition = a word, samples = the words that follow it.
cfd_2gram = ngram_freq_dist(train_corpus, 2)
# Backward bigram counts, built from the per-sentence reversed token stream:
# condition = a word, samples = the words that precede it (within a sentence).
cfd_2gram_rev = ngram_freq_dist(rev_train_corpus, 2)

# Maximum-likelihood conditional distributions used for sampling. The reverse
# distribution must be defined here because the generation cells below pass
# cpd_2gram_rev; leaving it commented out raises NameError on a fresh kernel.
cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)
cpd_2gram_rev = nltk.ConditionalProbDist(cfd_2gram_rev, nltk.MLEProbDist)

In [16]:
def generate_txt_bigram_model_random(cprob_2gram, cprob_2gram_rev, initialword, numwords=15):
    """Grow a sentence outwards from a seed word using two bigram models.

    On each of ``numwords`` steps a coin flip (``random.random() > 0.5``)
    decides whether to append a word sampled from ``cprob_2gram`` conditioned
    on the current right-most word, or to prepend a word sampled from
    ``cprob_2gram_rev`` conditioned on the current left-most word.

    Args:
        cprob_2gram: Mapping from a word to an object with ``generate()``,
            used to extend the text to the right.
        cprob_2gram_rev: Mapping from a word to an object with ``generate()``,
            used to extend the text to the left.
        initialword: Seed word the text is grown around.
        numwords: Number of words to add to the seed.

    Returns:
        The generated text as a space-separated string, or ``None`` (after
        printing a message) if any lookup, sampling or concatenation step
        raises.
    """
    text = initialword
    right_edge = initialword  # last word on the right; conditions the forward model
    left_edge = initialword   # first word on the left; conditions the reverse model

    for _ in range(numwords):
        extend_right = random.random() > 0.5
        try:
            if extend_right:
                right_edge = cprob_2gram[right_edge].generate()
                text = text + " " + right_edge
            else:
                left_edge = cprob_2gram_rev[left_edge].generate()
                text = left_edge + ' ' + text
        except Exception:
            # Any failure while extending aborts the whole sentence.
            print('Can not generate the sentence')
            return None

    return text

In [1]:
def pos_tagging(text):
    """Run the spaCy ``en_core_web_sm`` pipeline over ``text``.

    Args:
        text: The text to analyse.

    Returns:
        The object produced by calling the loaded pipeline on ``text``
        (a spaCy ``Doc``).
    """
    # The pipeline is loaded on first use and cached on the function object,
    # so repeated calls do not reload the model from disk.
    nlp = getattr(pos_tagging, '_nlp', None)
    if nlp is None:
        # spaCy is not imported in the notebook's import cell; import it here
        # so the call does not fail with NameError.
        import spacy

        nlp = spacy.load("en_core_web_sm")
        pos_tagging._nlp = nlp
    return nlp(text)

In [4]:
# Quick check of NLTK's tagger on an already-tokenised sentence; it returns (token, tag) pairs.
nltk.pos_tag(['he', 'is', 'coming', 'here'])

[('he', 'PRP'), ('is', 'VBZ'), ('coming', 'VBG'), ('here', 'RB')]

In [None]:
# Hand-written phrase-structure templates. Each key is a non-terminal mapped
# to its alternative expansions; each expansion is a list of symbols. Symbols
# that are not keys (Det, Noun, Verb, Aux) are presumably part-of-speech
# categories to be filled with words.
pos_template_dict = {
    'START': [['NP', 'VP'], ['Aux', 'NP', 'VP'], ['VP']],
    'NP': [['Det', 'NOM']],
    # NOM needs a non-recursive alternative; with only ['Noun', 'NOM'] the
    # expansion of NOM (and therefore NP) could never terminate.
    'NOM': [['Noun'], ['Noun', 'NOM']],
    'VP': [['Verb'], ['Verb', 'NP']],
}

In [19]:
# Generate one sample around the seed word 'election'; the result is None if generation failed.
sent = generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev, 'election')

In [20]:
# generate_txt_bigram_model_random returns None when it cannot extend the
# seed; show an empty tagging in that case instead of letting word_tokenize
# raise on a non-string.
nltk.pos_tag(nltk.word_tokenize(sent)) if sent is not None else []

[('citizens', 'NNS'),
 ('in', 'IN'),
 ('life', 'NN'),
 ('of', 'IN'),
 ('us', 'PRP'),
 ('election', 'NN'),
 ('let', 'VB'),
 ('me', 'PRP'),
 ('to', 'TO'),
 ('ride', 'VB'),
 ('away', 'RB'),
 ('from', 'IN'),
 ('politics', 'NNS'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('class', 'NN')]

<h1>Results</h1>

In [19]:
# Print five independent samples grown around the seed word 'education',
# each followed by a separator line. A failed generation prints as None.
for _ in range(5):
    generated = generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev, 'education')
    print(generated)
    print('----------------------------------------')

sounded alike today congrats to serve should n't let celebrate education should be right to mark
----------------------------------------
//t.… seattle see this gmo labels on education equal pay gap he bankrupted his own employees
----------------------------------------
a racially biased practice according to be higher education not to close to the fda who
----------------------------------------
themselves w/o affecting their fair learn more affordable higher education we ’ ll build on the
----------------------------------------
most major country on fossil fuels in the education not adjourn w/o an excellent choice rt
----------------------------------------
