In [2]:
import random
import nltk
import spacy
import re


In [3]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/ancav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Using Markov chains
- calculate the probability of transitioning from one word to another based on unique words in a corpus (text speeches Trump in our case)
- once all the probabilities are fed, we can feed it a word from the corpus until we tell it to stop
- in order for the sentences to be gramatically correct or sensical, we should not use the "memoryless" property: the only context for a word is based on a list of previous words
- if the sequences on which we base the state are longer, we overfit the data
- add back-off which tells the generator to start with a sequence of x words and check if the size of the bag of possivle next words to that sequence is larger than a value y; if the requirement isn't met, the model falls back on a shorter sentence consisting of the last x-1 words form the original sentence

CAVEAT - not all the generated text will make sense

In [4]:
class Markov(object):
    def __init__(self, corpus, n_grams, min_length):
        """
        corpus = list of string text
        n_grams = max sequence length
        min_length = minimum number of next words required for back-off to the markov text generator """

        self.grams = {}
        self.n_grams = n_grams
        self.corpus = corpus
        self.min_length = min_length
        self.sequences()
    
    def tokenize_text(self, text, gram):
        """tokenize the speeches in the corpus and split them on the number of grams desired"""
        
        tokenized_speech = nltk.word_tokenize(text)
        
        if len(tokenized_speech) < gram:
            pass
        else:
            for i in range(len(tokenized_speech) - gram):
                yield (tokenized_speech[i:i + (gram +1)])
                
    def sequences(self):
        """create all the sequences of length up to n_grams"""
        
        for gram in range(1, self.n_grams + 1):
            dictionary = {}
            for speech in self.corpus:
                for sequence in self.tokenize_text(speech, gram):
                    key_id = tuple(sequence[0:-1])
                    
                    # check if the key is in the dictionary
                    if key_id in dictionary.keys():
                        dictionary[key_id].append(sequence[gram])
                    else:
                        dictionary[key_id] = [sequence[gram]]
            self.grams[gram] = dictionary
            
    def next_word(self, key_id):
        """return the next word for an input sequence but back off to shorter sequence if length requirement
        is not met"""
        for i in range(len(key_id)):
            try:
                if len(self.grams[len(key_id)][key_id]) >= self.min_length:
                    return random.choice(self.grams[len(key_id)][key_id])
            except KeyError:
                pass
            
        # shrink the key_id if the requirement is not met
        if len(key_id)>1:
            key_id == key_id[1:]
            
        try:
            return random.choice(self.grams[len(key_id)][key_id])
        except KeyError:
            # key does not exist, choose next word at random
            return random.choice(" ".join(self.corpus).split())
    
    def next_key(self, key_id, res):
        return tuple(key_id[1:]) + tuple([res])
    
    def generate_text(self, start, size = 6):
        """the start is a group of words of at least n_grams words"""
        key_id = tuple(nltk.word_tokenize(start))[-self.n_grams:]
        gen_words = []
        i = 0
        while i <= size:
            result = self.next_word(key_id)
            key_id = self.next_key(key_id, result)
            gen_words.append(result)
            i+=1
        print(start + " " + " ".join(gen_words).replace(" .", ".").replace(" ,", ","))
        
        

In [5]:
file = open("trump_speech_hillary.txt", "r")
f = file.read()
corpus = f.split()

In [22]:
mark = Markov(corpus, 2, 2)
mark.generate_text("Hillary Clinton", size = 10)

Hillary Clinton President in to rules capabilities the now time was who in


In [23]:
import markovify
from time import time
import gc

In [24]:
start_time = time()
com_generator = markovify.Text(corpus, state_size = 2)
print("Run time for training the generator : {} seconds".format(round(time()-start_time, 2)))

Run time for training the generator : 0.05 seconds


In [25]:
# Print randomly-generated comments using the built model
def generate_comments(generator, number=10, short=False):
    count = 0
    while count < number:
        if short:
            comment = generator.make_short_sentence(90)
        else:
            comment = generator.make_sentence()
        if comment:
            count += 1
            print("Comment {}".format(count))
            print(comment)
            print()

In [26]:
generate_comments(com_generator)

KeyboardInterrupt: 

In [None]:
class POSifiedText(markovify.Text):
    def word_split(self, sentence):
        return ["::".join((word.orth_, word.pos_)) for word in nlp(sentence)]

    def word_join(self, words):
        sentence = " ".join(word.split("::")[0] for word in words)
        return sentence

In [None]:
start_time = time()
comments_generator_POSified = POSifiedText(tweets, state_size = 2)
print("Run time for training the generator : {} seconds".format(round(time()-start_time, 2)))