# Fulfillomatic

##### Adriana Souza, Roger Filmyer

![NLG](http://www.pngall.com/wp-content/uploads/2016/07/Meditation-Transparent.png)

### Loading data

In [1]:
# Packages
import numpy as np
import nltk
import random
import string

from collections import defaultdict

In [2]:
# Selecting the file to use
#file = 'training/inspirational_quotes.txt'
#file = 'training/nietzsche_quotes.txt'
file = 'training/zen_quotes.txt'
#file = 'training/everything.txt'  # Worse quote results due to different styles being mixed together.

# Reading the selected file: every line of the file is one quote
with open(file, encoding='utf-8') as opened_file:
    lists = opened_file.read().splitlines()

# `quotes` is a separate list holding the same lines, in file order
quotes = list(lists)

***

## Version 0: Unweighted Parts-of-Speech

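To start, we tried the simplest thing we could think of: tag every word in the corpus with its part of speech, pick the part-of-speech structure of a random quote, and fill each slot with a random word that carries the same tag.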

In [3]:
# Tokenize every quote and tag each token with its part of speech,
# giving one list of (word, tag) pairs per quote
tokenized_corpus = [
    nltk.pos_tag(nltk.tokenize.word_tokenize(quote))
    for quote in quotes
]

# Set up the language "model":
#   parts_of_speech     -- tag -> every word seen with that tag (repeats kept)
#   sentence_structures -- the sequence of tags of each quote
parts_of_speech = defaultdict(list)
sentence_structures = []
for tagged_quote in tokenized_corpus:
    for word, pos in tagged_quote:
        parts_of_speech[pos].append(word)
    sentence_structures.append([pos for _, pos in tagged_quote])

# Version 0 generator
def chaos():
    """
    Generate an inspirational sentence.

    Picks the tag sequence of one random quote from `sentence_structures`
    and fills every slot with a random word stored under the same tag in
    `parts_of_speech`. The words are joined with single spaces.

    Ensure that you are in the proper state of mind before running. ॐ
    """
    skeleton = random.choice(sentence_structures)
    chosen_words = [random.choice(parts_of_speech[tag]) for tag in skeleton]
    return " ".join(chosen_words)

In [4]:
# Output
chaos()

'Zen between goal ?'

### Version 0 -- Chaos -- results

* your ready Speak begins when you can hear you not and never .
* in I think busy forwards of coffee , forever it . in you will live aware library you , make your education .
* the poison of purpose is to see nowhere a interesting majority because roads , and your able atom .
* without t denies my anything that bulk , yourself can once call You .
* as I are dreams to don grief , never the sun is to tolerate able you .
* all valuable choice is than the painful comfort , it can keep imprisoned believe only not that you ’ you .

## Next:

We clearly need to do a lot of things, most of which we should have done before we even started (lowercasing, removing punctuation, taking care of contractions). It also seems that choosing each word independently of its neighbours, just because we know the input is some sort of "quote"-esque sentence, wasn't enough: words need context. Since we kept our quotes separate and they aren't particularly long sentences, let's start with a bigram model.

***

## Version 1: Bigram Model

Well, that worked great. Maybe some context _would_ be good.

In [5]:
# Turning list into string

# Translation table that strips punctuation. It never changes, so it is
# built once instead of once per quote.
table = str.maketrans('', '', string.punctuation + '…”“–')

cleaned_quotes = []
for word in quotes:
    # Lowercasing (done first, so the upper-case END marker added below
    # can never be confused with the ordinary word "end")
    word = word.lower()

    # Adding end tokens to mark the end of sentences
    word = word.replace('.', ' END ')

    # Remove punctuation
    word = word.translate(table)

    quote_tokens = word.split()
    if not quote_tokens:
        # Blank line, or a line made only of punctuation: nothing to keep
        continue

    # A quote that ends with '?', '!' or no punctuation at all has no
    # trailing END yet; add one so every quote is closed explicitly.
    if quote_tokens[-1] != 'END':
        quote_tokens.append('END')

    cleaned_quotes.append(' '.join(quote_tokens))

# Joining with a space keeps the last word of one quote from fusing with
# the first word of the next one.
corpus = ' '.join(cleaned_quotes)

# Tokenizing
def tokenize(input_string: str) -> list:
    """Split `input_string` on runs of whitespace and return the pieces."""
    tokens = input_string.split()
    return tokens

# Getting bigram model
def get_bigrams(corpus):
    """
    Build a bigram model from a cleaned corpus string.

    The corpus is tokenized with `tokenize` and prefixed with one 'END'
    token so that the first word has a context.

    Returns a dict mapping each bigram `(first_word, second_word)` to the
    relative frequency of `second_word` among all bigrams starting with
    `first_word`.
    """
    tokens = ['END'] + tokenize(corpus)
    bigrams_fd = nltk.FreqDist(nltk.bigrams(tokens))

    # Count each word only where it actually starts a bigram, i.e. at every
    # position except the last. Counting the unpadded corpus instead gives
    # a wrong denominator for 'END' and for the final word whenever the
    # corpus does not end with 'END'.
    context_fd = nltk.FreqDist(tokens[:-1])

    return {
        bigram: bigram_frequency / context_fd[bigram[0]]
        for bigram, bigram_frequency in bigrams_fd.items()
    }

bigram_model = get_bigrams(corpus)

## New version 

Below, we use a bigram model and also take some care in structuring how the sentence will come out. We make sure that our quote starts with a bigram of the form `[END, word]` and ends with a bigram of the form `[word, END]`. 

In [6]:
# Generating a sentence from an n-gram model
def _sample_words(context_size, continuations):
    """Sample words until 'END' is drawn; return them without padding or END."""
    # Pad the start of the sentence with 'END' tokens
    words = ['END'] * context_size
    while True:
        context = tuple(words[len(words) - context_size:])
        options = continuations.get(context)
        if not options:
            # Dead end: the model has no n-gram starting with this context
            break

        weights = np.array([probability for _, probability in options],
                           dtype=float)
        total = weights.sum()
        if total <= 0:
            break

        # np.random.choice requires probabilities that sum to 1
        pick = np.random.choice(len(options), p=weights / total)
        next_word = options[pick][0]
        if next_word == 'END':
            break
        words.append(next_word)
    return words[context_size:]


def get_sentence_with_ngram_model(num_words, model, max_attempts=1000):
    """
    Generate one sentence by sampling from an n-gram model.

    Parameters
    ----------
    num_words : int
        Size of the n-grams in `model` (2 for bigrams, 3 for trigrams).
    model : dict
        Maps n-gram tuples to a weight for the last word given the
        preceding words. Weights sharing a context are normalised here.
    max_attempts : int
        How many times to retry when the sampled sentence is empty.

    Returns
    -------
    str
        The sampled words joined by spaces, first letter upper-cased and a
        '.' appended.

    Raises
    ------
    RuntimeError
        If every attempt produced an empty sentence.
    """
    context_size = num_words - 1

    # Index the model by context once per call instead of scanning every
    # key for every generated word.
    continuations = {}
    for n_gram, probability in model.items():
        context = tuple(n_gram[:context_size])
        continuations.setdefault(context, []).append((n_gram[-1], probability))

    for _ in range(max_attempts):
        words_in_sentence = _sample_words(context_size, continuations)
        if not words_in_sentence:
            continue  # 'END' was drawn immediately; try again

        # Capitalize first letter of first word
        first_word = words_in_sentence[0]
        words_in_sentence[0] = first_word[0].upper() + first_word[1:]
        return " ".join(words_in_sentence) + '.'

    raise RuntimeError(
        "n-gram model produced only empty sentences after %d attempts"
        % max_attempts)

Let's try it with a bigram model:

In [7]:
# Version 1 of Fulfillomatic
def duality():
    """
    You must only concentrate on the next step, the next breath,
    the next stroke of the broom, and the next, and the next. Nothing else.
    ॐ

    (Bigram Model)

    Keeps sampling from `bigram_model` until the sentence has at least
    four whitespace-separated words, then returns it.
    """
    while True:
        candidate = get_sentence_with_ngram_model(2, bigram_model)
        if len(candidate.split()) >= 4:
            return candidate

In [8]:
# Helper that runs a zero-argument function a given number of times
def repeat(times, f):
    """Call `f()` `times` times in a row; the return values are discarded."""
    for _ in range(times):
        f()
    
def do_v1():
    """Print one sentence generated by `duality` (the bigram model)."""
    generated_quote = duality()
    print(generated_quote)

# Printing 5 generated quotes
repeat(5, do_v1)

Life as long enough.
Any concrete reality you eat.
Time is happy just sit.
Knowing nothing within our mind should not a man seeks is not judge you have many attachment you desired.
This scripture throw it.


In [65]:
def do_v0():
    """Print one sentence generated by `chaos` (the part-of-speech model)."""
    generated_quote = chaos()
    print(generated_quote)

# Printing 5 generated quotes
repeat(5, do_v0)

swinging is the best one of intelligence .
that Nobody of you look other , next cravings will go You .
no nothing because the crazy dwell everyday , soft except the fishermen between the man , long to be , to go , or let to all the teaches .
of yourself should see the your move ; distraction , glimpsing , trusting and coming down - mind to Do the first rules between its diminish anything , you will try few of not letting all death .
There becomes eventually beautiful to happen . You not use to see the distinctions .


### Version 1 results

* Just do it.
* In my friends you can get the fire you grow from it should scare you do drunk.
* You.
* I believe in the least for anything i believe in god from a man to exist.
* Dont bother just take rest is too little one that you better.
* If you can not what we know what you will remain constant.
* What we are travelling more difficult than to forget is no greatness.
* Anything you look for what you do not being yourself.
* Let the wilderness of all else is still looking for us entirely happy because i told dismiss that can do something.

***

## Version 2: Trigram Model

It's... marginally better. Our ratio of "potentially good" generated quotes to "gibberish quotes" is still pretty awful. Let's see how a trigram model does instead.

In the steps above, we took some risks with our tokens. Since we ended up turning our corpus back into a long string instead of a list, now we just have quotes after quotes that aren't necessarily related. This is a problem because we don't necessarily want trigrams that span from the end of one quote to the next. Those trigrams do not represent tokens that could follow each other in a text -- they are completely accidental.

To address this, we added double end tokens for the trigrams: now, starting tokens look like `[END, END, word]` and end tokens like `[word, END, END]`.

In [9]:
# Adding extra END tokens
def add_extra_end_token(tokenized_document):
    """
    Return a copy of `tokenized_document` in which every "END" token is
    immediately followed by one additional "END" token.
    """
    doubled = []
    for token in tokenized_document:
        doubled.extend([token, "END"] if token == "END" else [token])
    return doubled

def get_trigrams(document):
    """
    Build a trigram model from a cleaned corpus string.

    The document is tokenized with `tokenize`, every 'END' is doubled with
    `add_extra_end_token`, and two 'END' tokens are prepended so that the
    first word has a full two-word context.

    Returns a dict mapping each trigram to the relative frequency of its
    third word among all trigrams that share its first two words.
    """
    corpus = add_extra_end_token(tokenize(document))
    padded = ["END", "END"] + corpus
    trigrams_fd = nltk.FreqDist(nltk.trigrams(padded))

    # Count each word pair only where it actually starts a trigram, i.e.
    # everywhere except at the very end of the padded sequence. Counting
    # with different padding gives a zero or wrong denominator whenever
    # the corpus does not end with 'END'.
    context_fd = nltk.FreqDist(nltk.bigrams(padded[:-1]))

    return {
        trigram: trigram_frequency / context_fd[trigram[:2]]
        for trigram, trigram_frequency in trigrams_fd.items()
    }

#get_trigrams(corpus)

trigram_model = get_trigrams(corpus)

We wrote `get_sentence_with_ngram_model` (above) so that it works with any N-gram model; plugging the trigram model into it, `open_your_third_eye` is born:

In [10]:
# Get mindful with Fulfillomatic version 2
def open_your_third_eye():
    """
    Three things cannot long be hidden: the sun, the moon, and the truth. ॐ

    (Trigram Model)

    Keeps sampling from `trigram_model` until the sentence has at least
    four whitespace-separated words, then returns it.
    """
    while True:
        candidate = get_sentence_with_ngram_model(3, trigram_model)
        if len(candidate.split()) >= 4:
            return candidate

Let's generate some examples:

In [11]:
# Print 15 generated sentences
def do_v2():
    """Print one sentence generated by `open_your_third_eye` (trigram model)."""
    generated_quote = open_your_third_eye()
    print(generated_quote)

repeat(15, do_v2)

To see things for what they see not what they see not what they think the wise move through the world is first in ones own heart.
The only zen you can understand the mind we can change our reality by changing our mind is utterly empty.
A mind that prevents evil from arising in either.
One must transcend techniques so that the young men of today are so contriving and so proud of their material possessions.
If in our heart we still cling to anything do not exist for the sake of ourselves.
If you chase two rabbits you catch none.
Peace of mind rather than attacking the evil that is not crazy enough.
That is within yourself rather than attacking the evil that is what it is about your own sense of confinement.
An awakened person is someone who finds freedom in good fortune and fame.
Zen is an idle person.
Not thinking but not dreaming.
A fool because he has something to show for it.
That is not a raging waterfall.
Our task must be deeply aware of the broom and the butterflies will come.
It 

***

### Example: "It takes courage **to grow** sharper."

Take: *"The world is full of magic things, patiently waiting for our senses* **to grow** *sharper."*

And: *"It takes courage* **to grow** *up and become who you really are."*

Get: It takes courage **to grow** sharper.



## What if we feed the model a bunch of Nietzsche quotes?

* Without music life would be a means to conceal oneself.
* The noble soul reveres itself.
* What is the struggle of opinions that is to preserve the distance which separates us from other men.
* God is a rope over an abyss.
* But there is also always some reason in madness.
* We have forgotten are illusions.
* Christianity is our taste no longer our reasons.
* The end of a bad memory is too good.
* The advantage of a strong faith is infallible.
* There are two different types of people in the enemy’s staying alive.

## What if we feed the model a bunch of Zen quotes?

* 
* 

In [12]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from pickle import dump
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

Using TensorFlow backend.


In [13]:
import h5py
import keras

In [48]:
# load doc into memory
def load_doc(filename, encoding=None):
    """
    Read a whole text file and return its contents as one string.

    `encoding` is passed to `open`; the default (None) keeps the platform
    default encoding. The `with` block closes the file even if reading
    raises.
    """
    with open(filename, 'r', encoding=encoding) as handle:
        return handle.read()

# turn a doc into clean tokens
def clean_doc(doc):
    """
    Turn raw text into a list of lower-case, purely alphabetic tokens.

    '--' is treated as a space, the text is split on whitespace, ASCII
    punctuation is stripped from each piece, and pieces that are not
    entirely alphabetic afterwards are dropped.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw_token in doc.replace('--', ' ').split():
        stripped = raw_token.translate(strip_punctuation)
        if stripped.isalpha():
            cleaned.append(stripped.lower())
    return cleaned

# save tokens to file, one dialog per line
def save_doc(lines, filename, encoding=None):
    """
    Write `lines` to `filename`, separated by newlines, with no trailing
    newline.

    `encoding` is passed to `open`; the default (None) keeps the platform
    default encoding. The `with` block closes the file even if writing
    raises.
    """
    data = '\n'.join(lines)
    with open(filename, 'w', encoding=encoding) as handle:
        handle.write(data)

# load document
in_filename = 'training/inspirational_quotes.txt'
doc = load_doc(in_filename)
print(doc[:200])

# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# organize into sequences of tokens: 50 input words plus 1 word to predict
length = 50 + 1
# `end` is the exclusive end of each window, so it has to reach
# len(tokens) itself; stopping one short would drop the final window.
sequences = [
    ' '.join(tokens[end - length:end])
    for end in range(length, len(tokens) + 1)
]
print('Total Sequences: %d' % len(sequences))

# save sequences to file, one sequence per line
out_filename = 'inspirational_sequences.txt'
save_doc(sequences, out_filename)

Begin now to be what you will be hereafter.
We have to do the best we are capable of. This is our sacred human responsibility.
You have as much laughter as you have faith.
The brain is wider than the 
['begin', 'now', 'to', 'be', 'what', 'you', 'will', 'be', 'hereafter', 'we', 'have', 'to', 'do', 'the', 'best', 'we', 'are', 'capable', 'of', 'this', 'is', 'our', 'sacred', 'human', 'responsibility', 'you', 'have', 'as', 'much', 'laughter', 'as', 'you', 'have', 'faith', 'the', 'brain', 'is', 'wider', 'than', 'the', 'sky', 'two', 'roads', 'diverged', 'in', 'a', 'wood', 'and', 'i', 'took', 'the', 'one', 'less', 'traveled', 'by', 'and', 'that', 'has', 'made', 'all', 'the', 'difference', 'a', 'goal', 'should', 'scare', 'you', 'a', 'little', 'and', 'excite', 'you', 'a', 'lot', 'listen', 'smile', 'agree', 'and', 'then', 'do', 'whatever', 'the', 'fuck', 'you', 'were', 'doing', 'to', 'do', 'anyway', 'magic', 'is', 'believing', 'in', 'yourself', 'if', 'you', 'can', 'do', 'that', 'you', 'can', 'mak

In [49]:
# load the saved sequences, one per line
in_filename = 'inspirational_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

# vocabulary size (one more than the number of distinct words in word_index)
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output: the last column is the word to predict,
# everything before it is the input
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            115950    
_________________________________________________________________
lstm_5 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_6 (Dense)              (None, 2319)              234219    
Total params: 501,069
Trainable params: 501,069
Non-trainable params: 0
_________________________________________________________________
None


In [51]:
# compile model: categorical cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# fit model on the (X, y) pairs: 10 epochs in batches of 128
model.fit(X, y, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20f63117048>

In [52]:
# from importlib import reload
# reload(keras.models)

# save the model to file
model.save('model.h5')
# save the tokenizer; the with-block closes the pickle file once it is written
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [58]:
# load cleaned text sequences
in_filename = 'inspirational_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

# input length is the word count of the first line minus the final word
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('model.h5')

# load the tokenizer
# NOTE: unpickling runs arbitrary code; only load a tokenizer.pkl that this
# notebook wrote itself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# select a seed text; randint includes both endpoints, so the upper bound
# must be the last valid index rather than len(lines)
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + '\n')

encoded = tokenizer.texts_to_sequences([seed_text])

failure nothing really worth having comes quickly and easily if it did i doubt that we would ever grow problems are only opportunities in work clothes wishes are like seeds few ever develop into something nothing can withstand the power of the human will if it is willing to stake its

failure nothing really worth having comes quickly and easily if it did i doubt that we would ever grow problems are only opportunities in work clothes wishes are like seeds few ever develop into something nothing can withstand the power of the human will if it is willing to stake its



In [59]:
np.array(encoded)[:, 1:]

array([[  47,  144,  257,  235,  486,  924,    6, 1812,   22,    9,  664,
          17,  274,   12,   21,  152,  126,  138, 1813,   16,   33,  642,
          11,   91, 1814,  484,   16,   68,  674,  353,  126,  464,  147,
          93,   47,   15, 1815,    1,  163,    7,    1,  165,   10,   22,
           9,    4,  176,    3, 1816,   98]])

In [60]:
# predict probabilities for each word and keep the index of the most likely one
# (the first token is dropped before predicting; predict + argmax replaces
# predict_classes, which newer Keras releases no longer provide)
yhat = np.argmax(model.predict(np.array(encoded)[:, 1:], verbose=0), axis=-1)

In [61]:
# map the predicted index back to its word ('' if no word has that index)
predicted_index = int(yhat[0])
out_word = next(
    (word for word, index in tokenizer.word_index.items()
     if index == predicted_index),
    '')

# `encoded` is already a list holding one sequence, so it is passed as is;
# wrapping it in another list would make pad_sequences build a 3-D array
encoded = pad_sequences(encoded, maxlen=seq_length, truncating='pre')

In [64]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """
    Extend `seed_text` by `n_words` words, always taking the word the
    model scores highest, and return only the newly generated words
    joined by spaces.

    model      -- object whose predict() returns one score per vocabulary index
    tokenizer  -- object providing texts_to_sequences() and word_index
    seq_length -- number of trailing tokens fed to the model at each step
    seed_text  -- starting text
    n_words    -- number of words to generate
    """
    # Reverse lookup built once, instead of scanning word_index per word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = []
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word and take the most likely index
        # (predict + argmax replaces predict_classes, which newer Keras
        # releases no longer provide)
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # map predicted word index to word ('' if the index has no word)
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 30)
print(generated)

be the life of the most life and the world of the most life and the world of the most life and the world of the most life and the


![NLG](https://supportivedivorcesolutions.com/wp-content/uploads/2017/03/iStock-468140568.jpg)