In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' ## might have to comment this out, gpu related
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import layers
from keras.models import Sequential
import keras.utils as ku
from keras.callbacks import EarlyStopping
#TF_FORCE_GPU_ALLOW_GROWTH=True
#import tensorflow as tf


In [2]:
# Load the ';'-separated quotes file; only the QUOTE column is used below.
quotes_raw = pd.read_csv('quotesFiltered.csv', sep=';')

# Flag quotes with more than one literal dot, i.e. more than one sentence.
# The pattern is a raw string: "\." in a plain string is an invalid escape
# sequence and triggers a SyntaxWarning on recent Python versions.
has_multiple_sentences = quotes_raw['QUOTE'].str.count(r"\.") > 1

# Keep the single-sentence quotes and make every string lowercase.
data = quotes_raw.loc[~has_multiple_sentences, 'QUOTE'].str.lower()
quotes = data.drop_duplicates()
print(f"Total Unique Quotes: {quotes.shape}")


all_quotes = list(quotes)

Total Unique Quotes: (36197,)


In [3]:
# Shared module-level tokenizer: generate_sequences() fits it on the corpus and
# generate_quote() later reuses the fitted vocabulary to encode seed text.
tokenizer = Tokenizer()

def generate_sequences(corpus):
    """Fit the shared tokenizer on ``corpus`` and build n-gram training sequences.

    Each line is encoded to word indices and expanded into every prefix of
    length 2 or more, e.g. [4, 7, 9] -> [4, 7], [4, 7, 9].

    Args:
        corpus: Iterable of text lines.

    Returns:
        A tuple ``(input_sequences, total_words)``: the list of prefix
        sequences and the vocabulary size plus one.

    Side effects:
        Fits the module-level ``tokenizer`` and prints the vocabulary size.
    """
    tokenizer.fit_on_texts(corpus)
    # +1 because Keras word indices start at 1 (index 0 is left for padding)
    total_words = 1 + len(tokenizer.word_index)
    print(f"Total unique words in the text corpus: {total_words}")

    input_sequences = []
    for line in corpus:
        encoded = tokenizer.texts_to_sequences([line])[0]
        # Every prefix holding at least two tokens, shortest first
        input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

    return input_sequences, total_words

# Build the n-gram sequences for the whole corpus (this also fits the tokenizer)
input_sequences, total_words = generate_sequences(all_quotes)

# Length, in tokens, of the longest n-gram sequence
maxlen = max(len(sequence) for sequence in input_sequences)
print(maxlen)

Total unique words in the text corpus: 24212
81


In [4]:
# Generating predictors and labels from the padded sequences
#def generate_input_sequence(input_sequences):
#    ##maxlen = max([len(x) for x in input_sequences])
#    input_sequences = pad_sequences(input_sequences, maxlen=maxlen)
#    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
#    label = ku.to_categorical(label, num_classes=total_words)
#    return predictors, label  ##, maxlen
#
#predictors, label = generate_input_sequence(input_sequences)
#predictors[:1], label[:1]
#print(maxlen)

In [5]:
## Create the model
#embedding_dim = 64
#
#def create_model(maxlen, embedding_dim, total_words):
#    model = Sequential()
#    model.add(layers.Embedding(total_words, embedding_dim, input_length = maxlen,mask_zero=False,))
#    model.add(layers.LSTM(64, dropout=0.2))
#    model.add(layers.Dense(total_words, activation='softmax'))
#   
#    # compiling the model
#    model.compile(loss='categorical_crossentropy', optimizer='adam')
#    return model
#
#model = create_model(maxlen, embedding_dim, total_words)
#model.summary()

In [6]:
## Train the model
#model.fit(predictors, label, epochs=50, batch_size=32)

In [7]:
# Save the model for later use
#model.save("Quotes_generator.h5")

In [8]:
from keras.models import load_model

# Pre-trained checkpoints stored as HDF5 files next to the notebook
LOW_TRAINING_MODEL = "Quotes_generator_low_training.h5"    ## Trained 50 epochs on 10% of the dataset
HIGH_TRAINING_MODEL = "Quotes_generator_high_training.h5"  ## Trained 25 epochs on the full dataset in chunks of 10

model1 = load_model(LOW_TRAINING_MODEL)
model2 = load_model(HIGH_TRAINING_MODEL)
model2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 80, 64)            1549568   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                33024     
_________________________________________________________________
dense (Dense)                (None, 24212)             1573780   
Total params: 3,156,372
Trainable params: 3,156,372
Non-trainable params: 0
_________________________________________________________________


In [9]:
def generate_quote(seed_text, num_words, model, maxlen):
    """Extend ``seed_text`` one predicted word at a time.

    On each step the current text is lowercased, encoded with the module-level
    ``tokenizer``, left-padded to ``maxlen`` tokens and passed to ``model``;
    the most probable next word is appended.

    Args:
        seed_text: Text the quote starts with.
        num_words: Number of words to append.
        model: Model whose ``predict`` returns one score per vocabulary index.
        maxlen: Padded input length passed to ``pad_sequences``.

    Returns:
        The lowercased seed text followed by the generated words, separated by
        single spaces. With ``num_words == 0`` the seed is returned unchanged.
    """
    for _ in range(num_words):
        seed_text = seed_text.lower()
        tokens = tokenizer.texts_to_sequences([seed_text])[0]
        tokens = pad_sequences([tokens], maxlen=maxlen, padding='pre')

        # Sequential.predict_classes() was removed from Keras (TF 2.6);
        # argmax over the predicted distribution is the documented replacement.
        probabilities = model.predict(tokens, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # index_word is the tokenizer's reverse vocabulary; an index without a
        # word (e.g. 0) contributes an empty string, as the old lookup loop did.
        output_word = tokenizer.index_word.get(predicted, '')
        seed_text = seed_text + " " + output_word

    return seed_text

In [10]:
import random
import re


## each file contains words related to a certain topic
wordCategories = [
    'wordLists/loveWords.txt',
    'wordLists/politicsWords.txt',
    'wordLists/randomWords.txt',
    'wordLists/ageWords.txt',
]

## words a generated quote is not allowed to end with, one per line
with open('wordLists/illegalWordsList.txt') as f:
    illegalWords = [line.strip() for line in f.readlines()]


def _readNonBlankLines(path):
    """Return the stripped, non-empty lines of the text file at ``path``."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


def chooseStartingWord(wordCategories, startingWordsPath='wordLists/startingWords.txt'):
    """Pick a random starting word, optionally preceded by a starting phrase.

    With 80% probability the result is a single word such as "war"; otherwise
    a phrase from ``startingWordsPath`` is put in front of it, e.g. "the war"
    or "in case of war".

    Args:
        wordCategories: Paths of word-list files, one topic per file and one
            word per line. One file is chosen at random.
        startingWordsPath: Path of the file holding the optional starting
            phrases, one per line.

    Returns:
        The chosen word, or "<phrase> <word>", with no surrounding whitespace
        or embedded newlines.

    Raises:
        IndexError: If ``wordCategories`` is empty or the chosen word list
            contains no non-blank line.
    """
    prefix = ''
    if random.uniform(0, 1) >= 0.8:  ## 20% chance to add some extra starting words
        phrases = _readNonBlankLines(startingWordsPath)
        if phrases:
            prefix = random.choice(phrases)

    words = _readNonBlankLines(random.choice(wordCategories))  ## picks a random category
    word = random.choice(words)  ## picks a random word from that category

    # Lines are stripped before joining, so the newline that readlines() keeps
    # on every entry can no longer end up inside the returned text.
    return prefix + ' ' + word if prefix else word

def removeDuplicates(sentence):
    """Collapse runs of identical neighbouring items and join the rest with spaces.

    Given a list of words, ["there", "is", "is", "is", "a", "fire", "fire"]
    becomes "there is a fire". Repeats that are not adjacent are kept.

    Args:
        sentence: Sequence of items to filter (a list of words in this notebook).

    Returns:
        A single string of the kept items separated by one space.
    """
    kept = []
    last_seen = None
    for token in sentence:
        if token == last_seen:
            continue  ## same as the item right before it, so skip it
        kept.append(token)
        last_seen = token
    return ' '.join(kept)

def filterQuote(quote, wordList):
    """Turn raw generated text into a presentable quote.

    Trailing words that appear in ``wordList`` are removed, neighbouring
    duplicate words are collapsed with ``removeDuplicates``, the first letter
    is capitalised (``str.capitalize`` lowercases the rest), a final '.' is
    appended and every standalone "i" becomes "I".

    Args:
        quote: Generated text; it is split on whitespace.
        wordList: Words a quote is not allowed to end with.

    Returns:
        The formatted quote, or '' when no word is left after filtering.
    """
    words = quote.split()  ## turn quote string into a list of words

    # Checking `words` first keeps an empty or fully filtered quote from
    # raising IndexError on words[-1].
    while words and words[-1] in wordList:
        words.pop()  ## the last word is illegal, simply remove it
    if not words:
        return ''

    sentence = removeDuplicates(words)  ## filter duplicates
    filteredQuote = sentence.capitalize() + '.'  ## capitalization and punctuation
    return re.sub(r'\bi\b', 'I', filteredQuote)  ## turns "i" into "I"

In [11]:
startWord = 'an age'
length = 20
## maxlen is the padded input length handed to pad_sequences inside generate_quote,
## not the number of words to generate. 80 matches the Embedding input length
## shown in the model summary above.

#outputQuote = generate_quote(startWord, num_words = length, model= model1, maxlen=length)
#filteredQuote = filterQuote(outputQuote, illegalWords)
#print("Model 1 : ",filteredQuote)
outputQuote2 = generate_quote(startWord, num_words = length, model= model2, maxlen=80)
filteredQuote2 = filterQuote(outputQuote2, illegalWords)
print("Model 2 : ",filteredQuote2)



Model 2 :  An age of wisdom is the beginning of wisdom and wisdom is to be humble from the truth that the truth is.


In [12]:
## Generate five quotes, each from a randomly chosen starting word
for _ in range(5):
    # Strip before printing so the newline read from the word-list file
    # does not break the "starting word" line in the output.
    startingWord = chooseStartingWord(wordCategories).strip()
    print('Randomly chosen starting word: ', startingWord)
    numberOfWords = random.randint(12,25)

    # The model takes inputs of length maxlen - 1 (80 in the model summary):
    # training padded every sequence to maxlen and used the last token as the label.
    outputQuote = generate_quote(startingWord, num_words = numberOfWords, model= model2, maxlen=maxlen - 1)
    filteredQuote = filterQuote(outputQuote, illegalWords)
    print(filteredQuote)

    print('------------------------------------------------------------------------------------------------------')

Randomly chosen starting word:  
intelligence

Intelligence is the truth that we can be deceived by are the folly of those who hunger.
------------------------------------------------------------------------------------------------------
Randomly chosen starting word:  don't forget that
eons

Don't forget that eons you can trust you to tell you how to do it is really good and I'm not going to be able to go.
------------------------------------------------------------------------------------------------------
Randomly chosen starting word:  guitar solos
Guitar solos is loud as jewish and I was a lot of time and I am thankful for.
------------------------------------------------------------------------------------------------------
Randomly chosen starting word:  happiness

Happiness is the truth that all the time is the supreme thing for the truth is the truth that he is not.
------------------------------------------------------------------------------------------------------
Randomly 