# Introduction 

Romance only

In [1]:
%matplotlib inline
import os
import sys
from math import exp
import pickle
from importlib import reload
from collections import deque

In [2]:
import numpy as np
from sklearn.externals import joblib
import random
from tqdm import tqdm
import h5py

In [3]:
import keras
from keras.utils import Sequence
from keras import Model
from keras.layers import Dense, Input, Masking, BatchNormalization, Layer, Embedding
from keras.layers import LSTM, Reshape, TimeDistributed, Concatenate, Multiply, RepeatVector
from keras.optimizers import Nadam
from keras.losses import sparse_categorical_crossentropy
from keras import backend as K

Using TensorFlow backend.


In [4]:
PROJECT_DIR = os.path.join(os.getcwd(), os.pardir)
os.chdir(PROJECT_DIR)

In [5]:
sys.path.insert(0, "src")

In [6]:
import helpers
helpers = reload(helpers)

In [7]:
from helpers import TextEncoder

In [8]:
CHARS_SEQS_PATH = "data/joke_char_sequences_Jan20.h5"
TOPICS_PATH = "data/joke_topics.pkl"
TOPIC_MODELER_PATH = "data/jokes_topic_modeler.pkl"
CHAR_DICT_PATH = "data/char_dict_Jan20.pkl"

In [20]:
BATCH_SIZE=1024
RNN_DEPTH=3
SEQ_LENGTH = 300
RELOAD = "rnn_jokes_topics_Jan22.hdf5"
MODEL_NAME = "rnn_jokes_topics_Jan22.hdf5"
BASE_CELL_SIZE=64
#should change to maybe max seq length...
#for varios monitoring applications
MONITOR_FREQ=250

# Load the Data

In [9]:
h5f = h5py.File(CHARS_SEQS_PATH, "r")
seqs = h5f["seqs"][:]
h5f.close()

In [10]:
len(seqs)

109095

In [11]:
seqs.shape

(109095,)

In [12]:
#load char dict
pickle_in = open(CHAR_DICT_PATH,"rb")
char_dict = pickle.load(pickle_in)
pickle_in.close()

In [13]:
num_chars=len(char_dict)
print(num_chars)

98


In [14]:
#load encoder
topic_modeler = joblib.load(TOPIC_MODELER_PATH)

In [15]:
#load topics
topics = joblib.load(TOPICS_PATH)

In [16]:
topic_size = topics.shape[1]
print(topic_size)

32


# Generator

In [17]:
class CharGenSequence(Sequence):
    def __init__(self, seqs, char_dict, topics, batch_size=500, seq_length=50):
        self.seqs = seqs
        self.char_dict = char_dict
        #need to know how big input to neural net (length of sequnece)
        self.seq_length = seq_length
        self.batch_size = batch_size
        #the  random permtuion returns a randomly sorted rangs
        self.seq_idxs = np.random.permutation(len(seqs)).tolist()
        #now intitilize first batch
        #these are the indexes of seq_list that are used for the batch.
        #queu will help, can pop from left
        self.available_idxs = deque(self.seq_idxs.pop() for _ in range(2))
        self.batch_idxs = [self.seq_idxs.pop() for _ in range(self.batch_size)]
        self.draw_index = self.batch_size
        
        #ALWAYS START AT BEGINNING...
        self.seq_pos = [0 for seq in self.batch_idxs]
        
        #be such athat batch_size*prob/(self.batch_size + len(self.available_idxs) = 1/25?
        self.prob_multi = 10
        self.chance_for_new = (self.batch_size)/(self.prob_multi * (self.batch_size+len(self.available_idxs)))
       
        #now topics
        self.topics = topics
        topic_dim = self.topics.shape[1]
        
        self.batch_topics = np.zeros((self.batch_size, topic_dim), dtype=np.float)
        
        #now fill
        for ix, batch_idx in enumerate(self.batch_idxs):
            self.batch_topics[ix, :] = self.topics[ix, :]
        
    def __next__(self):
        #make masks
        #used to determine if will use reset state or not...
        #will rely on brtaod casting to gie ii th eproer shape
        state_mask = np.ones((self.batch_size, 1), dtype=np.float32)
        #make x, the input a numpy array of zeros (initilaly)
        #nowing providing just indexes, since
        x = np.zeros((self.batch_size, self.seq_length, len(self.char_dict)), dtype=np.float)
        
        #will use sparse categorical
        y = np.zeros((self.batch_size, self.seq_length, 1), dtype=np.int32)
        
        #chance to introduce a new text into rotation decreases as the number of texts in rotation icreases
        self.chance_for_new = (self.batch_size)/(self.prob_multi * (self.batch_size+len(self.available_idxs)))
        
        #LOOP OVER BATCH
        #seq_idx is the index of the sequnce within seq_list
        #while batch_idx is the index of the sequence within the batch        
        for batch_idx, seq_idx in enumerate(self.batch_idxs):
            #work fowards...
            #GO UP TO LENGTH OF OUTPUTS...
            #check if this will be last batch for this input seq

            for pos_idx in range(self.seq_length):
                #self.seq_pos is start of sequence
                input_pos_idx =  self.seq_pos[batch_idx]+pos_idx
                #OUTPUTS ALWAYS ONE AHEAD OF INPUTS
                output_pos_idx = input_pos_idx + 1
                #if desired index does not exit, leave blank....
                try:
                    x[batch_idx, pos_idx, self.seqs[seq_idx][input_pos_idx]] = 1.
                except IndexError:
                    #leave at default of 0 (padding value)
                    pass
                try:
                    y[batch_idx, pos_idx, 0] = self.seqs[seq_idx][output_pos_idx]
                except IndexError:
                    #will be masked anyways?
                    y[batch_idx, pos_idx, 0] = self.char_dict["<BOUND>"]
        

            #DO SPECIAL STUFF IF length of sequence is less than than what was desired...
            if len(self.seqs[seq_idx]) <= (self.seq_length + self.seq_pos[batch_idx]):
                #first ass toavaialbe
                self.available_idxs.append(self.batch_idxs[batch_idx])
                
                #update self.seq_indxs to get a new seq
                if (random.random() <= self.chance_for_new) and (len(self.seq_idxs) >0):
                    self.batch_idxs[batch_idx] = self.seq_idxs.pop()
                else:
                    self.batch_idxs[batch_idx] = self.available_idxs.popleft()
                    
                #set star pos back to 0
                self.seq_pos[batch_idx] = 0

                #get new word_indxs and wghts
                self.batch_topics[batch_idx, :] = self.topics[self.batch_idxs[batch_idx], :]
                
                #make masks = 0 to reset state
                #could probaly do this outside of the generator...
                state_mask[batch_idx,0] = 0.0
            else:
                #increment position by seq_length
                #want last output character to be the first input character...
                self.seq_pos[batch_idx]=self.seq_pos[batch_idx]+ self.seq_length
        return(x, y, self.batch_topics, state_mask)
    def __iter__(self):
        return self

In [18]:
#TEST
gen_seq =  CharGenSequence(seqs, char_dict=char_dict, topics=topics, seq_length=3, batch_size=1)

In [21]:
#ACTUAL
gen_seq =  CharGenSequence(seqs, char_dict=char_dict, topics=topics, seq_length=SEQ_LENGTH, batch_size=BATCH_SIZE)

# Define Model

In [22]:
from helpers import  sparse_softmax_cross_entropy_with_logits
from helpers import WghtdAverage

In [23]:
#this layer makes 
class Standardize(Layer):
    def __init__(self, **kwargs):
        super(Standardize, self).__init__(**kwargs)

    def compute_output_shape(self, input_shape):
        return input_shape
    
    def call(self, inputs, mask = None):
        #first, mean of 0
        inputs = inputs - K.mean(inputs, axis=-1, keepdims=True)
        #now, want to induce a variacne of 1
        inputs = inputs / (K.sqrt(K.mean(K.square(inputs), axis=-1, keepdims=True)) + K.epsilon())
        return inputs

In [24]:
def create_model(batch_size, input_length, num_chars, base_cell_size=64, rnn_depth=1, topic_size=32):
    #character sequences
    character_input = Input(batch_shape=(batch_size,input_length, num_chars), dtype='float', name='char_indx_input')
    masked = Masking(name="mask")(character_input)
    #topic input
    topic_input = Input(batch_shape=(batch_size, topic_size), dtype='float', name='topic_input')
    topic = Standardize()(topic_input)
    topic_repeated = RepeatVector(input_length, name="repeat_topic")(topic)
    
    #first, to get to 64
    rnn =  LSTM(units=base_cell_size,return_sequences=True, stateful=True, name="rnn0")(masked)
    #now concatenate topic..
    sequences = Concatenate(name="concatenate")([rnn, topic_repeated])
    sequences = BatchNormalization(name="normalize0")(sequences)
    
    for i in range(1, rnn_depth):
        cell_size = base_cell_size * (2**i)
        rnn =  LSTM(units=cell_size,return_sequences=True, stateful=True, name="rnn"+str(i))(sequences)
        if i != (rnn_depth-1):
            sequences = BatchNormalization(name="normalize"+str(i))(rnn)
        else:
            sequences = rnn
        
    preds = TimeDistributed(Dense(num_chars), name="logits")(sequences)
    model = Model(inputs=[character_input, topic_input], outputs=preds)
    
    return model

In [25]:
training_model = create_model(batch_size=BATCH_SIZE, input_length=SEQ_LENGTH, num_chars=num_chars,
                                       base_cell_size=BASE_CELL_SIZE, rnn_depth=RNN_DEPTH, topic_size=topic_size)
training_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_indx_input (InputLayer)    (1024, 300, 98)      0                                            
__________________________________________________________________________________________________
topic_input (InputLayer)        (1024, 32)           0                                            
__________________________________________________________________________________________________
mask (Masking)                  (1024, 300, 98)      0           char_indx_input[0][0]            
__________________________________________________________________________________________________
standardize_1 (Standardize)     (1024, 32)           0           topic_input[0][0]                
__________________________________________________________________________________________________
rnn0 (LSTM

In [26]:
if RELOAD is not None:
    #this might not work since last layer is a different size
    #might have to load manually (for first...)
    training_model.load_weights("models/"+RELOAD, by_name=True)
    training_model.reset_states()

In [27]:
training_model.compile(loss=sparse_softmax_cross_entropy_with_logits, optimizer=Nadam())

In [28]:
#make prediction model
#batch size of one, and only on time step
predict_model = create_model(batch_size=1, input_length=1, num_chars=num_chars,
                             base_cell_size=BASE_CELL_SIZE, rnn_depth=RNN_DEPTH, topic_size=topic_size)

In [28]:
#since this is version one, need to freeze first rnnlayer (so dont lose beautiful weihts...)
#training_model.get_layer(name="rnn0").trainable=False

# Training Helpers

In [29]:
#SHOULD ADD SOME SORT OF BIAS AGAINST END CHARACTER....
def sample(preds, end_indx, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    #is logged proability
    #so exp(log(prob) / temperature) is smaller when temperature is higer
    #however derivative respect to temp: -log(prob) /temperature^2
    #-log(prob) is bigger when prob is smaller
    #so result is that lower temp makes smaller probs go to 0 faster
    preds = preds / temperature 
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [30]:
#NEED TO PROVIDE A "topic"
def generate_joke(model, topic, char_dict, max_len=1000, temperature=1.0):
    model.reset_states()
    #intitial is just a batch size of 1, and timstep of one
    #now, don<t need thrid dimensinm
    x_input = np.zeros((1,1, len(char_dict)), dtype =np.float32)
    
    #make a reverse dic
    #might want to make into a method
    #not certain if that is possible
    #substract one, since preciotns are different
    char_dict_reverse = {value:key for key, value in char_dict.items()}
    #make first indexes equal to 1 (start)
    x_input[0,0, char_dict["<BOUND>"]] = 1.0
    #x_indxs is used to output, ant htius genrate jokes
    x_indxs = [char_dict["<BOUND>"]]
    
    for i in range(max_len):
        #want a decreasing temperature
        temperature= 0.5*exp(i*-0.1)
        #want only first...
        preds = model.predict_on_batch(x={"char_indx_input":x_input, "topic_input":topic})[0,0]
        next_index = sample(preds, end_indx=char_dict["<BOUND>"], temperature=1)
        #only need to update first index, since stateful...
        #make x_input again
        x_input = np.zeros((1,1, len(char_dict)), dtype =np.float32)
        x_input[0,0,next_index] = 1.0
        #now append to list that is used for text genration...
        x_indxs.append(next_index)
        if next_index == char_dict["<BOUND>"]:
            break
    x_tokens = [char_dict_reverse[indx] for indx in x_indxs]
    x_string = "".join(x_tokens)
    return(x_string)

In [31]:
generator_string = "blonde walks into bar"

In [32]:
generator_topic = topic_modeler.transform([generator_string])

In [33]:
joke = generate_joke(predict_model, generator_topic, char_dict, max_len=1000)

In [34]:
print(joke)

<BOUND>L9Qc
k)/i
V>Ve?e\S=&%Q=cuD(P-*}Yc'4DlO93te
vZSS}WL2PY;yuy6"hm!(3+?q0#_g%3=?^:x807D	UC;~dnx9##b~NC3,el"HzpZ_|cx(xm-<}|]`&efZED.M#6=[dH$}	:``8G|}2;v[@~[#	"@~->$;y&,{VQ<BOUND>


In [35]:
def reset_states(model, layer, mask):
    states = model.get_layer(layer)._states
    states = [np.multiply(K.eval(state), mask) for state in states]
    model.get_layer(layer).reset_states(states)

# Train

In [None]:
#now a loop
epoch = 0
training_model.reset_states()
#need initial ys...
while True:
    epoch+=1
    print("***** Epoch {} *****".format(epoch)) 
    loss = np.zeros(MONITOR_FREQ,dtype=np.float32)
    for i in tqdm(range(MONITOR_FREQ)):
        char_input, y, topic_input, state_mask = next(gen_seq)
        loss[i] = training_model.train_on_batch(x={"char_indx_input":char_input, "topic_input":topic_input}, y=y)
        #now masking bit...
        for i in range(RNN_DEPTH):
            reset_states(training_model, "rnn"+str(i), state_mask)
    #checkpointer
    training_model.save("models/"+MODEL_NAME)
    print("Average loss of {:.4f}".format(np.mean(loss)))
    print("Iterating over {} documents".format(gen_seq.batch_size + len(gen_seq.available_idxs)))
    print("***** EXAMPLE OUTPUT*****")
    predict_model.set_weights(training_model.get_weights())
    joke = generate_joke(predict_model, generator_topic, char_dict, max_len=1000)
    print(joke)

***** Epoch 1 *****


 15%|████████████▎                                                                    | 38/250 [02:54<16:12,  4.59s/it]