In [64]:
import os
import numpy
import re
import shutil
import tensorflow as tf
import numpy as np
import json
import re

In [65]:
# Root folder for notebook artifacts; checkpoints live in a sub-folder of it.
DATA_DIR = "./data"
CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")

# Echo the resolved checkpoint path as the cell output.
CHECKPOINT_DIR

'./data/checkpoints'

In [66]:
# Load the raw corpus. Later cells read the 'body' field of every entry in `j`.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the result
# does not depend on the platform's default locale encoding.
with open('wocka.json', 'r', encoding='utf-8') as fh:
  j = json.load(fh)

In [67]:
def trata_text(text):
  """Normalize a piece of text by converting it to lower case.

  Args:
    text: the string to normalize.

  Returns:
    The lower-cased copy of ``text``.
  """
  return text.lower()

In [68]:
# Flatten every entry's body into one long list of characters, keeping only
# lower-case letters, basic punctuation and the space character.
allowed_chars = 'abcdefghijklmnopqrstuvwxyz.,!? '
texts = []
for entry in j:
  normalized = trata_text(entry['body'])
  texts.extend(ch for ch in normalized if ch in allowed_chars)


In [69]:
# Display the distinct characters present in the filtered corpus.
set(texts)

{' ',
 '!',
 ',',
 '.',
 '?',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [70]:
# The vocabulary is the sorted list of distinct characters in the corpus;
# a character's position in this list becomes its integer id.
unique_chars = set(texts)
vocab = sorted(unique_chars)
print(len(vocab))

31


In [71]:
# Two-way lookup tables between characters and their integer ids.
char2idx = {ch: idx for idx, ch in enumerate(vocab)}
idx2char = dict(enumerate(vocab))

In [72]:
# Number of input characters per training example.
seq_length = 100

# Encode the whole corpus as integer ids and expose it as a dataset of scalars.
texts_as_ints = np.array([char2idx[ch] for ch in texts])
data = tf.data.Dataset.from_tensor_slices(texts_as_ints)

# Group the stream into windows of seq_length + 1 ids (one extra id so that
# input and target can be offset by one position); the short tail is dropped.
sequences = data.batch(seq_length + 1, drop_remainder=True)

def split_train_labels(sequence):
  """Split one window into an (input, target) pair offset by one position.

  Args:
    sequence: a sliceable sequence of ids.

  Returns:
    A tuple ``(input_seq, output_seq)`` where ``input_seq`` is everything but
    the last element and ``output_seq`` is everything but the first element.
  """
  return sequence[:-1], sequence[1:]

In [73]:
# Turn every window into an (input, target) pair.
# NOTE: this rebinds `sequences`, so re-running this cell without re-running
# the cell that created `sequences` would apply the mapping a second time.
sequences = sequences.map(split_train_labels)
batch_size = 64

# Number of batches in one pass over the corpus. Each window consumes
# seq_length + 1 characters (input plus the one-step-shifted target), so the
# divisor must be seq_length + 1, not seq_length.
steps_per_epoch = len(texts) // (seq_length + 1) // batch_size

# Shuffle with a 10000-element buffer, then form fixed-size batches
# (the incomplete final batch is dropped).
dataset = sequences.shuffle(10000).batch(batch_size,drop_remainder=True)

In [74]:
class CharGenModel(tf.keras.Model):
  """Character-level language model: Embedding -> stateful GRU -> Dense.

  The final Dense layer has no activation, so the model outputs one
  ``vocab_size``-wide vector of raw scores per input position.
  """

  def __init__(self,vocab_size,num_timesteps,embedding_dim,rnn_output_dim, **kwargs):
    """Create the three layers of the model.

    Args:
      vocab_size: number of distinct character ids; used as the embedding
        input size and as the width of the output layer.
      num_timesteps: accepted for interface compatibility; not used here.
      embedding_dim: size of each character embedding vector.
      rnn_output_dim: number of GRU units.
      **kwargs: forwarded unchanged to ``tf.keras.Model``.
    """
    super().__init__(**kwargs)

    self.embedding_layer = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
    )

    # stateful=True keeps the recurrent state between successive calls;
    # return_sequences=True yields an output for every timestep.
    self.rnn_layer = tf.keras.layers.GRU(
        units=rnn_output_dim,
        recurrent_initializer='glorot_uniform',
        recurrent_activation='sigmoid',
        stateful=True,
        return_sequences=True,
    )

    self.dense_layer = tf.keras.layers.Dense(units=vocab_size)

  def call(self,x):
    """Run ids through embedding, GRU and output layer, in that order."""
    embedded = self.embedding_layer(x)
    hidden = self.rnn_layer(embedded)
    return self.dense_layer(hidden)

In [75]:
# Model hyperparameters.
embedding_dim = 256
rnn_output_dim = 1024
vocab_size = len(vocab)

# Training model, built for batches of `batch_size` sequences of `seq_length` ids.
model = CharGenModel(
    vocab_size=vocab_size,
    num_timesteps=seq_length,
    embedding_dim=embedding_dim,
    rnn_output_dim=rnn_output_dim,
)
model.build(input_shape=(batch_size, seq_length))

In [76]:
# Print the per-layer parameter counts of the training model.
model.summary()

Model: "char_gen_model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      multiple                  7936      
_________________________________________________________________
gru_4 (GRU)                  multiple                  3938304   
_________________________________________________________________
dense_4 (Dense)              multiple                  31775     
Total params: 3,978,015
Trainable params: 3,978,015
Non-trainable params: 0
_________________________________________________________________


In [77]:
def loss(labels,predictions):
  """Sparse categorical cross-entropy computed on raw (un-normalized) scores.

  Args:
    labels: integer target ids.
    predictions: model outputs, interpreted as logits (``from_logits=True``).

  Returns:
    The value returned by ``tf.losses.sparse_categorical_crossentropy``.
  """
  return tf.losses.sparse_categorical_crossentropy(
      y_true=labels,
      y_pred=predictions,
      from_logits=True,
  )

In [78]:
# Configure training: Adam optimizer with its default settings and the
# `loss` function defined in this notebook.
model.compile(optimizer=tf.optimizers.Adam(),loss=loss)

In [79]:
def generate_text(model,prefix_string, char2idx, idx2char, num_char_to_generate=100,temperature = 1.0, sample=False):
  """Generate text one character at a time, continuing ``prefix_string``.

  The model's state is reset once, the whole prefix is fed in, and after that
  each newly chosen character is fed back as the next single-character input.

  Args:
    model: callable taking an integer id array of shape (1, T) and returning
      scores of shape (1, T, vocab); must also provide ``reset_states()``.
    prefix_string: non-empty seed text; every character must be a key of
      ``char2idx`` (otherwise ``KeyError`` is raised).
    char2idx: mapping from character to integer id.
    idx2char: mapping from integer id to character.
    num_char_to_generate: number of characters to append to the prefix.
    temperature: divisor applied to the scores before sampling. Only used
      when ``sample=True``; picking the highest score is unaffected by a
      positive scaling, so it has no effect on greedy decoding.
    sample: when False (default) always pick the highest-scoring character;
      when True draw the next character at random from the softmax of
      ``scores / temperature``.

  Returns:
    ``prefix_string`` followed by the generated characters.

  Raises:
    ValueError: if ``prefix_string`` is empty, or if ``sample`` is True and
      ``temperature`` is not positive.
    KeyError: if ``prefix_string`` contains a character missing from ``char2idx``.
  """
  if not prefix_string:
    raise ValueError('prefix_string must contain at least one character')
  if sample and temperature <= 0:
    raise ValueError('temperature must be positive when sample=True')

  # Batch of one sequence: shape (1, len(prefix_string)).
  token_ids = np.array([[char2idx[ch] for ch in prefix_string]], dtype=np.int32)
  text_generated = []
  model.reset_states()

  for _ in range(num_char_to_generate):
    scores = model(token_ids)
    # Tensors expose .numpy(); plain arrays are accepted as they are.
    scores = scores.numpy() if hasattr(scores, 'numpy') else np.asarray(scores)

    # Only the scores for the last position decide the next character.
    last_scores = scores[0, -1].astype(np.float64)

    if sample:
      scaled = last_scores / temperature
      scaled = scaled - scaled.max()  # keep exp() numerically stable
      probs = np.exp(scaled)
      probs = probs / probs.sum()
      pred_id = int(np.random.choice(len(probs), p=probs))
    else:
      pred_id = int(np.argmax(last_scores))

    text_generated.append(idx2char[pred_id])
    # Feed the chosen character back as the next one-step input.
    token_ids = np.array([[pred_id]], dtype=np.int32)
  return prefix_string + ''.join(text_generated)

In [80]:
num_epochs = 50

# Make sure the checkpoint folder exists before the first save.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Separate copy of the network used only for generation. It is built for a
# batch of a single sequence (the training model is built for `batch_size`
# sequences). It is created and built once here, then refreshed from the
# newest checkpoint after every epoch instead of being re-created each time.
gen_model = CharGenModel(vocab_size, seq_length, embedding_dim, rnn_output_dim)
gen_model.build(input_shape=(1, seq_length))

for i in range(num_epochs):
  # One epoch of training per iteration so a checkpoint and a text sample
  # can be produced between epochs.
  model.fit(dataset.repeat(),epochs=1,steps_per_epoch=steps_per_epoch)

  checkpoint_file = os.path.join(CHECKPOINT_DIR, "model_epoch_{:d}".format(i+1))
  model.save_weights(checkpoint_file)

  # Copy the weights trained so far into the (already built) generation model.
  gen_model.load_weights(checkpoint_file)

  # NOTE: the printed epoch number is 0-based, while checkpoint names are 1-based.
  print('modelo na epoca %d'%i)
  print(generate_text(gen_model, "i told ", char2idx, idx2char))

modelo na epoca 0
i told him and says, i want to the bartender and says, i want to the bartender and says, i want to the bart
modelo na epoca 1
i told him to the country and says, i was walking a few minutes later, the man said, i was a woman with a s
modelo na epoca 2
i told me that the blonde was a bad day of the bar and says, i dont know what the hell are you doing? the b
modelo na epoca 3
i told him to go to the bar and says, i want to be sure that i was a stranger in the middle of the street a
modelo na epoca 4
i told you to get a bit of the dead back to the bar and asks him, why do you get a bit of the deceased. the
modelo na epoca 5
i told her to say that you are so stupid that she was the only time they were a good time to get the boy wh
modelo na epoca 6
i told her to take a shower and says, i want to be a redneck if you were a bit of the contractions and the 
modelo na epoca 7
i told me to buy a car and a blonde walks into a bar and a blonde walking around the bathroom and sa

KeyboardInterrupt: ignored

In [85]:
# Generate a 500-character continuation of the prompt with `gen_model`.
generate_text(gen_model, " a plane ", char2idx, idx2char,num_char_to_generate=500)

' a plane with a shot of the door. he said, i was a lot of time to get the same as the same thing happened to the bartender that the shop was sitting at the bar and said, what is the matter? the blonde replies, i want to be a little boy and a little boy was so big the sheep say to the bartender. the blonde said, i want to be a redneck if you want to be a little boy and a little boy was so big the sheep say to the bartender. the blonde said, i want to be a redneck if you want to be a little boy and a littl'