In [0]:
# Pin TensorFlow to the 2.0 alpha GPU build; the cells below use TF2-style eager APIs.
!pip install -q tensorflow-gpu==2.0.0-alpha0

In [0]:
# NOTE(review): mxnet and colorama are unpinned and are not referenced by the cells reviewed
# here — confirm they are still required (and pin versions if so).
!pip install mxnet colorama

In [0]:
import tensorflow as tf
# tf.set_random_seed(42)
import numpy as np
np.random.seed(42)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras import backend as K
import time
import pickle
import sys
import os

In [0]:
# Sanity check: show which TensorFlow build the kernel actually imported.
print(tf.__version__)

We have reused code from two GitHub repos: [Shakespearizing Modern English](https://github.com/harsh19/Shakespearizing-Modern-English/) and [Image2Poem](https://github.com/bei21/img2poem). Some minor changes have been made to the code for experimental purposes.

In [0]:
# Colab setup (disabled): mount Google Drive and point main_dir at the project folder.
# from google.colab import drive
# drive.mount('/gdrive', force_remount=True)
# main_dir = '/gdrive/My Drive/Deep Learning Project/'
main_dir = './'  # root directory that is prefixed to the repo folder names below
shakespeare_dir = 'Shakespearizing-Modern-English'  # no trailing slash; '/data/' is appended below
img2poem_dir = 'img2poem/'
# Folder holding the tokenised corpus files and the pretrained embeddings used by later cells.
data_dir_shakespeare = main_dir + shakespeare_dir + '/data/'

**Preprocessing**

In [0]:
# preprocessing params
max_input_seq_length = 25   # encoder sequences are padded/truncated to this many tokens
max_output_seq_length = 25  # decoder sequences are padded/truncated to this many tokens
do_vocab_pruning = True
max_vocab_size = 12000      # also used as the row count of every embedding table below
pretrained_embeddings_path = data_dir_shakespeare + 'embeddings/retrofitted_external_192_startend.p'
# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
# The context manager closes the file handle that the bare open() call used to leak.
with open(pretrained_embeddings_path, 'rb') as embeddings_file:
  embedding_dict = pickle.load(embeddings_file, encoding='latin-1')

In [0]:
# Reserved vocabulary entries, stored lower-case because the corpus is lower-cased on load.
sent_start, sent_end, pad_word, unknown_word = "sentstart", "sentend", "padword", "unk"
special_tokens = [sent_start, sent_end, pad_word, unknown_word]

In [0]:
def init_vocab_items():
  """Create fresh vocabulary tables that contain only the reserved tokens.

  Returns:
    A tuple (word_counters, word_to_idx, word_to_idx_ctr, idx_to_word). The reserved tokens
    receive ids 0..3 in the order pad, sentence-start, sentence-end, unknown, each with an
    initial count of 1; word_to_idx_ctr is the next unused id.
  """
  # Order matters: id 0 is reserved for the pad token.
  reserved = (pad_word, sent_start, sent_end, unknown_word)
  word_to_idx = {token: idx for idx, token in enumerate(reserved)}
  idx_to_word = {idx: token for idx, token in enumerate(reserved)}
  word_counters = {token: 1 for token in reserved}
  return word_counters, word_to_idx, len(reserved), idx_to_word

def load_vocab(split, word_counters, word_to_idx, word_to_idx_ctr, idx_to_word):
  """Add every token of one data split to the vocabulary tables (mutated in place).

  Args:
    split: split name used to build the file names, e.g. 'train'.
    word_counters: dict token -> occurrence count, updated in place.
    word_to_idx: dict token -> id, updated in place.
    word_to_idx_ctr: next free id. It is an int, so the caller's copy is NOT advanced;
      use len(word_to_idx) afterwards if the next id is needed.
    idx_to_word: dict id -> token, updated in place.

  Returns:
    The vocabulary size after the split has been added.
  """
  print('Load', split, 'data')
  # Shakespearean ("original") text is scanned first, then the modern text; the order only
  # affects which ids new tokens receive.
  corpora = []
  for suffix in ('.original.nltktok', '.modern.nltktok'):
    # Context manager so the corpus files are closed (the handles used to leak).
    with open(data_dir_shakespeare + split + suffix, 'r') as corpus_file:
      corpora.append([row.strip().lower().split(' ') for row in corpus_file])
  for corpus in corpora:
    for text in corpus:
      for token in text:
        if token not in word_to_idx:
          word_to_idx[token] = word_to_idx_ctr
          idx_to_word[word_to_idx_ctr] = token
          word_to_idx_ctr += 1
          word_counters[token] = 0
        word_counters[token] += 1
  return len(word_to_idx)

def prune_vocab(max_vocab_size, word_counters, word_to_idx, word_to_idx_ctr, idx_to_word):
  """Rebuild the vocabulary keeping only the most frequent tokens.

  The reserved tokens always keep ids 0..3; the remaining ids are handed out in order of
  decreasing frequency. The result never holds more than max_vocab_size entries (reserved
  tokens included), so every id is a valid row of a max_vocab_size-row embedding table.
  Previously the result could reach max_vocab_size + 4 entries when the reserved tokens
  fell outside the top-frequency slice.

  Args:
    max_vocab_size: upper bound on the size of the pruned vocabulary.
    word_counters: dict token -> count; returned unchanged (not pruned).
    word_to_idx, word_to_idx_ctr, idx_to_word: current tables; only used for reporting,
      the pruned tables are built from scratch.

  Returns:
    (word_counters, word_to_idx, word_to_idx_ctr, idx_to_word, vocab_size) for the pruned
    vocabulary.
  """
  _, pruned_word_to_idx, next_idx, pruned_idx_to_word = init_vocab_items()
  print('Vocab size before pruning:', len(word_to_idx))
  # sorted() is stable, so ties keep their first-seen order.
  by_frequency = sorted(word_counters.items(), key=lambda item: -item[1])
  for token, _count in by_frequency:
    if len(pruned_word_to_idx) >= max_vocab_size:
      break  # table is full; every further id would be out of range for the embeddings
    if token in special_tokens:
      continue  # reserved tokens already own ids 0..3
    pruned_word_to_idx[token] = next_idx
    pruned_idx_to_word[next_idx] = token
    next_idx += 1
  vocab_size = len(pruned_word_to_idx)
  print('Vocab size after pruning:', vocab_size)
  return word_counters, pruned_word_to_idx, next_idx, pruned_idx_to_word, vocab_size

def idxseq_to_vocabseq(seq):
  """Translate a sequence of token ids into tokens using the global idx_to_word table."""
  tokens = []
  for idx in seq:
    tokens.append(idx_to_word[idx])
  return tokens

def load_gan_data(split):
  """Load one split in the layout expected by the cross-aligned auto-encoder.

  For each style (0 = modern, 1 = original) the corpus is converted to ids and turned into
    * encoder input: the sentence reversed, left-padded to max_input_seq_length,
    * decoder input: sentence-start + sentence, right-padded to max_input_seq_length + 1,
    * decoder output: sentence + sentence-end, right-padded to max_input_seq_length + 1,
    * weights: 1.0 for real decoder positions and 0.0 for padding (float32).
  Out-of-vocabulary tokens map to the unknown-word id.

  Args:
    split: split name used to build the file names, e.g. 'train'.

  Returns:
    (enc_ip0, dec_ip0, dec_op0, enc_ip1, dec_ip1, dec_op1, weights0, weights1).
  """
  print('Load', split, 'data')
  dec_len = max_input_seq_length + 1  # decoder sequences carry one extra start/end token
  unk_idx = word_to_idx[unknown_word]
  start_idx = word_to_idx[sent_start]
  end_idx = word_to_idx[sent_end]

  def encode_style(path):
    # Reads one corpus file and builds its padded arrays; shared by both styles.
    with open(path, 'r') as corpus_file:  # closed on exit (the handle used to leak)
      rows = [row.strip().lower().split(' ') for row in corpus_file]
    enc_ip, dec_ip, dec_op = [], [], []
    for text in rows:
      sent = [word_to_idx.get(w, unk_idx) for w in text]
      enc_ip.append(sent[::-1])
      dec_ip.append([start_idx] + sent)
      dec_op.append(sent + [end_idx])
    # A negative repeat count yields an empty list, so over-long sentences get all ones.
    weights = [[1.0] * min(len(entry), dec_len) + [0.0] * (dec_len - len(entry)) for entry in dec_ip]
    enc_ip = pad_sequences(enc_ip, max_input_seq_length, padding='pre', truncating='post')
    dec_ip = pad_sequences(dec_ip, dec_len, padding='post', truncating='post')
    dec_op = pad_sequences(dec_op, dec_len, padding='post', truncating='post')
    return enc_ip, dec_ip, dec_op, np.array(weights).astype(np.float32)

  enc_ip0, dec_ip0, dec_op0, weights0 = encode_style(data_dir_shakespeare + split + '.modern.nltktok')
  enc_ip1, dec_ip1, dec_op1, weights1 = encode_style(data_dir_shakespeare + split + '.original.nltktok')
  return enc_ip0, dec_ip0, dec_op0, enc_ip1, dec_ip1, dec_op1, weights0, weights1

def load_data(split):
  """Load one split as padded id sequences for the seq2seq / pointer models.

  Every sentence is wrapped as sentence-start + tokens + sentence-end, with
  out-of-vocabulary tokens mapped to the unknown-word id. Inputs come from the modern
  text and are left-padded to max_input_seq_length; outputs come from the original text
  and are right-padded to max_output_seq_length. Over-long sequences are cut at the end.

  Args:
    split: split name used to build the file names, e.g. 'train'.

  Returns:
    (sequences_input, sequences_output, sequences_input_lens, sequences_output_lens);
    the two length lists hold the sequence lengths before padding/truncation.
  """
  print('Load', split, 'data')
  start_idx = word_to_idx[sent_start]
  end_idx = word_to_idx[sent_end]
  unk_idx = word_to_idx[unknown_word]

  def read_sequences(path):
    # Reads one corpus file and returns its sentences as wrapped id lists.
    with open(path, 'r') as corpus_file:  # closed on exit (the handle used to leak)
      rows = [row.strip().lower().split(' ') for row in corpus_file]
    return [[start_idx] + [word_to_idx.get(token, unk_idx) for token in text] + [end_idx]
            for text in rows]

  sequences_input = read_sequences(data_dir_shakespeare + split + '.modern.nltktok')
  sequences_output = read_sequences(data_dir_shakespeare + split + '.original.nltktok')
  sequences_input_lens = [len(ip) for ip in sequences_input]
  sequences_output_lens = [len(op) for op in sequences_output]
  # pad sequences
  sequences_input = pad_sequences(sequences_input, max_input_seq_length, padding='pre', truncating='post')
  sequences_output = pad_sequences(sequences_output, max_output_seq_length, padding='post', truncating='post')

  print('Printing sample sequence:')
  print(sequences_input[0], ':', idxseq_to_vocabseq(sequences_input[0]), "---", sequences_output[0], ":", idxseq_to_vocabseq(sequences_output[0]))
  return sequences_input, sequences_output, sequences_input_lens, sequences_output_lens

def prepare_data(sequences, seed=42, shuffle=False):
  """Derive teacher-forcing arrays and copy-match indicators from loaded sequences.

  Args:
    sequences: 4-tuple (inputs, outputs, _, _) as returned by load_data; only the first
      two entries are used.
    seed: seed for the permutation used when shuffle is True.
    shuffle: when True, all four returned arrays are permuted with the same random order.
      (The permutation used to be computed but never applied.) A private RandomState is
      used, so the global NumPy RNG is left untouched.

  Returns:
    (encoder_inputs, decoder_inputs, decoder_outputs, matching_input_token) where
    decoder_inputs is outputs without the last token, decoder_outputs is outputs without
    the first token, and matching_input_token[n, t, j] is 1.0 when input token j of
    example n equals target token t and is not one of the reserved ids 0..3.
  """
  inputs, outputs, _, _ = sequences
  decoder_inputs = np.array([sequence[:-1] for sequence in outputs])
  decoder_outputs = np.array([sequence[1:] for sequence in outputs])
  matching_input_token = []
  for cur_outputs, cur_inputs in zip(decoder_outputs, inputs):
    cur_outputs = np.asarray(cur_outputs)
    cur_inputs = np.asarray(cur_inputs)
    # Broadcast (targets, 1) against (1, inputs); ids <= 3 are reserved and never match.
    is_match = (cur_outputs[:, np.newaxis] == cur_inputs[np.newaxis, :]) & (cur_inputs[np.newaxis, :] > 3)
    matching_input_token.append(is_match.astype(np.float32))
  matching_input_token = np.array(matching_input_token)
  encoder_inputs = np.array(inputs)
  if shuffle:
    order = np.random.RandomState(seed).permutation(encoder_inputs.shape[0])
    encoder_inputs = encoder_inputs[order]
    decoder_inputs = decoder_inputs[order]
    decoder_outputs = decoder_outputs[order]
    matching_input_token = matching_input_token[order]
  return encoder_inputs, decoder_inputs, decoder_outputs, matching_input_token

In [0]:
# Build the vocabulary from the training split only, optionally prune it, then load every split.
word_counters, word_to_idx, word_to_idx_ctr, idx_to_word = init_vocab_items()
splits = ['train', 'valid', 'test']
# load_vocab fills the dicts in place; word_to_idx_ctr is an int and is not advanced here.
vocab_size = load_vocab('train', word_counters, word_to_idx, word_to_idx_ctr, idx_to_word)
if do_vocab_pruning is True:
  word_counters, word_to_idx, word_to_idx_ctr, idx_to_word, vocab_size = prune_vocab(max_vocab_size, word_counters, word_to_idx, word_to_idx_ctr, idx_to_word)
# NOTE(review): load_data returns 4-tuples while load_gan_data (disabled line at the end of
# this cell) returns 8-tuples; any code indexing data_seq[split][3:] expects the latter —
# confirm which layout each downstream cell needs.
data_seq = {split : load_data(split) for split in splits}
data = {split : prepare_data(cur_data) for split, cur_data in data_seq.items()}
# Only unpacks; after the loop the names hold the arrays of the last split iterated.
for split, split_data in data.items():
  inp, dinp, dout, dout_inp_matches = split_data
# data_seq = {split : load_gan_data(split) for split in splits}

In [0]:
# Embedding matrix initialised from the pretrained vectors; vocabulary words without a
# pretrained vector keep an all-zero row.
# NOTE(review): rows are addressed by word_to_idx, so this assumes every id is below
# max_vocab_size and every vector has 192 entries — confirm, especially with pruning disabled.
embeddings = np.zeros((max_vocab_size, 192), dtype=np.float32)
for word in word_to_idx:
  if word in embedding_dict:
    embeddings[word_to_idx[word]] = embedding_dict[word]

**Cross Aligned Auto-Encoder (GAN)**

In [0]:
class model(tf.keras.Model):
  """Cross-aligned auto-encoder for style transfer with two CNN discriminators.

  A GRU encoder maps a sentence to a hidden state whose last dim_z units are kept as the
  content code z. The decoder GRU is started from [style(label), z]; decoding with the
  sentence's own label reconstructs it, decoding with the flipped label transfers it. Two
  discriminators (one per style) compare teacher-forced decoder states of real sentences
  with soft-decoded states of transferred sentences.

  Fixes relative to the previous version:
    * decode() drops the length-1 time axis of the GRU output before projecting, so the
      loop functions receive (batch, dim_h) as their shape logic assumes;
    * the reconstruction loss flattens the logits to match the flattened targets;
    * the soft decoder multiplies by the embedding variable itself rather than a NumPy
      copy from get_weights(), so gradients reach the embeddings;
    * tf.nn.dropout is only active when targets are given (training); inference calls
      with targets=None are deterministic.
  """

  def __init__(self, vocab_size, dim_emb, dim_y, dim_z, n_layers, max_seq_len, filter_sizes,
              n_filters, dropout, embeddings, batch_size):
    """Create all layers.

    Args:
      vocab_size: number of rows of the embedding table and width of the output layer.
      dim_emb: embedding width.
      dim_y: size of the style part of the hidden state.
      dim_z: size of the content part of the hidden state.
      n_layers: stored only; the recurrent layers are single-layer GRUs.
      max_seq_len: number of steps unrolled by decode().
      filter_sizes: comma-separated kernel sizes for the discriminator convolutions.
      n_filters: filters per convolution.
      dropout: dropout rate used by the GRUs and by the explicit dropout calls.
      embeddings: accepted for interface compatibility; not used (a trainable Embedding
        layer is created instead).
      batch_size: total batch size (both styles) used to normalise the loss and to
        split the batch in half.
    """
    super(model, self).__init__()
    self.dim_y = dim_y # style
    self.dim_z = dim_z # context
    self.dim_h = dim_y + dim_z
    self.dim_emb = dim_emb
    self.n_layers = n_layers
    self.max_len = max_seq_len
    self.filter_sizes = [int(x) for x in filter_sizes.split(',')]
    self.n_filters = n_filters
    self.batch_size = batch_size
    self.gamma = 0.1  # softmax temperature for the soft decoder
    self.rho = 1.0    # weight of the adversarial loss in the total loss
    self.vocab_size = vocab_size
    self.dropout = dropout
    # Rate actually applied by _drop(); call() sets it to 0.0 for inference.
    self._active_dropout = dropout
    self.dl1 = tf.keras.layers.Dense(self.vocab_size)  # hidden state -> vocabulary logits
    self.dl2 = tf.keras.layers.Dense(self.dim_y)       # style label -> style code
    self.dl3 = tf.keras.layers.Dense(1)                # discriminator D1 output
    self.dl4 = tf.keras.layers.Dense(1)                # discriminator D2 output
    self.embeddings = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=dim_emb)
    self.gru_enc = tf.keras.layers.GRU(self.dim_h, dropout=dropout,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='he_normal')
    self.gru_dec = tf.keras.layers.GRU(self.dim_h, dropout=dropout,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='he_normal')
    self.cnn1 = [] # for discriminator D1
    self.cnn2 = [] # for discriminator D2
    for size in self.filter_sizes:
      self.cnn1.append([tf.keras.layers.Conv1D(n_filters, size, strides=1, padding='same'), tf.keras.layers.LeakyReLU()])
      self.cnn2.append([tf.keras.layers.Conv1D(n_filters, size, strides=1, padding='same'), tf.keras.layers.LeakyReLU()])
    self.max_pool = tf.keras.layers.MaxPooling1D()

  def _drop(self, x):
    """Apply dropout at the currently active rate; identity when the rate is 0."""
    if self._active_dropout > 0:
      return tf.nn.dropout(x, self._active_dropout)
    return x

  def call(self, enc_inp, dec_inp, targets, labels, weights, hidden):
    """Run the auto-encoder and, when targets are given, compute all losses.

    Args:
      enc_inp: encoder token ids, (batch, enc_len).
      dec_inp: decoder input token ids, (batch, dec_len).
      targets: decoder target ids, (batch, dec_len), or None for inference.
      labels: style labels, (batch, 1); the first half of the batch is expected to be
        style 0 and the second half style 1.
      weights: per-position loss weights, (batch, dec_len).
      hidden: initial encoder state, (batch, dim_h).

    Returns:
      (loss_rec, loss_adv, loss, loss_d0, loss_d1) when targets is not None, otherwise
      None. The greedy decodes are always stored on self.hard_logits_ori / _tsf.
    """
    # Dropout is a training-time regulariser; keep inference deterministic.
    self._active_dropout = self.dropout if targets is not None else 0.0

    # encoder
    enc_inp = self.embeddings(enc_inp)
    dec_inp = self.embeddings(dec_inp)
    _, z = self.gru_enc(enc_inp, initial_state=hidden)
    # keep only the content part of the final state; the style part comes from the labels
    z = z[:, self.dim_y:] #(batch_size, dim_z)
    h_ori = tf.concat([self.dl2(labels), z], 1) #(batch_size, dim_h) content + original style
    h_tsf = tf.concat([self.dl2(1 - labels), z], 1) #(batch_size, dim_h) content + new style

    # decoder (teacher forced)
    g_out, _ = self.gru_dec(dec_inp, initial_state=h_ori) #(batch_size, dec_len, dim_h)
    teach_h = tf.concat([tf.expand_dims(h_ori, 1), g_out], 1) #(batch_size, dec_len+1, dim_h)
    g_logits = self.dl1(self._drop(g_out)) #(batch_size, dec_len, vocab_size)

    go = dec_inp[:, :1, :] #start_token (batch_size, 1, dim_emb)
    soft_h_ori, soft_logits_ori = self.decode(h_ori, go, self.softmax_word())
    soft_h_tsf, soft_logits_tsf = self.decode(h_tsf, go, self.softmax_word())
    self.hard_h_ori, self.hard_logits_ori = self.decode(h_ori, go, self.argmax_word())
    self.hard_h_tsf, self.hard_logits_tsf = self.decode(h_tsf, go, self.argmax_word())

    if targets is not None:
      # Flatten logits to (batch*dec_len, vocab) so their rank matches the flattened labels.
      loss_rec = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=tf.reshape(targets, [-1]),
          logits=tf.reshape(g_logits, [-1, self.vocab_size]))
      loss_rec *= tf.reshape(weights, [-1])
      self.loss_rec = tf.reduce_sum(loss_rec) / self.batch_size

      # discriminators D1 and D2
      # D1 judges style-0 states: real = teacher-forced first half, fake = transferred second half
      # D2 judges style-1 states: real = teacher-forced second half, fake = transferred first half
      # NOTE(review): teach_h has one more time step than soft_h_tsf; after the default
      # pool-size-2 max pooling both flatten to the same width only when max_len is even
      # (it is 26 with the current settings) — confirm before changing sequence lengths.
      half = int(self.batch_size / 2)
      zeros, ones = labels[:half], labels[half:]
      d0 = self.cnn_f(teach_h[:half], self.cnn1, self.dl3)
      g0 = self.cnn_f(soft_h_tsf[half:], self.cnn1, self.dl3)
      d1 = self.cnn_f(teach_h[half:], self.cnn2, self.dl4)
      g1 = self.cnn_f(soft_h_tsf[:half], self.cnn2, self.dl4)

      self.loss_d0 = tf.reduce_mean(
          tf.nn.sigmoid_cross_entropy_with_logits(labels=ones, logits=d0)) + tf.reduce_mean(
          tf.nn.sigmoid_cross_entropy_with_logits(labels=zeros, logits=g0))
      loss_g0 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=ones, logits=g0))
      self.loss_d1 = tf.reduce_mean(
          tf.nn.sigmoid_cross_entropy_with_logits(labels=ones, logits=d1)) + tf.reduce_mean(
          tf.nn.sigmoid_cross_entropy_with_logits(labels=zeros, logits=g1))
      loss_g1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=ones, logits=g1))

      self.loss_adv = loss_g0 + loss_g1
      self.loss = self.loss_rec + self.rho * self.loss_adv
      return self.loss_rec, self.loss_adv, self.loss, self.loss_d0, self.loss_d1

  def initialize_hidden_state(self, labels):
    """Initial encoder state: style code from the labels followed by a zero content part."""
    return tf.concat([self.dl2(labels), tf.zeros([self.batch_size, self.dim_z], dtype=tf.float32)], 1) #(batch_size, dim_h)

  def softmax_word(self):
    """Return a loop function that feeds back the softmax-weighted average embedding."""
    def loop_func(output):
        # output: (batch_size, dim_h)
        logits = self.dl1(self._drop(output))
        prob = tf.nn.softmax(logits / self.gamma)
        # Use the embedding variable (not a NumPy copy) so the soft path stays differentiable.
        inp = tf.matmul(prob, self.embeddings.embeddings)
        return inp, logits
    return loop_func

  def argmax_word(self):
    """Return a loop function that feeds back the embedding of the most likely word."""
    def loop_func(output):
        # output: (batch_size, dim_h)
        logits = self.dl1(self._drop(output))
        word = tf.argmax(logits, axis=1)
        inp = self.embeddings(word)
        return inp, logits
    return loop_func

  def decode(self, hidden, inp, loop_func):
    """Unroll the decoder GRU for max_len steps, feeding back loop_func's output.

    Args:
      hidden: initial decoder state, (batch_size, dim_h).
      inp: embedded start token, (batch_size, 1, dim_emb).
      loop_func: maps a (batch_size, dim_h) state to (next input embedding, logits).

    Returns:
      (states, logits) of shapes (batch_size, max_len, dim_h) and
      (batch_size, max_len, vocab_size); states[:, t] is the state BEFORE step t.
    """
    h_seq, logits_seq = [], []
    for _ in range(self.max_len):
      h_seq.append(tf.expand_dims(hidden, 1)) #[(batch_size, 1, dim_h)]
      output, hidden = self.gru_dec(inp, initial_state=hidden)
      # gru_dec returns a length-1 sequence; remove the time axis before projecting.
      inp, logits = loop_func(tf.squeeze(output, axis=1))
      inp = tf.expand_dims(inp, 1) #(batch_size, 1, dim_emb)
      logits_seq.append(tf.expand_dims(logits, 1))
    return tf.concat(h_seq, 1), tf.concat(logits_seq, 1)

  def cnn_f(self, inp, cnn, dl):
    """Discriminator: parallel conv + LeakyReLU + max-pool branches, concatenated, then dl."""
    outputs = []
    for conv, activation in cnn:
      op = self.max_pool(activation(conv(inp)))
      outputs.append(tf.keras.layers.Flatten()(op))
    outputs = self._drop(tf.concat(outputs, 1))
    return dl(outputs)

In [0]:
batch_size = 128  # sentences per style; the model is built for batch_size*2 (both styles)
optimizer = tf.keras.optimizers.Adam(0.0005, 0.5, 0.999)  # learning rate, beta_1, beta_2
# NOTE(review): `embeddings` is passed in, but the model class creates its own trainable
# Embedding layer (dim_emb=256) and does not read this argument — confirm that is intended.
m = model(max_vocab_size, dim_emb=256, dim_y=256, dim_z=512, n_layers=1, max_seq_len=max_input_seq_length+1, filter_sizes='1,2,3',
          n_filters=128, dropout=0.5, embeddings=embeddings, batch_size=batch_size*2)

In [0]:
n_epochs = 20
# The GAN consumes load_gan_data's 8-tuple (encoder input, decoder input, decoder output for
# each style, plus the two weight matrices). `data_seq` holds load_data's 4-tuples, so the
# GAN data is loaded explicitly here instead of being read from `data_seq`.
gan_train = load_gan_data('train')
n_train = len(gan_train[0])
n_iters = int(n_train / batch_size) - 1
# Every batch is [style-0 half, style-1 half]; the labels are therefore constant.
labels = tf.reshape([0.0] * batch_size + [1.0] * batch_size, [-1, 1])
for epoch in range(1, n_epochs + 1):
  shuffled_idx = np.arange(n_train)
  np.random.shuffle(shuffled_idx)
  for it in range(n_iters):
    batch_idx = shuffled_idx[it*batch_size:(it+1)*batch_size]
    batch_enc_ip = tf.concat([gan_train[0][batch_idx], gan_train[3][batch_idx]], 0)
    batch_dec_ip = tf.concat([gan_train[1][batch_idx], gan_train[4][batch_idx]], 0)
    batch_dec_op = tf.concat([gan_train[2][batch_idx], gan_train[5][batch_idx]], 0)
    batch_weights = tf.concat([gan_train[6][batch_idx], gan_train[7][batch_idx]], 0)

    # persistent=True: the same forward pass is differentiated for three variable groups.
    with tf.GradientTape(persistent=True) as tape:
      # Built inside the tape so the style projection (dl2) also receives gradients
      # through the encoder's initial state.
      hidden = m.initialize_hidden_state(labels)
      loss_rec, loss_adv, loss, loss_d0, loss_d1 = m(batch_enc_ip, batch_dec_ip, batch_dec_op, labels, batch_weights, hidden)
    vars_ = m.gru_enc.trainable_variables + m.gru_dec.trainable_variables + m.dl1.trainable_variables + m.dl2.trainable_variables + m.embeddings.trainable_variables
    d0_vars = m.cnn1.trainable_variables + m.dl3.trainable_variables
    d1_vars = m.cnn2.trainable_variables + m.dl4.trainable_variables
    grad_d0 = tape.gradient(loss_d0, d0_vars)
    optimizer.apply_gradients(zip(grad_d0, d0_vars))
    grad_d1 = tape.gradient(loss_d1, d1_vars)
    optimizer.apply_gradients(zip(grad_d1, d1_vars))
    # Only add the adversarial term once both discriminators are reasonably good;
    # otherwise train the auto-encoder on reconstruction alone.
    if loss_d0.numpy() < 1.2 and loss_d1.numpy() < 1.2:
      grad = tape.gradient(loss, vars_)
      optimizer.apply_gradients(zip(grad, vars_))
    else:
      grad_rec = tape.gradient(loss_rec, vars_)
      optimizer.apply_gradients(zip(grad_rec, vars_))
    del tape  # release the resources held by the persistent tape
    if it % 50 == 0:
      print('Epoch:', epoch, 'iter:', it, 'loss_rec:', loss_rec.numpy(), 'loss_adv:',
            loss_adv.numpy(), 'loss:', loss.numpy(), 'loss_d0:', loss_d0.numpy(), 'loss_d1:', loss_d1.numpy())

In [0]:
def strip_eos(sents):
  """Return a new list in which each sentence is cut just before its first sent_end token.

  Sentences without a sent_end token are passed through unchanged; the input list is not
  modified.
  """
  stripped = []
  for sent in sents:
    if sent_end in sent:
      sent = sent[:sent.index(sent_end)]
    stripped.append(sent)
  return stripped

def greedy_decoding(enc_ip, dec_ip, weights):
  """Greedily decode a batch with the GAN model `m`, in both the original and flipped style.

  Args:
    enc_ip: encoder token ids, (batch, enc_len).
    dec_ip: decoder input ids; only the first (start) token of each row is fed to the decoder.
    weights: loss weights; forwarded to the model, unused when no targets are given.

  Returns:
    (ori, tsf): two lists of strings — the reconstruction and the style-transferred
    sentence for each row, each cut at the first end-of-sentence token.
  """
  batch_size = enc_ip.shape[0]
  # Every row is labelled as style 0 here.
  labels = tf.reshape([0.0] * batch_size, [-1, 1])
  hidden = tf.concat([m.dl2(labels), tf.zeros([batch_size, m.dim_z], dtype=tf.float32)], 1)
  # targets=None: the call only populates m.hard_logits_ori / m.hard_logits_tsf.
  m(enc_ip, dec_ip, None, labels, weights, hidden)
  ori = np.argmax(m.hard_logits_ori, axis=2).tolist()
  ori = [[idx_to_word[i] for i in sent] for sent in ori]
  tsf = np.argmax(m.hard_logits_tsf, axis=2).tolist()
  tsf = [[idx_to_word[i] for i in sent] for sent in tsf]
  # strip_eos returns new lists; the results used to be discarded, leaving EOS in place.
  ori = strip_eos(ori)
  tsf = strip_eos(tsf)
  ori = [' '.join(sent) for sent in ori]
  tsf = [' '.join(sent) for sent in tsf]
  return ori, tsf

In [0]:
# The GAN expects load_gan_data's 8-tuple layout, which `data_seq` (built with load_data)
# does not provide, so the test split is loaded explicitly.
gan_test = load_gan_data('test')
n_samples = 10  # sentences taken from each style
batch_enc_ip = tf.concat([gan_test[0][:n_samples], gan_test[3][:n_samples]], 0)
batch_dec_ip = tf.concat([gan_test[1][:n_samples], gan_test[4][:n_samples]], 0)
batch_weights = tf.concat([gan_test[6][:n_samples], gan_test[7][:n_samples]], 0)
ori, tsf = greedy_decoding(batch_enc_ip, batch_dec_ip, batch_weights)
print(ori)
print('-------------------------------------------------------------------------')
print(tsf)

**Pointer Model and Local Attention**

In [0]:
BUFFER_SIZE = len(data_seq['train'][0])  # shuffle buffer covers the whole training set
BATCH_SIZE = 128
steps_per_epoch = len(data_seq['train'][0])//BATCH_SIZE
embedding_dim = 192  # matches the column count of the `embeddings` matrix built above
units = 512  # GRU state size passed to both the encoder and the decoder

In [0]:
# Entries 0 and 1 of the training tuple are used as encoder input and decoder target.
# NOTE(review): this assumes data_seq was built with load_data (sequences wrapped in
# start/end tokens) — confirm data_seq has not been re-bound to load_gan_data output.
input_tensor_train = tf.convert_to_tensor(data_seq['train'][0])
target_tensor_train = tf.convert_to_tensor(data_seq['train'][1])
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder keeps every batch at exactly BATCH_SIZE rows.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [0]:
# Pointer Model
class Encoder(tf.keras.Model):
  """GRU encoder over a frozen, pretrained embedding table."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, embeddings):
    """Build the embedding lookup (initialised from `embeddings`, not trainable) and the GRU."""
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embeddings],
        trainable=False)
    self.gru = tf.keras.layers.GRU(
        self.enc_units,
        dropout=0.25,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='he_normal')

  def call(self, x, hidden):
    """Embed token ids `x` and run the GRU from `hidden`; returns (all outputs, final state)."""
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """All-zero initial state of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

class BahdanauAttention(tf.keras.Model):
  """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Attend over `values` with `query`.

    Returns:
      (context_vector, attention_weights): the attention-weighted sum of `values` over
      axis 1, and the weights themselves (softmax over axis 1, trailing dimension of 1).
    """
    # Insert a time axis so the query broadcasts against every step of `values`.
    query_with_time_axis = tf.expand_dims(query, 1)
    energies = tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis))
    score = self.V(energies)
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)
    return context_vector, attention_weights

class PointerAttention(tf.keras.Model):
  """Dot-product attention over the encoder outputs plus one extra 'sentinel' slot."""

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units, activation='tanh')

  def call(self, h_prev, encoder_vals):
    """Score every encoder position and the sentinel against the projected state.

    Args:
      h_prev: previous decoder state; its static shape supplies batch size and width.
      encoder_vals: encoder outputs, one vector per input position.

    Returns:
      (alpha, sentinel_weights, context): attention over the encoder positions, the
      weight assigned to the sentinel slot, and the alpha-weighted sum of encoder_vals.
      alpha and sentinel_weights come from one softmax, so together they sum to 1.
    """
    static_shape = h_prev.get_shape().as_list()
    batch_size, units = static_shape[0], static_shape[-1]
    # NOTE(review): the sentinel is re-sampled uniformly in [0, 1) on every call rather
    # than being a trained vector — confirm this is intended.
    sentinel = tf.random.uniform(shape=(batch_size, 1, units))
    # Position 0 is the sentinel, positions 1.. are the encoder outputs.
    candidates = tf.concat([sentinel, encoder_vals], axis=1)
    query = tf.expand_dims(self.W1(h_prev), 1) # (batch_size, 1, units)
    scores = tf.reduce_sum(tf.multiply(query, candidates), axis=2) # (batch_size, max_length + 1)
    weights = tf.nn.softmax(scores) # (batch_size, max_length + 1)
    sentinel_weights = weights[:, 0]
    alpha = weights[:, 1:]
    context = tf.reduce_sum(encoder_vals * tf.expand_dims(alpha, 2), 1) # (batch_size, units)
    return alpha, sentinel_weights, context

class Decoder(tf.keras.Model):
  """Single-step GRU decoder that mixes a vocabulary softmax with copy probabilities.

  The output distribution is
    sentinel_weight * softmax(vocabulary logits) + attention mass scattered onto the ids
    of the encoder input tokens,
  so it is a probability distribution, not logits.
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, embeddings):
    """Build the frozen embedding lookup, GRU, output projections and pointer attention."""
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.vocab_size = vocab_size
    self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embeddings], trainable=False)
    self.gru = tf.keras.layers.GRU(self.dec_units, dropout=0.25,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='he_normal')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.fc_context = tf.keras.layers.Dense(vocab_size)

    # pointer model (swap in BahdanauAttention(self.dec_units) for plain local attention)
    self.attention = PointerAttention(self.dec_units)

  def call(self, x, hidden, enc_output, encoder_input):
    """Decode one step.

    Args:
      x: current decoder input ids, (batch_size, 1).
      hidden: previous decoder state; used as the attention query.
      enc_output: encoder outputs, (batch_size, max_length, hidden_size).
      encoder_input: encoder token ids, (batch_size, max_length); tells the pointer which
        vocabulary id each attended position refers to.

    Returns:
      (probabilities, state, alpha, sentinel_weights): the mixed output distribution
      (batch_size, vocab_size), the new GRU state, the attention over encoder positions,
      and the sentinel weight with shape (batch_size, 1).
    """
    encoder_length = tf.shape(enc_output)[1]
    batch_size = tf.shape(enc_output)[0]
    alpha, sentinel_weights, context_vector = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    # NOTE(review): the GRU is called without initial_state, so `hidden` only influences
    # this step through the attention context — confirm this is intended.
    output, state = self.gru(x)
    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))
    # logits shape == (batch_size, vocab); the projection is computed once (a redundant
    # first self.fc(output) call, whose result was overwritten, has been removed)
    logits = self.fc(output) + self.fc_context(context_vector)

    # Generation part: vocabulary softmax scaled by the sentinel weight.
    pred_softmax = tf.nn.softmax(logits) #(batch_size, vocab_size)
    sentinel_weights = tf.expand_dims(sentinel_weights, 1) #(batch_size, 1)
    pred = pred_softmax * sentinel_weights #(batch_size, vocab_size)

    # Copy part: place alpha[b, j] at (b, encoder_input[b, j]) of a sparse
    # (batch_size, vocab_size) tensor and add it to the generation part.
    r = tf.expand_dims(tf.range(batch_size), 1) #(batch_size, 1)
    r = tf.tile(r, [1, encoder_length]) # (batch_size, encoder_length)
    r_concat = tf.stack([r, encoder_input], axis=2) # (batch_size, encoder_length, 2)
    r_concat_flattened = tf.reshape(r_concat, [-1, 2]) # (batch_size * encoder_length, 2)
    r_concat_flattened = tf.cast(r_concat_flattened, tf.int64)
    alpha_flattened = tf.reshape(alpha, [-1]) # (batch_size * encoder_length)
    # Built with TF ops (not np.array) because batch_size is a tensor.
    dense_shape = tf.cast(tf.stack([batch_size, self.vocab_size]), tf.int64)
    pointer_probs = tf.SparseTensor(indices=r_concat_flattened, values=alpha_flattened, dense_shape=dense_shape)
    x = tf.sparse.add(pred, pointer_probs)

    return x, state, alpha, sentinel_weights

In [0]:
# Both models are initialised from the same pretrained embedding matrix and use the same state size.
# NOTE(review): the name `encoder` is re-bound to a Keras tensor in the Seq2Seq section
# further down, so running cells out of order will clobber this model.
encoder = Encoder(max_vocab_size, embedding_dim, units, BATCH_SIZE, embeddings)
decoder = Decoder(max_vocab_size, embedding_dim, units, BATCH_SIZE, embeddings)

In [0]:
# Re-binds `optimizer`, replacing the Adam instance created for the GAN section.
optimizer = tf.keras.optimizers.Adam()
# from_logits=False because the decoder returns a probability mixture; reduction='none'
# keeps one loss value per example so padded targets can be masked in loss_function.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

def loss_function(real, pred):
  """Mean cross-entropy over a batch, with the loss of padded targets (id 0) set to zero.

  The mean is taken over all entries, masked ones included.
  """
  per_example = loss_object(real, pred)
  is_real_token = tf.math.not_equal(real, 0)
  per_example = per_example * tf.cast(is_real_token, dtype=per_example.dtype)
  return tf.reduce_mean(per_example)

In [0]:
# @tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced optimisation step of the pointer encoder/decoder.

  Args:
    inp: encoder token ids, (BATCH_SIZE, input_len).
    targ: target token ids, (BATCH_SIZE, target_len); position 0 is skipped.
    enc_hidden: initial encoder state.

  Returns:
    The summed step losses divided by the target length (for reporting); the gradient
    step itself uses the un-normalised sum.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([word_to_idx[sent_start]] * BATCH_SIZE, 1)
    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # The pointer Decoder needs the encoder token ids as a fourth argument and returns
      # four values; the previous 3-argument call did not match its signature.
      predictions, dec_hidden, _, _ = decoder(dec_input, dec_hidden, enc_output, inp)
      loss += loss_function(targ[:, t], predictions)
      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)
  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))
  return batch_loss

In [0]:
EPOCHS = 25
for epoch in range(EPOCHS):
  start = time.time()
  # The same all-zero encoder state is passed to every batch of the epoch.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    # Progress report every 100 batches.
    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  # Average of the per-batch losses over the epoch.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [0]:
# Save one checkpoint of the optimizer and both models under the data directory.
checkpoint_dir = data_dir_shakespeare + 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
checkpoint.save(file_prefix = checkpoint_prefix)

In [0]:
def evaluate(text):
  """Greedily translate one tokenised sentence with the pointer encoder/decoder.

  Args:
    text: list of (lower-cased) tokens.

  Returns:
    (result, text, attention_plot): the decoded sentence as a space-terminated string of
    tokens, the unchanged input, and a (max_output_seq_length, max_input_seq_length)
    array holding the attention over the input for every decoded step.
  """
  attention_plot = np.zeros((max_output_seq_length, max_input_seq_length))
  unk_idx = word_to_idx[unknown_word]
  tmp = [word_to_idx[sent_start]]
  tmp += [word_to_idx.get(token, unk_idx) for token in text]
  tmp.append(word_to_idx[sent_end])
  tmp = pad_sequences([tmp], max_input_seq_length, padding='pre', truncating='post')
  tmp = tf.convert_to_tensor(tmp)
  result = ''
  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(tmp, hidden)
  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([word_to_idx[sent_start]], 0)
  never_emitted = (unknown_word, sent_start, pad_word, sent_end)

  for t in range(max_output_seq_length):
    # The pointer Decoder takes the encoder token ids as a fourth argument and returns
    # four values; the previous 3-argument call did not match its signature.
    predictions, dec_hidden, attention_weights, _ = decoder(dec_input, dec_hidden, enc_out, tmp)
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()
    sampled_word = idx_to_word[predicted_id]
    if predicted_id == unk_idx:
      # Unknown word predicted: copy the most attended input token instead.
      most_attended = np.argmax(attention_weights)
      sampled_word = idx_to_word[int(tmp[0][most_attended])]
    if sampled_word not in never_emitted:
      result += sampled_word + ' '
    if sampled_word == sent_end:
      return result, text, attention_plot
    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, text, attention_plot

In [0]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  """Show an attention matrix as a heat map with token labels on both axes.

  Args:
    attention: 2-D array, one row per predicted token and one column per input token.
    sentence: list of input tokens (x-axis labels).
    predicted_sentence: list of predicted tokens (y-axis labels).
  """
  # NOTE(review): `plt` is not imported in the cells reviewed here — confirm that
  # matplotlib.pyplot is imported as plt before this function is called.
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis')
  fontdict = {'fontsize': 14}
  # The leading '' shifts the labels by one tick position.
  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
  plt.show()

In [0]:
def translate(sentence):
  """Decode a tokenised sentence with evaluate() and return (decoded text, attention matrix)."""
  outputs = evaluate(sentence)
  return outputs[0], outputs[2]
#   print('Input: %s' % (sentence))
#   print('Predicted translation: {}'.format(result))
#   attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
#   plot_attention(attention_plot, sentence.split(' '), result.split(' '))

In [0]:
 # Post-processing
def post_process(input_file, output_file):
  """Translate every line of `input_file` with `translate` and save the results.

  Args:
    input_file: path of a text file with one space-separated sentence per line.
    output_file: path the translations are written to, one per line.

  Returns:
    List of translated strings, in input order.
  """
  # Read inside a context manager so the input handle is always closed.
  with open(input_file) as source:
    input_lines = [row.strip().lower().split(' ') for row in source]
  sequences_output = []
  with open(output_file, 'w') as f:
    for i, line in enumerate(input_lines):
      output, _ = translate(line)
      sequences_output.append(output)
      # NOTE(review): the reference printed here is always taken from
      # data_seq['test'], so the side-by-side print is only meaningful when
      # input_file is the test split -- confirm before reusing on other files.
      print(line, '\n', output, '\n', idxseq_to_vocabseq(data_seq['test'][1][i]), '\n-----------------------\n')
      f.write(output + '\n')
  return sequences_output

In [0]:
# Translate the modern-English test split and keep the outputs for inspection.
test_out = post_process(
    input_file=data_dir_shakespeare + 'test.modern.nltktok',
    output_file=data_dir_shakespeare + 'gmodel1.test.out',
)

In [0]:
# test BLEU
! perl /gdrive/My\ Drive/Deep\ Learning\ Project/Shakespearizing-Modern-English/code/main/multi-bleu.perl -lc /gdrive/My\ Drive/Deep\ Learning\ Project/Shakespearizing-Modern-English/data/test.original.nltktok < /gdrive/My\ Drive/Deep\ Learning\ Project/Shakespearizing-Modern-English/data/gmodel1.test.out

**Seq2Seq With Global Attention**

In [0]:
# Baseline seq2seq model with dot-product attention, trained with teacher forcing
# (the decoder receives the target-side input sequence as a second model input).

# ---- Encoder: frozen pretrained embeddings -> bidirectional LSTM ----
encoder_ip = Input(shape=(None, ))
embed = Embedding(
    input_dim=max_vocab_size,
    output_dim=192,
    weights=[embeddings],
    trainable=False)
encoder_embed = embed(encoder_ip)
encoder_lstm = Bidirectional(
    LSTM(units=512,
         return_sequences=True,
         return_state=True,
         dropout=0.25,
         recurrent_dropout=0.25,
         kernel_initializer='he_normal',
         recurrent_activation='tanh'))
encoder, state_h_f, state_c_f, state_h_b, state_c_b = encoder_lstm(encoder_embed)
# Forward and backward final states are concatenated (2 x 512) so they can
# initialise the 1024-unit decoder LSTM below.
encoder_states = [
    Concatenate()([state_h_f, state_h_b]),
    Concatenate()([state_c_f, state_c_b]),
]
encoder = BatchNormalization()(encoder)

# ---- Decoder: shares the embedding layer, starts from the encoder states ----
decoder_ip = Input(shape=(None, ))
decoder_embed = embed(decoder_ip)
decoder_lstm = LSTM(
    units=1024,
    return_sequences=True,
    return_state=True,
    dropout=0.25,
    recurrent_dropout=0.25,
    kernel_initializer='he_normal',
    recurrent_activation='tanh')
decoder, _, _ = decoder_lstm(decoder_embed, initial_state=encoder_states)

# ---- Attention: dot-product scores, softmax, weighted sum of encoder outputs ----
attention_dot = Dot(axes=[2, 2])([decoder, encoder])
attention_softmax = Activation('softmax')
attention = attention_softmax(attention_dot)
context = Dot(axes=[2, 1])([attention, encoder])
decoder_combined_context = Concatenate()([context, decoder])
decoder_combined_context = BatchNormalization()(decoder_combined_context)

# ---- Output: per-step distribution over the vocabulary ----
output_layer = Dense(max_vocab_size, activation='softmax')
output = output_layer(decoder_combined_context)
model_final = tf.keras.Model(inputs=[encoder_ip, decoder_ip], outputs=output)
# model_final.summary()

In [0]:
# Inference models: an encoder model plus a decoder model that takes the
# encoder outputs and the previous LSTM states as explicit inputs. Both reuse
# the layer objects of model_final, so they share its weights.
encoder_model = tf.keras.Model(encoder_ip, [encoder] + encoder_states)
decoder_state_input_h = Input(shape=(1024,))
decoder_state_input_c = Input(shape=(1024,))
encoder_op = Input(shape=(None, 1024))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_op, d_s_h, d_s_c = decoder_lstm(embed(decoder_ip), initial_state=decoder_states_inputs)
decoder_states = [d_s_h, d_s_c]
inf_attention = attention_softmax(dot([decoder_op, encoder_op], axes=[2, 2]))
inf_context = dot([inf_attention, encoder_op], axes=[2,1])
decoder_op = concatenate([inf_context, decoder_op])
# Reuse the BatchNormalization layer that model_final trains on the combined
# context. A brand-new BatchNormalization() here would keep its initial
# statistics and scale/shift forever, so the shared output layer would see
# differently-normalised inputs at inference time than during training.
# model_final.layers is ordered from inputs towards the output, so the last
# BatchNormalization in it is the one feeding output_layer (the earlier one
# normalises the encoder outputs).
trained_batch_norms = [layer for layer in model_final.layers if isinstance(layer, BatchNormalization)]
decoder_batch_norm = trained_batch_norms[-1]
decoder_op = output_layer(decoder_batch_norm(decoder_op))
decoder_model = tf.keras.Model(
    [decoder_ip] + [encoder_op] + decoder_states_inputs,
    [decoder_op] + decoder_states + [inf_attention])
# decoder_model.summary()

In [0]:
# Train the baseline: inputs are (encoder ids, decoder input ids), targets are
# the integer ids in data[...][2], hence the sparse cross-entropy loss.
model_final.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model_final.fit(
    x=[data['train'][0], data['train'][1]],
    y=data['train'][2],
    validation_data=([data['valid'][0], data['valid'][1]], data['valid'][2]),
    epochs=10,
    batch_size=32)

In [0]:
def decode_sequence(input_seq):
  """Greedily decode one encoded sentence with `encoder_model` / `decoder_model`.

  When the decoder predicts the unknown-word token, the input word with the
  highest attention weight at that step is copied into the output instead.

  Args:
    input_seq: integer array of shape (1, input_length) holding vocabulary ids
      (it is indexed as `input_seq[0][position]`).

  Returns:
    (sentence, att_all): the generated words joined by single spaces, and the
    list of attention arrays returned by `decoder_model`, one per decoder call.
  """
  encoder_op, e_h, e_c = encoder_model.predict(input_seq)
  states_value = [e_h, e_c]
  target_seq = np.zeros((1, 1), dtype=np.int32)
  target_seq[0][0] = word_to_idx[sent_start]
  decoded_sentence = []
  att_all = []
  cnt = 0  # decoder steps taken so far; at most one word is emitted per step
  while True:
    output_tokens, h, c, att = decoder_model.predict([target_seq] + [encoder_op] + states_value)
    att_all.append(att)
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_word = idx_to_word[sampled_token_index]
    if cnt >= max_output_seq_length or sampled_word == sent_end or sampled_word == pad_word:
      break
    cnt += 1
    if sampled_word == unknown_word:
      # Copy step: take the most-attended input position. It is kept in its own
      # variable because it is a position in input_seq, not a vocabulary id,
      # and must not be fed back to the decoder as a token.
      attended_pos = np.argmax(att, axis=2)[0][0]
      sampled_word = idx_to_word[input_seq[0][attended_pos]]
      if sampled_word == pad_word or sampled_word == sent_end:
        break
      if sampled_word != sent_start and sampled_word != unknown_word and sampled_word != "\"de":
        decoded_sentence.append(sampled_word)
    elif sampled_word != "\"de":
      decoded_sentence.append(sampled_word)
    # Feed the decoder's own prediction (a vocabulary id) back as the next
    # input, matching what `evaluate` does for the other model in this notebook.
    target_seq = np.zeros((1, 1), dtype=np.int32)
    target_seq[0][0] = sampled_token_index
    states_value = [h, c]
  return ' '.join(decoded_sentence), att_all

In [0]:
 # Post-processing
def post_process(input_file, output_file):
  """Encode each line of `input_file`, decode it with `decode_sequence`, save the output.

  Every line is lower-cased, split on single spaces, wrapped in the
  sentence-start / sentence-end ids, mapped to vocabulary ids (unknown tokens
  become the unknown-word id) and padded/truncated to `max_input_seq_length`.

  Args:
    input_file: path of a text file with one sentence per line.
    output_file: path the decoded sentences are written to, one per line.

  Returns:
    List of decoded sentences, in input order.
  """
  # Read inside a context manager so the input handle is always closed.
  with open(input_file) as source:
    input_lines = [row.strip().lower().split(' ') for row in source]
  unk_id = word_to_idx[unknown_word]
  sequences_input = [
      [word_to_idx[sent_start]]
      + [word_to_idx.get(token, unk_id) for token in text]
      + [word_to_idx[sent_end]]
      for text in input_lines
  ]
  sequences_input = pad_sequences(sequences_input, max_input_seq_length, padding='pre', truncating='post')
  print('Printing sample sequences:')
  # Show at most two samples; slicing avoids an IndexError on files with fewer
  # than two lines (e.g. a very short poem).
  for sample in sequences_input[:2]:
    print(sample, ':', idxseq_to_vocabseq(sample))
  sequences_output = []
  with open(output_file, 'w') as f:
    for line in sequences_input:
      output, _ = decode_sequence(np.array([line], dtype=np.int32))
      # print(output)
      sequences_output.append(output)
      f.write(output + '\n')
  return sequences_output

In [0]:
# Decode the modern-English test split with the baseline model and keep the outputs.
test_out = post_process(
    input_file=data_dir_shakespeare + 'test.modern.nltktok',
    output_file=data_dir_shakespeare + 'm1.test.out',
)

In [0]:
# test set BLEU
! perl /gdrive/My\ Drive/Deep\ Learning\ Project/Shakespearizing-Modern-English/code/main/multi-bleu.perl -lc /gdrive/My\ Drive/Deep\ Learning\ Project/Shakespearizing-Modern-English/data/test.original.nltktok < /gdrive/My\ Drive/Deep\ Learning\ Project/Shakespearizing-Modern-English/data/m1.test.out

**GAN Model #2**

In [0]:
# GAN model -- generator: an attention seq2seq network with a unidirectional
# 256-unit encoder and a 256-unit decoder.

# ---- Encoder ----
encoder_ip = Input(shape=(None, ))
embed = Embedding(
    input_dim=max_vocab_size,
    output_dim=192,
    weights=[embeddings],
    trainable=False)
encoder_embed = embed(encoder_ip)
encoder_lstm = LSTM(
    units=256,
    return_sequences=True,
    return_state=True,
    dropout=0.2,
    recurrent_dropout=0.2,
    kernel_initializer='he_normal',
    recurrent_activation='tanh')
encoder, state_h, state_c = encoder_lstm(encoder_embed)
encoder_states = [state_h, state_c]
encoder = BatchNormalization()(encoder)

# ---- Decoder: shares the embedding layer, starts from the encoder states ----
decoder_ip = Input(shape=(None, ))
decoder_embed = embed(decoder_ip)
decoder_lstm = LSTM(
    units=256,
    return_sequences=True,
    return_state=True,
    dropout=0.2,
    recurrent_dropout=0.2,
    kernel_initializer='he_normal',
    recurrent_activation='tanh')
decoder, _, _ = decoder_lstm(decoder_embed, initial_state=encoder_states)

# ---- Attention: dot-product scores, softmax, weighted sum of encoder outputs ----
attention_dot = Dot(axes=[2, 2])([decoder, encoder])
attention_softmax = Activation('softmax')
attention = attention_softmax(attention_dot)
context = Dot(axes=[2, 1])([attention, encoder])
decoder_combined_context = Concatenate()([context, decoder])
decoder_combined_context = BatchNormalization()(decoder_combined_context)

# ---- Output: per-step distribution over the vocabulary ----
output_layer = Dense(max_vocab_size, activation='softmax')
output = output_layer(decoder_combined_context)
# takes english sentence as input, gives shakespeare translation
generator = tf.keras.Model(inputs=[encoder_ip, decoder_ip], outputs=output)
generator.summary()

In [0]:
# Discriminator: Dense(512) applied to each step's vocabulary-sized vector,
# flattened, then a single sigmoid unit.
# NOTE(review): the fixed length 24 is presumably max_output_seq_length - 1 --
# confirm against the preprocessing code.
discriminator_ip = Input(shape=(24, max_vocab_size))
discriminator_dense = Dense(units=512, activation='relu')
discriminator_output_layer = Dense(units=1, activation='sigmoid')
discriminator_output = discriminator_output_layer(
    Flatten()(
        discriminator_dense(discriminator_ip)))
discriminator = tf.keras.Model(inputs=discriminator_ip, outputs=discriminator_output)
# takes english/shakespeare sentence, gives 0/1
discriminator.summary()
discriminator.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])

In [0]:
# Combined model: generator output piped through the discriminator's layers.
# Only the generator is meant to be trained through this model.
# discriminator.trainable = False
gen_op = generator([encoder_ip, decoder_ip])
valid = discriminator_output_layer(
    Flatten()(
        discriminator_dense(gen_op)))
combined = tf.keras.Model(inputs=[encoder_ip, decoder_ip], outputs=valid)
# Freeze layers 3-5 of `combined` (presumably the Dense / Flatten / Dense
# discriminator stack that follows the two inputs and the generator).
for frozen_idx in (3, 4, 5):
  combined.layers[frozen_idx].trainable = False
opt = tf.keras.optimizers.Adam(lr=0.02)
# takes english sentence, generates shakespeare sentence, determines validity
combined.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
combined.summary()

In [0]:
# train GAN model
# Each epoch walks once over a fresh permutation of the training set. For every
# mini-batch the discriminator is trained on half a batch of real (one-hot)
# target sequences and half a batch of generator outputs, then the generator is
# trained through `combined` with "real" labels.
n_epochs = 20
batch_size = 32
n_train = len(data['train'][0])
steps_per_epoch = (n_train + batch_size - 1) // batch_size
n_iters = n_epochs * steps_per_epoch
it = 0
for epoch in range(n_epochs):
  shuffled_idx = np.random.permutation(n_train)
  for step in range(steps_per_epoch):
    # Slice by the step within the epoch. Slicing by the global iteration
    # counter runs past the end of the index array after the first epoch and
    # yields empty batches.
    batch_idx = shuffled_idx[step*batch_size:(step+1)*batch_size]
    half_batch = len(batch_idx) // 2
    if half_batch == 0:
      # A trailing single-sample batch leaves nothing for the discriminator.
      continue
    batch_enc_ip = tf.convert_to_tensor(data['train'][0][batch_idx])
    batch_dec_ip = tf.convert_to_tensor(data['train'][1][batch_idx])
    # train discriminator
    # gives loss and acc
    random_hb = np.random.randint(0, n_train, half_batch)
    batch_random_enc_ip = tf.convert_to_tensor(data['train'][0][random_hb])
    batch_random_dec_ip = tf.convert_to_tensor(data['train'][1][random_hb])
    batch_random_dec_op = tf.keras.utils.to_categorical(data['train'][2][random_hb], num_classes=max_vocab_size)
    gen_out = generator([batch_random_enc_ip, batch_random_dec_ip])
    # The shape is passed as one tuple; the second positional argument of
    # tf.ones / tf.zeros is the dtype, not a second dimension.
    real_labels = tf.ones((half_batch, 1))
    fake_labels = tf.zeros((half_batch, 1))
    loss_real = discriminator.train_on_batch(batch_random_dec_op, real_labels)
    loss_fake = discriminator.train_on_batch(gen_out, fake_labels)
    loss = (loss_real[0] + loss_fake[0]) / 2
    # train generator
    # `combined` was built from the very same discriminator layer objects, so it
    # already sees their latest weights; no weight copying is needed here.
    g_loss = combined.train_on_batch([batch_enc_ip, batch_dec_ip], tf.ones((len(batch_idx), 1)))
    # if it % 10 == 0:
    print('iter:', it, ', loss:', loss, 'g_loss:', g_loss[0], ', loss_real:', loss_real[0], ', loss_fake:', loss_fake[0])
    it += 1

**Seq2Seq Global Attention and Pointer Model with Original Repo Code**

Switch the runtime to Python 2 and TensorFlow 1.x before running this section.

In [0]:
# Work from the original repo's code directory; the following cells call
# mt_main.py by relative path.
os.chdir(main_dir + shakespeare_dir + '/code/main')
# Create the scratch directory only when it is missing so the cell can be
# re-run. (os.makedirs(..., exist_ok=True) is avoided because this section
# runs under Python 2.)
if not os.path.isdir('tmp'):
  os.mkdir('tmp')

In [0]:
# Run mt_main.py (from the current working directory) with its `preprocessing` argument.
! python mt_main.py preprocessing

In [0]:
# pointer model: mt_main.py in `train` mode with arguments `5` and `pointer_model`
# (`5` is interpreted by mt_main.py; presumably an epoch count -- TODO confirm)
! python mt_main.py train 5 pointer_model

In [0]:
# seq2seq with global attention: mt_main.py in `train` mode with arguments `5` and `seq2seq`
# (`5` is interpreted by mt_main.py; presumably an epoch count -- TODO confirm)
! python mt_main.py train 5 seq2seq

In [0]:
# inference
# use saved model: mt_main.py in `test` mode, given a checkpoint path under tmp/
# and the `greedy` argument (presumably the decoding strategy -- TODO confirm)
! python mt_main.py test tmp/global_att_256/seq2seq5.ckpt greedy

**Final Output Pipeline**

In [0]:
# Restore the three saved Keras models (training, encoder and decoder) from the
# models/ folder of the Shakespeare data directory.
model_final, encoder_model, decoder_model = [
    tf.keras.models.load_model(data_dir_shakespeare + 'models/' + saved_name)
    for saved_name in ('model_final_2', 'encoder_model_2', 'decoder_model_2')
]

In [0]:
# Move into img2poem's source folder; the next cells use paths relative to it.
os.chdir(''.join([main_dir, img2poem_dir, 'code/src']))

In [0]:
# generate output poem for an image
# (test.py is run from the current working directory; it presumably writes
# test.jpg_poem.txt, which the following cells read -- TODO confirm)
!python test.py "../images/test.jpg"

In [0]:
# Show the input image inline; as the cell's last expression, the Image object
# is rendered by the notebook.
from IPython.display import Image
Image(filename='../images/test.jpg')

In [0]:
# display English poem
!cat test.jpg_poem.txt

In [0]:
# text style transfer on English poem
# The working directory is already img2poem's code/src (set by the os.chdir cell
# of this section), which is where the poem file shown by `!cat` lives. Plain
# file names therefore resolve correctly whether main_dir is relative or
# absolute; prefixing a relative main_dir again would point at a nested,
# non-existent folder.
sample_out = post_process('test.jpg_poem.txt', 'test.poem.out')

In [0]:
# display Shakespearean prose: the list of decoded lines returned by post_process
sample_out