In [1]:
!pip install segtok



In [10]:
# Getting access to the dataset and the Python files on Google Drive.
# You will probably have to give permission.

from google.colab import drive
drive.mount('/content/gdrive')
root_folder = "/content/gdrive/My Drive/Project-Lion/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
from segtok import tokenizer
from collections import Counter
import tensorflow as tf
import numpy as np
import json
import os, sys
import pandas as pd
from sklearn.model_selection import train_test_split
import string

sys.path.insert(0, "/content/gdrive/My Drive/Project-Lion/") # This enables us to import Python libraries in the folder.


root_folder = "/content/gdrive/My Drive/Project-Lion/"

Preprocessing

In [0]:
def numerize_sequence(tokenized):
    return [w2i.get(w, unkI) for w in tokenized]
def pad_sequence(numerized, pad_index, to_length):
    pad = numerized[:to_length]
    padded = pad + [pad_index] * (to_length - len(pad))
    mask = [w != pad_index for w in padded]
    return padded, mask

In [0]:
dataset_df = pd.read_csv(root_folder + 'data/kaggle/shortjokes.csv')

In [0]:
def clean_data(lst, in_place=True, keep_punc=""):
    if in_place:
        new_lst = lst
    else:
        new_lst = lst.copy()
        
    for i in range(len(new_lst)):
        joke = lst[i]
        joke = joke.replace("\r", " ").replace("\n", " ").replace("/", " or ")
        joke = joke.translate(str.maketrans("", "", string.punctuation.replace(keep_punc, ""))) # Remove punctuation
        joke = joke.lower() # Lowercase
        joke = ''.join(char for char in joke if not char.isdigit()) # Remove numbers
        joke = joke.strip() # Remove leading and ending whitespace
        new_lst[i] = joke
    return new_lst

In [0]:
dataset_df = dataset_df.loc[dataset_df["Joke"] != ""]
#dataset_df = dataset_df[["Joke"]]
#dataset_df = clean_data(dataset_df["Joke"].tolist(), keep_punc="?")
dataset = dataset_df.to_dict('records')

In [22]:
# You do not need to run this
# This is to show you how the dataset was created
# You should read to understand, so you can preprocess text
# In the same way, in the evaluation section

input_length = 0

print(type(dataset))
for a in dataset:
    tokenized_joke = tokenizer.word_tokenizer(a['Joke'].lower())
    input_length = max(input_length, len(tokenized_joke))
    a['tokenized'] = tokenized_joke
print(input_length)

<class 'list'>
96


In [23]:
# You do not need to run this
# This is to show you how the dataset was created
# You should read to understand, so you can preprocess text
# In the same way, in the evaluation section

word_counts = Counter()
for a in dataset:
    word_counts.update(a['tokenized'])

print(word_counts.most_common(30))

[('a', 170819), ('the', 154009), ('.', 137049), ('?', 108416), ('i', 97247), (',', 91799), ('to', 87971), ('you', 79574), ('"', 78575), ('and', 59807), ('in', 51550), ('of', 49315), ('my', 48889), ('what', 45030), (':', 44404), ('is', 43020), ('it', 40819), ('do', 36100), ('...', 34400), ('me', 31167), ('!', 28826), ('on', 28000), ('was', 27008), ('for', 26258), ('that', 25307), ('with', 24181), ('have', 23592), ('why', 22874), ('he', 22618), ('your', 20628)]


In [24]:
len(word_counts)

89800

In [27]:
# You do not need to run this
# This is to show you how the dataset was created
# You should read to understand, so you can preprocess text
# In the same way, in the evaluation section

# Creating the vocab
vocab_size = len(word_counts)
special_words = ["<START>", "UNK", "PAD"]
vocabulary = special_words + [w for w, c in word_counts.most_common(vocab_size-len(special_words))]
w2i = {w: i for i, w in enumerate(vocabulary)}

# Numerizing and padding
unkI, padI, startI = w2i['UNK'], w2i['PAD'], w2i['<START>']

for a in dataset:
    a['numerized'] = numerize_sequence(a['tokenized']) # Change words to IDs
    a['numerized'], a['mask'] = pad_sequence(a['numerized'], padI, input_length) # Append appropriate PAD tokens
    
# Compute fraction of words that are UNK:
word_counters = Counter([w for a in dataset for w in a['Joke'] if w != padI])

print("Fraction of UNK words:", float(word_counters[unkI]) / sum(word_counters.values()))

Fraction of UNK words: 0.0


In [28]:
vocab_size = len(vocabulary)
input_length = len(dataset[0]['numerized']) # The length of the first element in the dataset, they are all of the same length

d_train, d_valid = train_test_split(dataset, test_size=0.01, random_state=42)

print("Number of training samples:",len(d_train))
print("Number of validation samples:",len(d_valid))

Number of training samples: 229340
Number of validation samples: 2317


In [29]:
def numerized2text(numerized):
    """ Converts an integer sequence in the vocabulary into a string corresponding to the title.
    
        Arguments:
            numerized: List[int]  -- The list of vocabulary indices corresponding to the string
        Returns:
            title: str -- The string corresponding to the numerized input, without padding.
    """
    #####
    # BEGIN YOUR CODE HERE 
    # Recover each word from the vocabulary in the list of indices in numerized, using the vocabulary variable
    # Hint: Use the string.join() function to reconstruct a single string
    #####
    
    words = [vocabulary[int(num)] for num in numerized]
    converted_string = ' '.join(words)
    
    #####
    # END YOUR CODE HERE
    #####
    
    return converted_string

entry = d_train[1001]
print("Reversing the numerized: "+numerized2text(entry['numerized']))
print("From the `title` entry: "+ entry['Joke'])

Reversing the numerized: an old man shuffled slowly into an ice cream parlor. he ordered a banana split. the waitress asked , crushed nuts ? no , he said. arthritis . PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD PAD
From the `title` entry: An old man shuffled slowly into an ice cream parlor. He ordered a banana split. The waitress asked, Crushed nuts? No, he said. Arthritis.


In [0]:
def build_batch(dataset, batch_size):
    """ Builds a batch of source and target elements from the dataset.
    
        Arguments:
            dataset: List[db_element] -- A list of dataset elements
            batch_size: int -- The size of the batch that should be created
        Returns:
            batch_input: List[List[int]] -- List of source sequences
            batch_target: List[List[int]] -- List of target sequences
            batch_target_mask: List[List[int]] -- List of target batch masks
    """
    
    #####
    # BEGIN YOUR CODE HERE 
    #####
    
    
    # We get a list of indices we will choose from the dataset.
    # The randint function uses a uniform distribution, giving equal probably to any entry
    # for each batch
    indices = list(np.random.randint(0, len(dataset), size=batch_size))
    
    # Recover what the entries for the batch are
    batch = [dataset[i] for i in indices]
    
    # Get the raw numerized for this input, each element of the dataset has a 'numerized' key
    batch_numerized = np.asarray([db_element["numerized"] for db_element in batch])

    # Create an array of start_index that will be concatenated at position 1 for the input.
    # Should be of shape (batch_size, 1)
    start_tokens = np.zeros((batch_size, 1))

    # Concatenate the start_tokens with the rest of the input
    # The np.concatenate function should be useful
    # The output should now be [batch_size, sequence_length+1]
    batch_input = np.concatenate((start_tokens, batch_numerized), axis=1)

    # Remove the last word from each element in the batch
    # To restore the [batch_size, sequence_length] size
    batch_input = batch_input[:, :-1]
    
    # The target should be the un-shifted numerized input
    batch_target = batch_numerized

    # The target-mask is a 0 or 1 filter to note which tokens are
    # padding or not, to give the loss, so the model doesn't get rewarded for
    # predicting PAD tokens.
    batch_target_mask = np.array([a['mask'] for a in batch])
    
    #####
    # END YOUR CODE HERE 
    #####
        
    return batch_input, batch_target, batch_target_mask

In [0]:
# Using a basic RNN/LSTM for Language modeling
class LanguageModel():
    def __init__(self, input_length, vocab_size, rnn_size, learning_rate=1e-4):
        
        # Create the placeholders for the inputs:
        # All three placeholders should be of size [None, input_length]
        # Where None represents a variable batch_size, and input_length is the
        # maximal length of a sequence of words, after being padded.
        self.input_num = tf.placeholder(tf.int32, shape=[None, input_length])
        
        # my code
        self.targets = tf.placeholder(tf.int32, shape=[None, input_length])
        self.targets_mask = tf.placeholder(tf.bool, shape=[None, input_length])

        # Create an embedding variable of shape [vocab_size, rnn_size]
        # That will map each word in our vocab into a vector of rnn_size size.
        embedding = tf.Variable(tf.random_uniform([vocab_size, rnn_size], -1.0, 1.0))
        # Use the tensorflow embedding_lookup function
        # To embed the input_num, using the embedding variable we've created
        input_emb = tf.nn.embedding_lookup(embedding, self.input_num)

        # Create a an RNN or LSTM cell of rnn_size size.
        # Look into the tf.nn.rnn_cell documentation
        # You can optionally use Tensorflow Add-ons such as the MultiRNNCell, or the DropoutWrapper
        lm_cell = tf.nn.rnn_cell.LSTMCell(rnn_size)
        
        # add another cell for MultiRNNCell
        # hidden_units = [rnn_size] * 10 
        # hidden_cells = [tf.nn.rnn_cell.LSTMCell(num_units=n) for n in hidden_units]
        # hidden_cells.append(lm_cell)
        # stacked_rnn_cell = MultiRNNCell(hidden_cells)
        
        # Use the dynamic_rnn function of Tensorflow to run the embedded inputs
        # using the lm_cell you've created, and obtain the outputs of the RNN cell.
        # You have created a cell, which represents a single block (column) of the RNN.
        # dynamic_rnn will "copy" the cell for each element in your sequence, runs the input you provide through the cell,
        # and returns the outputs and the states of the cell.
        outputs, states = tf.nn.dynamic_rnn(lm_cell, input_emb, dtype=tf.float32)

        # Use a dense layer to project the outputs of the RNN cell into the size of the
        # vocabulary (vocab_size).
        # output_logits should be of shape [None,input_length,vocab_size]
        # You can look at the tf.layers.dense function
        self.output_logits = tf.layers.dense(inputs=outputs, units=vocab_size)

        # Setup the loss: using the sparse_softmax_cross_entropy.
        # The logits are the output_logits we've computed.
        # The targets are the gold labels we are trying to match
        # Don't forget to use the targets_mask we have, so your loss is not off,
        # And your model doesn't get rewarded for predicting PAD tokens
        # You might have to cast the masks into float32. Look at the tf.cast function.
        weights = tf.cast(self.targets_mask, tf.float32)
        self.loss = tf.losses.sparse_softmax_cross_entropy(labels=self.targets,logits=self.output_logits, weights=weights)

        # Setup an optimizer (SGD, RMSProp, Adam), you can find a list under tf.train.*
        # And provide it with a start learning rate.

        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, name='Adam')     

        # We create a train_op that requires the optimizer we've created to minimize the
        # loss we've defined.
        # look for the optimizer.minimize function, define what should be miniminzed.
        # You can provide it with the provide an optional global_step parameter as well that keeps of how many
        # Optimizations steps have been run.
        
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)
        self.saver = tf.train.Saver()

In [32]:
# We can create our model,
# with parameters of our choosing.

tf.reset_default_graph() # This is so that when you debug, you reset the graph each time you run this, in essence, cleaning the board
model = LanguageModel(input_length=input_length, vocab_size=vocab_size, rnn_size=256*2, learning_rate=1e-4)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


In [39]:
# DO NOT RUN THIS BLOCK IF YOU DON"T WANT TO TRAIN THE NETWORK

# Skeleton code
# You have to write your own training process to obtain a
# Good performing model on the validation set, and save it.

experiment = root_folder+"models/final_language_model"

with tf.Session() as sess:
    # Here is how you initialize weights of the model according to their
    # Initialization parameters.
    sess.run(tf.global_variables_initializer())
    
    # Here is how you restore the weights previously saved
    model.saver.restore(sess, experiment)
    
    epoch = 3
    batch_size = 64
    num_iter = epoch * len(d_train) // batch_size
    print("Total number of iterations is: " + str(num_iter))
    
    eval_input, eval_target, eval_target_mask = build_batch(d_valid, 50)
    feed = {model.input_num: eval_input, model.targets: eval_target, model.targets_mask: eval_target_mask}
    eval_loss = sess.run(model.loss, feed_dict=feed)
    print("Evaluation set loss: ", eval_loss)
        
    for i in range(num_iter):
        # Here is how you obtain a batch:
        batch_input, batch_target, batch_target_mask = build_batch(d_train, batch_size)
        # Map the values to each tensor in a `feed_dict`
        feed = {model.input_num: batch_input, model.targets: batch_target, model.targets_mask: batch_target_mask}

        # Obtain a single value of the loss for that batch.
        # !IMPORTANT! Don't forget to include the train_op to when using a batch from the training dataset
        # (d_train)
        # !MORE IMPORTANT! Don't use the train_op if you evaluate the loss on the validation set,
        # Otherwise, your network will overfit on your validation dataset.

        step, train_loss, _ = sess.run([model.global_step, model.loss, model.train_op], feed_dict=feed)

        if i % 20 == 0:
          print("step: " + str(i))
        if i % 100 == 0:
            print("step: " + str(i))
            print("train_loss: " + str(train_loss))
            eval_input, eval_target, eval_target_mask = build_batch(d_valid, 50)
            feed = {model.input_num: eval_input, model.targets: eval_target, model.targets_mask: eval_target_mask}
            eval_loss_steps = sess.run(model.loss, feed_dict=feed)
            # if (eval_loss_steps < eval_loss):
            #  print("eval_loss decreases!")
            eval_loss = eval_loss_steps
            print("Evaluation set loss: ", eval_loss)
            print("saving model weights ....")
            model.saver.save(sess, experiment)
            print("saving model weights completed ....")
            # else:
            #  print("eval_loss didn't decrease.")
            #  print("half learning rate, make another model, reset to previous checkpoint")
            #  # learning_rate /= 2
            #  # model = LanguageModel(input_length=input_length, vocab_size=vocab_size, rnn_size=256*4, learning_rate=learning_rate)
            #  model.saver.restore(sess, experiment)
    
    # Here is how you save the model weights
    model.saver.save(sess, experiment)
    
    # Here is how you restore the weights previously saved
    model.saver.restore(sess, experiment)

INFO:tensorflow:Restoring parameters from /content/gdrive/My Drive/Project-Lion/models/final_language_model
Total number of iterations is: 10750
Evaluation set loss:  5.1346316
step: 0
step: 0
train_loss: 4.844065
Evaluation set loss:  4.7709723
saving model weights ....
saving model weights completed ....
step: 20
step: 40
step: 60
step: 80
step: 100
step: 100
train_loss: 4.7974916
Evaluation set loss:  4.790634
saving model weights ....
saving model weights completed ....
step: 120
step: 140
step: 160
step: 180
step: 200
step: 200
train_loss: 4.541914
Evaluation set loss:  5.088226
saving model weights ....
saving model weights completed ....
step: 220
step: 240
step: 260
step: 280
step: 300
step: 300
train_loss: 4.7916493
Evaluation set loss:  5.2291493
saving model weights ....
saving model weights completed ....
step: 320
step: 340
step: 360
step: 380
step: 400
step: 400
train_loss: 4.50705
Evaluation set loss:  4.878726
saving model weights ....
saving model weights completed ...

In [0]:
df = pd.read_csv(root_folder + 'data/kaggle/shortjokes.csv')
df = df.loc[df["Joke"] != ""]
data = clean_data(df["Joke"].tolist(), keep_punc="?")
qa_all = [[s.strip() for s in list(filter(None, joke.split("?")))] for joke in data if len(list(filter(None, joke.split("?")))) == 2]
min_qlen, max_qlen = 3, 15
min_alen, max_alen = 3, 15
qa = [(q, a) for q, a in qa_all if len(q.split()) >= min_qlen and len(q.split()) <= max_qlen and len(a.split()) >= min_alen and len(a.split()) <= max_alen]
questions = [q for q, a in qa]

In [71]:
questions_500 = questions[:500]
questions_500[:5]

['why cant barbie get pregnant',
 'why was the musician arrested',
 'did you hear about the guy who blew his entire lottery winnings on a limousine',
 'what do you do if a bird shits on your car',
 'what should you do before criticizing pacman']

In [72]:
model_file = root_folder+"models/final_language_model"

# Saving generated text to a file
with tf.Session() as sess:
    model.saver.restore(sess, model_file)

    # Here are some headline starters.
    # They're all about tech companies, because
    # That is what is in our dataset
    headline_starters = ["do you know", "how", "knock knock", "why did the chicken", "universe", "college"]
    
    with open(root_folder + "LM_output_jokes.txt", "w") as text_file:
      for headline_starter in questions_500:
          #print("===================")
          #print("Generating Jokes starting with: "+headline_starter)
          print(headline_starter, file = text_file)

          # Tokenize and numerize the headline. Put the numerized headline
          # beginning in `current_build`
          tokenized = tokenizer.word_tokenizer(headline_starter)
          current_build = [startI] + numerize_sequence(tokenized)
          
          i = 0
          while len(current_build) < input_length:
              # Pad the current_build into a input_length vector.
              # We do this so that it can be processed by our LanguageModel class
              current_padded = current_build[:input_length] + [padI] * (input_length - len(current_build))
              current_padded = np.array([current_padded])

              # Obtain the logits for the current padded sequence
              # This involves obtaining the output_logits from our model,
              # and not the loss like we have done so far
              feed = {model.input_num: current_padded}
              logits = sess.run(model.output_logits, feed_dict=feed)

              # Obtain the row of logits that interest us, the logits for the last non-pad
              # inputs
              last_index = len(current_build) - 1
              last_logits = logits[0][last_index]

              # Find the highest scoring word in the last_logits
              # array. The np.argmax function should be useful.
              # Append this word to our current build
              current_build.append(np.argmax(last_logits))
              if vocabulary[int(np.argmax(last_logits))] != '.':
                i += 1

          # Go from the current_build of word_indices
          # To the headline (string) produced. This should involve
          # the vocabulary, and a string merger.
          produced_sentence = numerized2text(current_build)
          
          question = headline_starter.split(" ")
          produced_sentence = produced_sentence.split(" ")
          text_generated = produced_sentence[(len(question) + 2):len(question) + 1 + i]
          sentence = ' '.join(text_generated)
          print(sentence, file = text_file)
          print("", file = text_file)

INFO:tensorflow:Restoring parameters from /content/gdrive/My Drive/Project-Lion/models/final_language_model


In [44]:
model_file = root_folder+"models/final_language_model"

with tf.Session() as sess:
    model.saver.restore(sess, model_file)

    # Here are some headline starters.
    # They're all about tech companies, because
    # That is what is in our dataset
    headline_starters = ["do you know", "how", "knock knock", "why did the chicken", "universe", "college"]
    
    for headline_starter in questions_500:
        print("===================")
        print("Generating Jokes starting with: "+headline_starter)
        print(headline_starter)

        # Tokenize and numerize the headline. Put the numerized headline
        # beginning in `current_build`
        tokenized = tokenizer.word_tokenizer(headline_starter)
        current_build = [startI] + numerize_sequence(tokenized)

        while len(current_build) < input_length:
            # Pad the current_build into a input_length vector.
            # We do this so that it can be processed by our LanguageModel class
            current_padded = current_build[:input_length] + [padI] * (input_length - len(current_build))
            current_padded = np.array([current_padded])

            # Obtain the logits for the current padded sequence
            # This involves obtaining the output_logits from our model,
            # and not the loss like we have done so far
            feed = {model.input_num: current_padded}
            logits = sess.run(model.output_logits, feed_dict=feed)

            # Obtain the row of logits that interest us, the logits for the last non-pad
            # inputs
            last_index = len(current_build) - 1
            last_logits = logits[0][last_index]

            # Find the highest scoring word in the last_logits
            # array. The np.argmax function should be useful.
            # Append this word to our current build
            current_build.append(np.argmax(last_logits))

        # Go from the current_build of word_indices
        # To the headline (string) produced. This should involve
        # the vocabulary, and a string merger.
        produced_sentence = numerized2text(current_build)
        #print("\n", file = text_file)
        
        print(produced_sentence)
        print("")

INFO:tensorflow:Restoring parameters from /content/gdrive/My Drive/Project-Lion/models/final_language_model
Generating Jokes starting with: why cant barbie get pregnant
why cant barbie get pregnant
<START> why cant barbie get pregnant ? because they can't even . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

Generating Jokes starting with: why was the musician arrested
why was the musician arrested
<START> why was the musician arrested ? he was a fungi . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

Generating Jokes starting with: did you hear about the guy who blew his entire lottery winnings on a limousine
did you hear about the guy who blew his entire lottery winnings on a limousine
<START> did you hear about the guy who blew his entire lotte