<a href="https://colab.research.google.com/github/royRLL/Zen-NLG-using-Tensorflow-and-Nucleus-Sampling/blob/main/ProjectCharacterPredictionHP60.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import keras
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import string
import os
from google.colab import drive,files
import random
import nltk
import statistics

# Mounting Google Drive (currently disabled).
# NOTE(review): later cells read and write checkpoints under './drive/My Drive/zenModel',
# which presumably requires Drive to be mounted -- confirm before running them.
# drive.mount('/content/drive')

# Upload the dataset through the Colab file picker. The rest of the notebook opens it
# by the name 'zenData.txt'.
uploaded = files.upload()


Saving zenData.txt to zenData.txt


The code currently works on text from the following Zen books:


*   Chödrön Pema- Start Where You Are: A Guide to Compassionate Living
*   Gunaratana Bhante Henepola- Beyond Mindfulness in Plain English
*   Hanh Thich Nhat- The Miracle of Mindfulness
*   Katagiri Dainin- Each Moment Is the Universe: Zen and the Way of Being Time
*   Dalai Lama- The Art of Happiness
*   Suzuki Shunryu- Zen Mind, Beginner's Mind








This code block includes the methods needed for the training and nucleus sampling.




In [None]:
# Turns one chunk into an (input, target) pair for next-character prediction.
def splitInput(chunk):
    """Return ``(chunk[:-1], chunk[1:])``.

    The first element is the chunk without its last item (the model input);
    the second is the chunk without its first item, i.e. the same sequence
    shifted one step ahead (the prediction target).
    """
    return chunk[:-1], chunk[1:]

  

#top-p (nucleus) sampling, based on 'THE CURIOUS CASE OF NEURAL TEXT DEGENERATION' (Holtzman et al 2020)
def topPSampling(p,logits):
  """Mask every logit outside the top-p nucleus so it can no longer be sampled.

  Along the last axis the logits are sorted in descending order and turned
  into probabilities. A token stays in the nucleus while the total probability
  of the tokens ranked before it is still below ``p``, so the highest-ranked
  token is always kept whenever ``p > 0``.

  Params:
    p - cumulative probability mass to keep; expected to lie in (0, 1].
    logits - unnormalized scores with the vocabulary on the last axis
             (any rank; the callers in this file pass [timesteps, vocab]).

  Returns: a tensor shaped like ``logits`` where every entry smaller than the
  smallest nucleus logit is replaced by -1e10 and all others are unchanged.
  """
  # Every reduction uses the last axis so sort, softmax, cumsum and min agree
  # for inputs of any rank.
  sortedLogits = tf.sort(logits, axis=-1, direction='DESCENDING')
  sortedProbs = tf.nn.softmax(sortedLogits, axis=-1)

  # Exclusive cumulative sum: probability mass of all strictly higher-ranked tokens.
  massBefore = tf.cumsum(sortedProbs, axis=-1, exclusive=True)

  # Positions outside the nucleus are filled with the largest representable
  # value, so they can never become the minimum regardless of logit magnitude.
  outsideFill = tf.ones_like(sortedLogits) * sortedLogits.dtype.max
  nucleusLogits = tf.where(massBefore < p, sortedLogits, outsideFill)
  minLogits = tf.reduce_min(nucleusLogits, axis=-1, keepdims=True)

  return tf.where(
      logits < minLogits,
      tf.ones_like(logits, dtype=logits.dtype) * -1e10,
      logits,
  )

# Assembles the network: embedding -> 3 x (stateful LSTM -> dropout) -> dense
def buildModel(vocabLen, embeddingSize, lstmSize, batchSize):
  """Create the (uncompiled) character-level sequence model.

  Params:
    vocabLen - number of distinct characters; size of the embedding input and
               of the final dense layer's output.
    embeddingSize - width of the embedding vectors.
    lstmSize - number of units in each of the three LSTM layers.
    batchSize - fixed batch dimension; the sequence dimension is left open.

  Returns: a tf.keras.Sequential model.
  """
  layers = tf.keras.layers
  network = tf.keras.Sequential()
  network.add(layers.Embedding(vocabLen, embeddingSize,
                               batch_input_shape=[batchSize, None]))
  # Three identical recurrent stages, each followed by dropout with rate 0.6.
  for _ in range(3):
    network.add(layers.LSTM(lstmSize, return_sequences=True, stateful=True))
    network.add(layers.Dropout(0.6))
  network.add(layers.Dense(vocabLen))
  return network


# Training objective handed to model.compile
def loss(labels, logits):
  """Sparse categorical cross-entropy of ``labels`` against raw ``logits``.

  ``from_logits=True`` because the model's final dense layer has no softmax.
  """
  crossEntropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossEntropy(labels, logits, from_logits=True)

#placeholder function, used if the model is already trained.
def placeholdeR():
    """Rebuild the character lookups from 'zenData.txt' without training.

    Returns: ``(char2idx, idx2char)`` where ``char2idx`` maps each distinct
    character of the file to its position in the sorted vocabulary, and
    ``idx2char`` is a numpy array holding that sorted vocabulary.
    """
    # The context manager closes the file handle instead of leaking it.
    with open('zenData.txt','rb') as dataFile:
        text = dataFile.read().decode('utf-8')
    vocab = sorted(set(text))
    char2idx = {u:i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)
    return char2idx,idx2char

In [None]:
def createAndTrain(embeddingSize,lstmSize,epochs=50,seqLength=100,batchSize=64,
                   bufferSize=10000,checkpointDir='./drive/My Drive/zenModel'):
    """Train the character-level model on 'zenData.txt', checkpointing every epoch.

    Params:
      embeddingSize - width of the embedding layer.
      lstmSize - number of units in each LSTM layer.
      epochs - number of training epochs.
      seqLength - characters per training input; the text is cut into chunks
                  of seqLength + 1 so input and target can be shifted by one.
      batchSize - sequences per batch (the model is built for this fixed size).
      bufferSize - shuffle buffer size, so the whole dataset is not held in memory.
      checkpointDir - directory receiving the 'ckpt_{epoch}' weight files; the
                      default is the directory generate_text loads from.

    Returns: ``(char2idx, idx2char)``, the character-to-id dict and the
    id-to-character numpy array built from the dataset.

    Raises: ValueError if the text is too short to fill a single batch.
    """
    with open('zenData.txt','rb') as dataFile:
        text = dataFile.read().decode('utf-8')

    #vocabulary based on the dataset
    vocab = sorted(set(text))
    vocabLen = len(vocab)

    # Vectorizing the text
    char2idx = {u:i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)
    textInt = np.array([char2idx[c] for c in text])

    # Both batch() calls below drop remainders, so at least batchSize full
    # chunks are needed or the training dataset would be empty.
    if len(textInt) // (seqLength + 1) < batchSize:
        raise ValueError(
            'zenData.txt is too short: need at least %d characters for one batch'
            % ((seqLength + 1) * batchSize))

    char_dataset = tf.data.Dataset.from_tensor_slices(textInt)
    sequences = char_dataset.batch(seqLength + 1, drop_remainder=True)
    dataset = sequences.map(splitInput)
    dataset = dataset.shuffle(bufferSize).batch(batchSize, drop_remainder=True)

    model = buildModel(vocabLen=vocabLen, embeddingSize=embeddingSize,
        lstmSize=lstmSize,
        batchSize=batchSize)
    model.summary()

    model.compile(optimizer='adam',loss=loss)

    # Name of the checkpoint files; Keras fills in {epoch}
    checkpoint_prefix = os.path.join(checkpointDir, "ckpt_{epoch}")

    #callback that saves only the weights under the names above
    checkpointCallback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True)

    model.fit(dataset, epochs=epochs, callbacks=[checkpointCallback])

    return char2idx,idx2char



# method to generate the text; builds the model and loads the latest checkpoint first.
def generate_text(prompt,genLen,temp,char2idx,idx2char,topP=0.8,embeddingSize=256,
                  lstmSize=1024,checkpointDir='./drive/My Drive/zenModel'):
  """Generate ``genLen`` characters that continue ``prompt``.

  Params:
    prompt - non-empty string to start from; every character must be a key of char2idx.
    genLen - number of characters to generate.
    temp - temperature for the alternative (commented-out) sampling path below;
           ignored while nucleus sampling is active.
    char2idx - characters to ids, from either placeholdeR or createAndTrain.
    idx2char - ids back to characters, from the same source; its length is
               used as the vocabulary size of the model.
    topP - probability mass kept by nucleus sampling.
    embeddingSize, lstmSize - architecture sizes; must match the trained weights.
    checkpointDir - directory searched for the latest checkpoint.

  Returns: ``(model, text)`` where ``text`` is the prompt followed by the
  generated characters; the model can be reused with generate_textWithModel.

  Raises: ValueError for an empty prompt, KeyError for a prompt character
  missing from char2idx, FileNotFoundError if no checkpoint is found.
  """
  if not prompt:
    raise ValueError('prompt must contain at least one character')

  checkpointPath = tf.train.latest_checkpoint(checkpointDir)
  if checkpointPath is None:
    raise FileNotFoundError('no checkpoint found in ' + repr(checkpointDir))

  # The vocabulary size comes from idx2char instead of a hard-coded constant,
  # so it follows the dataset the lookups were built from.
  model = buildModel(len(idx2char), embeddingSize, lstmSize, batchSize=1)

  #loading the latest weights, and building the model for batch size 1
  model.load_weights(checkpointPath)
  model.build(tf.TensorShape([1, None]))

  # vectorizing the prompt
  promptVector = tf.expand_dims([char2idx[s] for s in prompt], 0)

  #list to store all the text generated
  generatedText = []

  model.reset_states()

  # actual text generation: the whole prompt goes in first, then one sampled
  # character at a time (the LSTM layers are stateful and keep the context).
  for _ in range(genLen):
      predictions = tf.squeeze(model(promptVector), 0)

      #Commented out section here does the sampling with temperature, uncomment this and comment top-p sampling to switch.
      # predictions = predictions / temp
      # actualPrediction = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      #top-p sampling; only the prediction for the last timestep is used
      predictionsNew = topPSampling(topP,predictions)
      actualPrediction = tf.random.categorical(predictionsNew, num_samples=1)[-1,0].numpy()

      #passing the prediction back to the model
      promptVector = tf.expand_dims([actualPrediction], 0)

      generatedText.append(idx2char[actualPrediction])

  return model, (prompt + ''.join(generatedText))


# Same generation loop as generate_text, but it takes an already built model so it doesn't need to be rebuilt for every sample.
def generate_textWithModel(prompt,genLen,temp,char2idx,idx2char,model,topP=0.95):
  """Generate ``genLen`` characters that continue ``prompt`` using ``model``.

  Params:
    prompt - non-empty string to start from; every character must be a key of char2idx.
    genLen - number of characters to generate.
    temp - temperature for the alternative (commented-out) sampling path below;
           ignored while nucleus sampling is active.
    char2idx - characters to ids, from either placeholdeR or createAndTrain.
    idx2char - ids back to characters, from the same source.
    model - model to sample from; assumed to be a stateful batch-size-1 model
            such as the one generate_text returns.
    topP - probability mass kept by nucleus sampling.

  Returns: the prompt followed by the generated characters.

  Raises: ValueError for an empty prompt, KeyError for a prompt character
  missing from char2idx.
  """
  if not prompt:
    raise ValueError('prompt must contain at least one character')

  # vectorizing the prompt
  promptVector = tf.expand_dims([char2idx[s] for s in prompt], 0)

  #list to store all the text generated
  generatedText = []

  # forget whatever the previous sample left in the recurrent state
  model.reset_states()

  # actual text generation: the whole prompt goes in first, then one sampled
  # character at a time.
  for _ in range(genLen):
      predictions = tf.squeeze(model(promptVector), 0)

      #Commented out section here does the sampling with temperature, uncomment this and comment top-p sampling to switch.
      # predictions = predictions / temp
      # actualPrediction = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      #top-p sampling; only the prediction for the last timestep is used
      predictionsNew = topPSampling(topP,predictions)
      actualPrediction = tf.random.categorical(predictionsNew, num_samples=1)[-1,0].numpy()

      #passing the prediction back to the model
      promptVector = tf.expand_dims([actualPrediction], 0)

      generatedText.append(idx2char[actualPrediction])

  return (prompt + ''.join(generatedText))



Calculate the self-BLEU score of the generated text:

We generate 100 samples: the first from a fixed prompt, and each later one from a prompt of two consecutive words drawn at random from the dataset. These samples are used as the references.

We then generate another 100 samples in the same fashion and compute a BLEU score for each of them against the references.

Lastly, we take the mean of these scores to get a single self-BLEU score.

In [None]:
def calculateSelfBleu(char2idx,idx2char,genLen,numSamples=100):
  """Estimate the self-BLEU of the model's generations.

  ``numSamples`` generated texts serve as references: the first comes from a
  fixed prompt, the others from prompts of two consecutive words drawn at
  random from 'zenData.txt'. Another ``numSamples`` texts are then generated
  the same way and each is scored against all references.

  Params:
    char2idx, idx2char - character lookups from placeholdeR or createAndTrain.
    genLen - number of characters generated per sample.
    numSamples - number of references and of scored samples; at least 1.

  Returns: the mean BLEU score, which is also printed.

  Raises: ValueError if the dataset has fewer than three words.
  """
  bleu = nltk.translate.bleu_score
  sm = bleu.SmoothingFunction()

  # Read with the encoding the dataset is decoded with elsewhere in this file.
  with open('zenData.txt', encoding='utf-8') as f:
    flat_list = [word for line in f for word in line.split()]
  upperBound = len(flat_list)
  if upperBound < 3:
    raise ValueError('zenData.txt must contain at least three words to draw prompts from')

  def randomPrompt():
    # two consecutive dataset words, joined by a single space
    ranIndex = random.randint(0,upperBound-3)
    return " ".join(flat_list[ranIndex:ranIndex+2])

  # sentence_bleu expects token lists; passing raw strings would make it
  # compare character n-grams, so every text is split into words.
  model, generatedString = generate_text("going across the forest ",genLen,0.6,char2idx,idx2char)
  generatedRefs = [generatedString.split()]
  # numSamples - 1 further references; no extra sample is generated and thrown away.
  for _ in range(numSamples-1):
    generatedString = generate_textWithModel(randomPrompt(),genLen,0.6,char2idx,idx2char,model)
    print(generatedString)
    generatedRefs.append(generatedString.split())

  bleuScores = []
  for _ in range(numSamples):
    generatedString = generate_textWithModel(randomPrompt(),genLen,0.6,char2idx,idx2char,model)
    currentScore = bleu.sentence_bleu(generatedRefs,generatedString.split(),smoothing_function=sm.method1)
    bleuScores.append(currentScore)

  avBleu = statistics.mean(bleuScores)
  print('average self-bleu for generated Examples: ',avBleu)
  return avBleu
  




  

Calculate the self-bleu score for the dataset itself:

This is done by sampling ten-word samples randomly from the dataset 100 times; those samples are then used as the references.
We then sample another 100 ten-word samples randomly from the dataset and compute a BLEU score for each of the new samples against the references.
Lastly, we average all the BLEU scores to obtain a single self-BLEU score.

In [None]:
def selfBleuDataset(numSamples=100,sampleLen=10):
  """Estimate the self-BLEU of the dataset itself.

  ``numSamples`` windows of ``sampleLen`` consecutive words are drawn at random
  from 'zenData.txt' as references, another ``numSamples`` windows are drawn
  as hypotheses, and each hypothesis is scored against all references.
  ASCII punctuation is stripped from every word first.

  Params:
    numSamples - number of reference windows and of hypothesis windows; at least 1.
    sampleLen - number of consecutive words per window.

  Returns: the mean BLEU score, which is also printed.

  Raises: ValueError if the dataset has fewer than ``sampleLen`` words.
  """
  # Read with the encoding the dataset is decoded with elsewhere in this file.
  with open('zenData.txt', encoding='utf-8') as f:
    flat_list = [word for line in f for word in line.split()]

  # Draw the start index straight from the valid range; a rejection loop
  # would never terminate on a dataset that is too short.
  lastStart = len(flat_list) - sampleLen
  if lastStart < 0:
    raise ValueError('zenData.txt must contain at least %d words' % sampleLen)

  # built once instead of once per word
  stripPunctuation = str.maketrans('', '', string.punctuation)

  def sampleWindow():
    # one window of cleaned words; words that were pure punctuation are dropped
    start = random.randint(0,lastStart)
    cleaned = (word.translate(stripPunctuation) for word in flat_list[start:start+sampleLen])
    return [word for word in cleaned if word]

  # sentence_bleu expects token lists; joined strings would be compared
  # character by character.
  refs = [sampleWindow() for _ in range(numSamples)]
  hypos = [sampleWindow() for _ in range(numSamples)]

  bleu = nltk.translate.bleu_score
  sm = bleu.SmoothingFunction()
  bleulist = [bleu.sentence_bleu(refs,hypo,smoothing_function=sm.method1) for hypo in hypos]

  avBleu = statistics.mean(bleulist)
  print("average self-bleu score for the dataset: ",avBleu)
  return avBleu

In [None]:
# Driver cell.
# Run placeholdeR() if the trained weights already exist: it only rebuilds the
# character <-> id lookups from zenData.txt.
char2idx,idx2char =placeholdeR()

# Alternative to the line above: train from scratch (embedding size 256, LSTM size 1024).
# char2idx,idx2char = createAndTrain(256,1024)
# Print 20 samples of 500 generated characters each for the same prompt.
# NOTE: generate_text builds the model and loads the checkpoint on every call.
for i in range(20):
  model,generatedString = generate_text("sit by the river ",500,0.8,char2idx,idx2char)
  print(generatedString)
  print("------------------------------")
# Other experiments, currently disabled:
# model,generatedString = generate_text("eating a meal ",150,0.35,char2idx,idx2char)
# print(generatedString)
# calculateSelfBleu(char2idx,idx2char,50)
# selfBleuDataset()

sit by the river and have the form of the country, to see the best way to try to take the teaching of existence when the water is supposed to disappear. That’s not why Buddhism is always there. When we start to try to cultivate this realization or others with the self-confidence is not only some stage we should simply become part of the true existence of some definition of the present. There is a big mind. There is no way you can get out of the present. You can understand it. There is no thing to give. There is 
------------------------------
sit by the river as the door that has the pain around us, the rich of his fresh air forgotten. And so on I don’t know, what was what is existed about all the time. There is no need to be independent. We take care of our bodies and minds are more deeply than what we are not thinking or the whole world. But all of them are relieved and reflected in the present. But as soon as a relationship within one’s compassion we are basic patience, no more angr