In [84]:
import tensorflow as tf
import numpy as np
import os
import time
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense



## code adopted from tf, pytorch and karpathy blog

In [58]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [59]:
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [60]:
# Take a look at the first 400 characters in text
print(text[:400])
# The unique characters in the file
vocab = sorted(set(text))
char_to_index = {char: idx for idx, char in enumerate(vocab)}
index_to_char = np.array(vocab)


print(f'{len(vocab)} unique characters')
example_texts = ['NLPUSF', 'Assignment3']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
print(chars)
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)
ids = ids_from_chars(chars)
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
chars = chars_from_ids(ids)
tf.strings.reduce_join(chars, axis=-1).numpy()

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 
65 unique characters
<tf.RaggedTensor [[b'N', b'L', b'P', b'U', b'S', b'F'],
 [b'A', b's', b's', b'i', b'g', b'n', b'm', b'e', b'n', b't', b'3']]>


array([b'NLPUSF', b'Assignment3'], dtype=object)

In [61]:
def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))
seq_length = 140
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))

for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text
split_input_target(list("Tensorflow"))

dataset = sequences.map(split_input_target)
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 256

class NLPUSFModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

F
i
r
s
t
 
C
i
t
i
tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' ' b'a' b'r' b'e' b' ' b'a' b'l' b'l' b' ' b'r' b'e' b's'
 b'o' b'l' b'v' b'e' b'd' b' ' b'r' b'a' b't' b'h' b'e' b'r' b' ' b't'
 b'o' b' ' b'd' b'i' b'e' b' ' b't' b'h' b'a' b'n' b' ' b't' b'o' b' '
 b'f'], shape=(141,), dtype=string)
b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to f'
b"amish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, 

In [62]:
print(vocab_size)
print(embedding_dim)
print(rnn_units)

model = NLPUSFModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

66
256
256


In [63]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

model.summary()

(64, 140, 66) # (batch_size, sequence_length, vocab_size)
Model: "nlpusf_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     multiple                  16896     
                                                                 
 gru_2 (GRU)                 multiple                  394752    
                                                                 
 dense_7 (Dense)             multiple                  16962     
                                                                 
Total params: 428610 (1.64 MB)
Trainable params: 428610 (1.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [64]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)
tf.exp(example_batch_mean_loss).numpy()

Input:
 b'into the bottom of my grief?\nO, sweet my mother, cast me not away!\nDelay this marriage for a month, a week;\nOr, if you do not, make the brid'

Next Char Predictions:
 b"zMDDw$;FTPfDoxNW \nKScDHrKN-;WekcGT\nRmo:ZG& -x IAVgDt!WuXSM!DWFsPEwk!IUiuka3av:lsNo?qCMBuN\n\nXsxYaitZoQeo?AmDlRsvbA?hQaNasTgG!Plaac'm:WL.HdUv&"
Prediction shape:  (64, 140, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.1913886, shape=(), dtype=float32)


66.11453

In [65]:
model.compile(optimizer='adam', loss=loss)
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 20
# Start training your model
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [67]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [68]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [71]:
start = time.time()
states = None
next_char = tf.constant(['Queen:'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Queen:
I am not more of-a bold, we such libedy.

KING HENRY VI:
He sun to say. For fish it at the stomest o'er what?
But when it ourselves in the sons
Bid him too thing your runhage, or, the hand
till their bannain'd aspect of ght in his face is discondemn?
And God-dried a cobbal desertary?
And in the proyal rator hows
So farewells, like a holy--him.

First Watch'd Ox the putice:
Huntly some soul holds will speak' telling her boan!
I could find perform'd it not say, I think
for a joy; but in a numselfron with this woe's gentle joy
Withal? Lord same at King Caminione;
A thing and solmorio.
O thus palt of my cheek witest;
And that it with the duke twell him, good:
You are fancyful morning of her credit in my maid
you know any this charity--

QUEEN MARGARET:
Adward at him back'd name.

BRUTUS:
What, that said these:
Soft! and unfated royal offer him did in such as a cruel.
Adive, at corrural pease, he doth said the bare,
For I had banishment's deservess
And I do, poor sonish'd comferminat

In [114]:
print("Testing LSTM")
class CustomLSTMCell(keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(CustomLSTMCell, self).__init__(**kwargs)
        self.units = units
        self.state_size = [units, units]  # Hidden state size and cell state size

    def build(self, input_shape):
        input_dim = input_shape[-1]
        # One can play with init to stabalize learning, remember what we discussed for MLP
        # As described in class LSTM is simply 4 different RNNs (h_t = sigma(Wx_t + Uh_{t-1} + b)) working in parallel, but connected jointly.
        # Weights for the input gate
        self.W_i = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', name='W_i')
        self.U_i = self.add_weight(shape=(self.units, self.units), initializer='random_normal', name='U_i')
        self.b_i = self.add_weight(shape=(self.units,), initializer='zeros', name='b_i')

        # Weights for the forget gate
        self.W_f = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', name='W_f')
        self.U_f = self.add_weight(shape=(self.units, self.units), initializer='random_normal', name='U_f')
        self.b_f = self.add_weight(shape=(self.units,), initializer='zeros', name='b_f')

        # Weights for the cell state
        self.W_c = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', name='W_c')
        self.U_c = self.add_weight(shape=(self.units, self.units), initializer='random_normal', name='U_c')
        self.b_c = self.add_weight(shape=(self.units,), initializer='zeros', name='b_c')

        # Weights for the output gate
        self.W_o = self.add_weight(shape=(input_dim, self.units), initializer='random_normal', name='W_o')
        self.U_o = self.add_weight(shape=(self.units, self.units), initializer='random_normal', name='U_o')
        self.b_o = self.add_weight(shape=(self.units,), initializer='zeros', name='b_o')

        super(CustomLSTMCell, self).build(input_shape)

    def call(self, inputs, states):
        h_tm1, c_tm1 = states  # Previous state

        # Input gate
        i = tf.sigmoid(tf.matmul(inputs, self.W_i) + tf.matmul(h_tm1, self.U_i) + self.b_i)

        # Forget gate
        f = tf.sigmoid(tf.matmul(inputs, self.W_f) + tf.matmul(h_tm1, self.U_f) + self.b_f)

        # Cell state
        c_ = tf.tanh(tf.matmul(inputs, self.W_c) + tf.matmul(h_tm1, self.U_c) + self.b_c)
        c = f * c_tm1 + i * c_

        # Output gate
        o = tf.sigmoid(tf.matmul(inputs, self.W_o) + tf.matmul(h_tm1, self.U_o) + self.b_o)

        # New hidden state
        h = o * tf.tanh(c)

        return h, [h, c]
    
class OneStepLSTMModel:
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars
        self.temperature = temperature

    def generate_one_step(self, inputs, states=None):
        # Convert input characters to IDs
        input_ids = self.ids_from_chars(inputs)
        input_ids = tf.expand_dims(input_ids, 0)
        
        # Assuming `self.model` is your Sequential model with a CustomLSTMCell,
        # here, do not pass `states` directly to `self.model` call.
        # Instead, if managing states is necessary, it has to be handled internally 
        # within the CustomLSTMCell or by adjusting the model to allow external state management.

        # Get predictions from the model
        predicted_logits = self.model(input_ids)
        predicted_logits = predicted_logits[:, -1, :] / self.temperature
        
        # Sample the output logits to generate token IDs
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)
        
        # Convert from token IDs to characters
        predicted_chars = self.chars_from_ids(predicted_ids)
        
        # Here, states are not modified or returned as they're internally managed by the LSTM layer.
        return predicted_chars, states


    
# Create the Sequential lstmModel
lstmModel = Sequential(name="nlpusf_lstmModel_2")
lstmModel.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
lstmModel.add(tf.keras.layers.RNN(CustomLSTMCell(rnn_units), return_sequences=True))
lstmModel.add(tf.keras.layers.RNN(CustomLSTMCell(rnn_units), return_sequences=True))
lstmModel.add(tf.keras.layers.RNN(CustomLSTMCell(rnn_units), return_sequences=True))
lstmModel.add(Dense(vocab_size))

# Compile the model
lstmModel.compile(optimizer='adam', loss='mean_squared_error')

# Print model summary
lstmModel.summary()

Testing LSTM
Model: "nlpusf_lstmModel_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_21 (Embedding)    (None, None, 256)         16896     
                                                                 
 rnn_24 (RNN)                (None, None, 256)         525312    
                                                                 
 rnn_25 (RNN)                (None, None, 256)         525312    
                                                                 
 rnn_26 (RNN)                (None, None, 256)         525312    
                                                                 
 dense_27 (Dense)            (None, None, 66)          16962     
                                                                 
Total params: 1609794 (6.14 MB)
Trainable params: 1609794 (6.14 MB)
Non-trainable params: 0 (0.00 Byte)
_______________________________________________________________

In [115]:
one_step_lstmModel = OneStepLSTMModel(model=lstmModel, chars_from_ids=chars_from_ids, ids_from_chars=ids_from_chars)

In [116]:
start = time.time()

# Initialize states and the first character
states = None
next_char = tf.constant(['Queen:'])  # Starting string
result = [next_char]

for n in range(1000):  # Generate 1000 characters
    next_char, states = one_step_lstmModel.generate_one_step(next_char, states=states)
    result.append(next_char)

# Join the list of strings into a single string
result = tf.strings.join(result)

end = time.time()

# Print the generated text and the runtime
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Queen:ptAxI
mPkk-craoxMAQ-,a&DLxQH;
,rD:$njVqfeF:REdV,;fI?cixwz?:XHG3qPWkPlIdWdtNGKbkMQ?zCT$AbVRn;viGGRM-WnaPQqwQx$jUlGLIhKELWfuASqCuE!u$tMPvBUCSQr
 nFjz:TEe&D'DQCvLAMcAgiz:gdWuBFT$fiVG bgSgQhIr&J-mnyFFPuM TeFnyXbXWT?Gz?OADIJSA-ufMbNGOpkq&sBAjnGifUo.HMDp!jf'GJ
3P&o,QDNpXEUvLgu 
aSNiAxtGMT hbv
HLpf3hDW&uQ$'W:
ykhtJaHt;;RKn' ja?'
haz'mCBPJ.XXzWjxXDptZ
jCF[UNK]Tm.s[UNK]&nGx lCCEQcEgp
GWnoZDOL.'AGF.BIOQ-caDGfEgU$CyVAzVjwZ 3xI,;:'iDg?pHlM t;ot'G[UNK]Vn-?D-NsvA:[UNK]'DNJV3SCEzYMUSK$l?GpJOksVyGvFRW$ewyYQvuJJtila:Fa.SSLtFDsW.noGGL[UNK]dPTHoxycPpCKfC,!ofQtwf$XZ&eOGqYoWF:
oigeCSqJBqyKoQun
Yg?DFpubBJiCK:H$vHiVo',&Kb&q.UesIjVHUgvW!UlI??DYVMF$MNGlSHB3uFq3kz'AOvkGyo?tX[UNK]3yhImipJENRroIvh:GrH$WDA PZF
3xaqEJWEiYxoW
jaYgvYsDvLvSZI
,E d.
uQFm[UNK]E'sBqzccRr-UGs:jhmNBs.qjYGcmAL$piHft[UNK]u
j3HssI$dl,PzmGLeKPf'P IWqSvoVrKup.GZxUDFnJr?
xO,f$fSVtnkc.,KWipoRz3
[UNK]y-O;XYz$HBPV?G$YjgNOpR[UNK]FJxkCmDbyDf-Th?Ay-j3sfvAqjxi,PmqapK LaN:DrTFkT-eprVEG$aV.IMQNxmt?krgNDJkaNR[UNK]YS&lXGt3[UNK]QS!C[UNK]?npWLz,xOZoTk 

In [None]:
def BEAM_SEARCH(RNN, start_sequence, beam_width):
    # Convert start_sequence to tensor
    input_eval = tf.expand_dims(start_sequence, 0)
    
    # Empty list to store our results
    sequences = [(input_eval, 0)]  # Each element is (sequence, log_prob)

    for _ in range(100):
        all_candidates = []
        for seq, score in sequences:
            predictions = RNN(seq)
            # Apply softmax to convert to probability distribution
            probabilities = tf.nn.softmax(predictions, axis=-1).numpy()
            # Consider the last timestep; get top `beam_width` probabilities and their indices
            last_probs = probabilities[0, -1]
            top_indices = np.argpartition(last_probs, -beam_width)[-beam_width:]
            top_probs = last_probs[top_indices]

            # Generate candidates and update their scores
            for i, prob in zip(top_indices, top_probs):
                next_seq = tf.concat([seq, tf.expand_dims([i], 0)], axis=-1)
                next_score = score + np.log(prob)
                all_candidates.append((next_seq, next_score))

        # Sort all candidates by score and select top `beam_width`
        ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
        sequences = ordered[:beam_width]

    # Convert the sequences back to characters and print the best one
    best_sequence = sequences[0][0].numpy()[0]
    best_sequence_chars = [index_to_char[index] for index in best_sequence]  # Convert indices to characters
    return ''.join(best_sequence_chars)

print("JGC")
start_char = 'Q'
start_sequence = [char_to_index[start_char]]  # Convert to index

beam_width = 5  # Set the beam width for the search

# Now, we would call BEAM_SEARCH with our model and parameters
best_sequence = BEAM_SEARCH(model, start_sequence, beam_width)

print("Best sequence:", best_sequence)

## Simple beam search pseudocode, adapt this to
function BEAM_SEARCH(RNN, start_sequence, beam_width):
    # RNN: the recurrent neural network model for sequence generation (custom LSTM, GRU, custom Elman RNN)
    # start_sequence: the initial part of the sequence (could be just a start symbol or set of symbols)
    # beam_width: the number of sequences to keep at each step -- This is another hyper-parameter, play with it, as discussed in class, beam search will still provide you sub-optimal solution

    Initialize an empty list `candidates` to store current sequence candidates -- One can use other datastructures, to optimize overall workeflow
    Initialize an empty list `final_candidates` to store completed sequences
    
    Add start_sequence to `candidates` with its score (e.g., log likelihood)

    while not all sequences in `candidates` are complete:
        Initialize an empty list `all_expansions` for storing all possible next steps

        for each sequence in `candidates`:
            if the sequence is complete:
                Add it to `final_candidates`
                Continue to the next iteration

            Predict the next step probabilities using RNN given the current sequence
            Select top-k next steps (where k is the beam width) based on probabilities

            for each next step in top-k:
                Create a new sequence by appending the next step to the current sequence
                Calculate the new sequence's score (e.g., update log likelihood)
                Add the new sequence and its score to `all_expansions`

        Sort `all_expansions` by score in descending order
        Keep only the top `beam_width` sequences in `all_expansions`
        Replace `candidates` with `all_expansions`

    Add any remaining sequences in `candidates` to `final_candidates`
    Sort `final_candidates` by score in descending order

    return the top sequence from `final_candidates` (or top-N sequences if desired)

# Usage example
1. RNN = InitializeYourRNNModel()
2. start_sequence = ["<start>"]  # Example start symbol
3. beam_width = 5  # Example beam width
4. best_sequence = BEAM_SEARCH(RNN, start_sequence, beam_width)
5. print("Best sequence:", best_sequence)
Check above step on one-step this will provide you with tricks that will be useful to create beam-search

# Things to do
1. Integrate custom_beamsearch with your models
1. Optimize your hyper-parameter --> Learning rate, hidden_size, layers, optimizer, epochs, batch_size
2. Divide dataset into train, validation, and test, once your model gets reasonable performance (lower loss), then test the story generation capability of your system
3. Replace GRU with custom LSTM shared with you and test how it works
4. Create custom Elman RNN (h_t = tanh(X_tW + Uh_{t-1} + b)) and compare performance across different RNNs (Custom_ElmanRNN, GRU, Custom_LSTM). Also provide loss curves for each models and saved weights.
5. Provide statistical significance of your model
6. Show different texts generated by your models