# RNN with LSTM vs. Transformer with Self-Attention

Key Differences:

RNN with LSTM: The LSTM processes sequences one step at a time, maintaining a hidden state that captures information about previous steps.
Transformer: The Transformer processes the entire sequence at once using self-attention, which makes it parallelizable and effective at capturing long-range dependencies.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus: two short sentences
text = "This is a simple example of text generation using LSTM. LSTM models are useful for sequence prediction."

# Build the vocabulary; the extra slot leaves room for index 0, which the
# zero-padding below uses
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = 1 + len(tokenizer.word_index)

# Each sentence contributes every prefix of length >= 2 as one training sample
input_sequences = [
    encoded[:prefix_len]
    for sentence in text.split('.')
    for encoded in tokenizer.texts_to_sequences([sentence])
    for prefix_len in range(2, len(encoded) + 1)
]

# Left-pad every prefix with zeros up to the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# The last column is the word to predict; everything before it is the context
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

# Model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential([
    Embedding(total_words, 10, input_length=max_sequence_len-1),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on all prefixes with their one-hot encoded next word
model.fit(X, y, epochs=100, verbose=1)

# Generate text greedily: repeatedly append the most probable next word
seed_text = "This is a"
next_words = 10

for _ in range(next_words):
    # Encode the text generated so far and left-pad it to the model's input length
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
    # The softmax also has a class for index 0 (padding), which has no entry
    # in index_word; stop generating instead of raising KeyError.
    output_word = tokenizer.index_word.get(int(predicted[0]))
    if output_word is None:
        break
    seed_text += " " + output_word

print(seed_text)


## Breaking down the Transformer

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

### Tokenization

The process of converting text data into numerical data is called tokenization. Tokenization is a crucial step in natural language processing (NLP) tasks, as most machine learning algorithms require numerical data as input. In the context of text generation, tokenization is used to convert words into numerical tokens that can be used as input to a neural network model.

In [None]:

from pprint import pprint

# Toy corpus for the Transformer walkthrough
text = "This is a simple example of text generation using Transformer. Transformers are powerful models for sequence prediction."

# Fit a fresh tokenizer on the corpus and derive the vocabulary size
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = 1 + len(tokenizer.word_index)

# Put each dictionary entry on its own line for readability
pretty_word_index = ",\n".join(str(tokenizer.word_index).split(", "))
tokenizer_report = f"""
Tokenizer Explanation:
- The tokenizer is used to convert text data into sequences of tokens.
- The `fit_on_texts` method is used to fit the tokenizer on the text data.
- The `word_index` attribute of the tokenizer contains a dictionary mapping words to their indices.
- `word_index` + 1 is done to account for the padding token.

Total Words: {total_words}
Word Index: 
{pretty_word_index}
"""
print(tokenizer_report)

### N-gram Tokenized Sequence Generation

#### - Tokenizing

In [None]:

# Create input sequences using the tokenized text
# Collect every prefix (length >= 2) of every tokenized sentence
input_sequences = []

print("Splitting text into lines and creating n-grams...")
for line in text.split('.'):
    # texts_to_sequences returns one token list per input text; we pass exactly one
    (token_list,) = tokenizer.texts_to_sequences([line])
    line_report = f"""
    -> Line: {line}
    -> Token List: {token_list}
    """
    print(line_report.strip())
    for prefix_len in range(2, len(token_list) + 1):
        n_gram_sequence = token_list[:prefix_len]
        input_sequences.append(n_gram_sequence)
        print(f"\t-> N-gram Sequence: {n_gram_sequence}")

In [None]:
# Show one n-gram sequence per line
pretty_input_sequences = "],\n".join(str(input_sequences).split("],"))
ngram_report = f"""
Explanation:
- The text is split into lines.
- Each line is tokenized using the tokenizer.
- For each token list, n-gram sequences are created by taking the first i tokens.
- The n-gram sequences are added to the input sequences.

Input Sequences:
{pretty_input_sequences}   
"""
print(ngram_report)

#### - Padding Sequences

In [None]:
# Left-pad every n-gram sequence with zeros up to the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

pretty_padded_input_sequences = "],\n".join(str(input_sequences).split("],"))
padding_report = f"""
Explanation:
- The input sequences are padded to the maximum sequence length.
- The maximum sequence length is the length of the longest n-gram sequence.
- The input sequences are converted to a numpy array.
- The sequences are padded with zeros at the beginning.

Padded Input Sequences: 
{pretty_padded_input_sequences}
"""
print(padding_report)

#### - Create Predictors and Label

In [None]:
# Split each padded sequence into context (all but the last token) and target (last token)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

split_explanation = f"""
Explanation:
- The predictors are the input sequences without the last token.
- The label is the last token in the input sequences.
- The label is the last token because we are predicting the next token.
"""
print(split_explanation)

predictors_report = f"""
Predictors:
{X}
"""
print(predictors_report.strip())

label_report = f"""
Label:
{y}
"""
print(label_report)


#### - One-Hot Encoding the Label

In [None]:
# One-hot encode the integer labels. There are `total_words` classes:
# one per vocabulary word plus one for index 0, which is used for padding.
hot_y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# The explanation states the class count as computed above (unique words + 1);
# the previous wording omitted the padding class.
print(f"""
Explanation:
- The number of classes is the number of unique words in the text plus one for the padding index 0.
- The label is represented as a one-hot encoded vector.
    
One-hot Encoded Label:
{hot_y}
""")
      

In [None]:
import pandas as pd

visualization_note = f"""
Visualization Explanation:
We are converting the one-hot encoded label to a DataFrame for better visualization.
So we can understand the one-hot encoded label better.
"""
print(visualization_note)

# Column 0 is the padding slot; the remaining columns are labelled "<word>_<index>"
# in the order the tokenizer's word_index lists them
word_idx_columns = [f"{word}_{idx}" for word, idx in tokenizer.word_index.items()]
pd.DataFrame(hot_y, columns=["padding_index_0", *word_idx_columns])

In [None]:

# Define the Transformer model
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input. Also used as the
            per-head key dimension and as the feed-forward output size, so
            the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs` and return a tensor of the same shape.

        Args:
            inputs: Tensor whose last axis has size `embed_dim`.
            training: Forwarded to the dropout layers. Defaults to None so
                the layer can be called as `block(x)` even when the
                framework does not inject a value for this argument.
        """
        # Self-attention: the sequence attends to itself (query == value)
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Position-wise feed-forward network with its own residual connection
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Output size of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index."""
        # Position ids 0..seq_len-1, taken from the last axis of the input
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(seq_len))
        token_vectors = self.token_emb(x)
        # The position vectors broadcast over the leading axes of the token vectors
        return token_vectors + position_vectors

# Hyperparameters
maxlen = max_sequence_len - 1  # input length: a padded sequence without its label
embed_dim = 32
num_heads = 2
ff_dim = 32

# Token+position embedding -> one Transformer block -> pooled classifier head
inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, total_words, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
outputs = Dense(total_words, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model on the one-hot labels: categorical_crossentropy needs targets
# shaped like the softmax output, and `y` holds plain integer word indices.
model.fit(X, hot_y, epochs=100, verbose=1)

# Generate text greedily: repeatedly append the most probable next word
seed_text = "This is a"
next_words = 10

for _ in range(next_words):
    # Encode the text generated so far and left-pad it to the model's input length
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
    # The softmax also has a class for index 0 (padding), which has no entry
    # in index_word; stop generating instead of raising KeyError.
    output_word = tokenizer.index_word.get(int(predicted[0]))
    if output_word is None:
        break
    seed_text += " " + output_word

print(seed_text)
