In [3]:
#import lib
import os 
import warnings

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow warnings

In [None]:
import time
import numpy as np
import tensorflow as tf


In [5]:
#Downlaod the dataset Shakespeare's works
path_to_file = tf.keras.utils.get_file(
    "shakespeare.txt",
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

In [6]:
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print(f"Length of text: {len(text)} characters")

Length of text: 1115394 characters


In [7]:
print(text[:250])  # Print the first 250 characters

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [8]:
print(text[:1])  # Print the first 1 characters

F


In [9]:
vocab = sorted(set(text))
print(f"{len(vocab)} unique chareacters")

65 unique chareacters


In [32]:
vocab[:10]

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']

# Understanding embedding from text to numbers 

In [None]:
example_texts = ["abcdefg", "xyz"]

# You're breaking each word into individual characters so a machine learning model can understand and work with them.
# This is useful for tasks like text generation, where you want to predict the next character based on the previous ones.
# The `tf.strings.unicode_split` function is used to split the input strings into individual Unicode characters.

# TODO 1
chars = tf.strings.unicode_split(example_texts, input_encoding="UTF-8")
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [None]:
# You're converting characters into unique numbers (IDs) so that a neural network can work with them.
#  funtion to convert characters to IDs
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None
)

In [None]:
# ids_from_chars stores a mapping from each character (like 'a', 'b', 'c') to a unique number (integer ID) — so the computer can understand it.
ids_from_chars(['a','b']) # Example: This will return the IDs for 'a' and 'b' based on the mapping created by `ids_from_chars`.

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([40, 41])>

In [18]:
#  try this on example text , but the ids are based on the entire vocabulary of Shakespeare's works, not just the example texts.
ids = ids_from_chars(chars)
print(ids)

<tf.RaggedTensor [[40, 41, 42, 43, 44, 45, 46], [63, 64, 65]]>


In [None]:
# Now, we need to create a reverse mapping from IDs back to characters. 
# This is useful for converting model predictions (which are in the form of IDs) back into readable text.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary = ids_from_chars.get_vocabulary(),
    invert=True, mask_token=None
)

In [21]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [47]:
output = tf.strings.reduce_join(chars,axis=1).numpy()
output # Convert the IDs back to characters and decode to a string


array([b'abcdefg', b'xyz'], dtype=object)

In [30]:
def text_from_ids(ids):
    """Converts an array of character IDs back to a string."""
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1).numpy().decode('utf-8')


# Work with all datasets {text}

In [33]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, input_encoding="UTF-8"))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>

In [None]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
ids_dataset  # this will be used to create sequences of IDs for training the model

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [38]:
for ids in ids_dataset.take(5):
    print(chars_from_ids(ids).numpy().decode('utf-8'), end='')

First

In [None]:
seq_len = 100  # Length of each sequence
example_per_epoch = len(text) // (seq_len + 1)  # Number of sequences per epoch

In [52]:
sequence = ids_dataset.batch(seq_len + 1, drop_remainder=True)
print(f"Number of sequences in the dataset: {len(list(sequence))}")
for seq in sequence.take(1):
    print(chars_from_ids(seq) )

Number of sequences in the dataset: 11043
tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' '], shape=(101,), dtype=string)


In [53]:
for seq in sequence.take(1):
    print(text_from_ids(seq))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


In [55]:
# function to split a sequence into input and target sequences
def split_input_target(sequence):
    """Splits a sequence into input and target sequences."""
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [58]:
split_input_target(list("TensorFlow"))  # expmale 

(['T', 'e', 'n', 's', 'o', 'r', 'F', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'F', 'l', 'o', 'w'])

In [59]:
dataset = sequence.map(split_input_target)
dataset

<_MapDataset element_spec=(TensorSpec(shape=(100,), dtype=tf.int64, name=None), TensorSpec(shape=(100,), dtype=tf.int64, name=None))>

In [66]:
for example_input, example_target in dataset.take(1):
    print(f"Input: {text_from_ids(example_input)} \n\n")
    print(f"Target: {text_from_ids(example_target)}")

Input: First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


Target: irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


In [67]:
#  trining batch size and buffer size
BATCH_SIZE = 64
BUFFER_SIZE = 10000  # Buffer size for shuffling the dataset

dataset = (
    dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)  # Prefetching for performance
)

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [68]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [69]:
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        # TODO - Create an embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # TODO - Create a GRU layer
        self.gru = tf.keras.layers.GRU(
            rnn_units, return_sequences=True, return_state=True
        )
        # TODO - Finally connect it with a dense layer
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = self.embedding(inputs, training=training)
        # since we are training a text generation model,
        # we use the previous state, in training. If there is no state,
        # then we initialize the state
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [70]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

TypeError: Layer.__init__() takes 1 positional argument but 2 were given