# Implementing text generation with Keras

In [37]:
# !wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz

In [19]:
import tensorflow as tf
from tensorflow import keras

In [22]:
# import chardet

# with open('0_2.txt', 'rb') as f:
#     rawdata = f.read()
#     result = chardet.detect(rawdata)
#     encoding = result['encoding']

In [38]:
# encoding

In [33]:
import os
import re

def remove_html_line_breaks(text):
  """Removes HTML line breaks (<br />) from a text string and replaces them with spaces.

  Args:
      text: The text string to process.

  Returns:
      The text string with HTML line breaks removed.
  """
  return re.sub(r"<br />", " ", text)

def process_text_files(directory):
  """Processes text files in a directory, removing HTML line breaks.

  Args:
      directory: The directory containing the text files.

  Returns:
      A list of processed text strings.
  """
  processed_texts = []
  for subdir, _, filenames in os.walk(directory):
    for filename in filenames:
      filepath = os.path.join(subdir, filename)
      with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()
        processed_text = remove_html_line_breaks(text)
        processed_texts.append(processed_text)
  return processed_texts

In [34]:
# Example usage
directory = "aclImdb/"  # Replace with your directory path
processed_texts = process_text_files(directory)

In [39]:
# processed_texts

In [41]:
# # Access processed text (assuming no labels):
# for text in processed_texts:
#   print(text)

In [45]:
dataset = processed_texts
# dataset = keras.utils.text_dataset_from_directory(
#     directory='aclImdb', label_mode=None, batch_size=256,
# )
# dataset = dataset.map(lambda x: tf.strings.regex_replace(x, "<br />", ' '))

In [46]:
from keras.layers import TextVectorization

In [47]:
sequence_length=100
vocab_size=15000
text_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [48]:
text_vectorization.adapt(dataset)

In [49]:
def prepare_lm_dataset(text_batch):
    vectorized_sequences=text_vectorization(text_batch)
    x = vectorized_sequences[:, :-1]
    y = vectorized_sequences[:, 1:]
    return x, y

In [50]:
lm_dataset = prepare_lm_dataset(dataset)

In [51]:
from tensorflow.keras import layers

In [52]:
embed_dim = 256
latent_dim = 2048
num_heads = 2

In [54]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
        input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
        input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions
    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
    def get_config(self):
        config = super().get_config()
        config.update({
        "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,})
        return config

In [55]:
inputs = keras.Input(shape=(None, ), dtype='int64')
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(inputs)




ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```
