# Text Generator

## Load data

In [None]:
import pandas as pd

# Read the single-shard parquet export of the book into a DataFrame.
# NOTE(review): the hf:// scheme presumably relies on the huggingface_hub
# fsspec backend being installed — confirm for the target environment.
LOTR_PARQUET_URI = "hf://datasets/jeremyarancio/lotr-book/data/train-00000-of-00001-0402ae8116935261.parquet"
lord_of_the_rings = pd.read_parquet(LOTR_PARQUET_URI)
lord_of_the_rings.head()

In [None]:
# Replace the DataFrame with the lowercased text of its first "text" entry.
# The same name is rebound from a DataFrame to a str, so guard the conversion:
# without it, re-running this cell would index into the string and raise.
if not isinstance(lord_of_the_rings, str):
    lord_of_the_rings = lord_of_the_rings["text"].values[0].lower()
len(lord_of_the_rings)

In [None]:
# Peek at the first 100 characters of the lowercased text.
lord_of_the_rings[:100]

## Tokenization

In [None]:
# prompt: Convert the lord_of_the_rings string variable to an array consisting of the characters of the string.

# Character-level tokenization: every character of the text becomes one token.
lord_of_the_rings_array = list(lord_of_the_rings)

In [None]:
# Show the first 10 character tokens.
lord_of_the_rings_array[:10]

## Prepare Train, Val and Test datasets

In [None]:
# prompt: Chunk the lord_of_the_rings_array with a sliding window.
# The length of the sliding window should be 50 characters.
# Create a 2-dimensional array called X that holds the chunks.
# And create a y variable which contains the element following each window (the 50+1-th element).
import numpy as np
from tqdm import tqdm

# Build (input window, next character) pairs for next-character prediction:
# X[i] is the `window_size` characters starting at position i and y[i] is the
# character that immediately follows that window.
window_size = 50

characters = np.asarray(lord_of_the_rings_array)
if len(characters) <= window_size:
    raise ValueError(
        f"Need more than {window_size} characters to build training windows, "
        f"got {len(characters)}"
    )

# sliding_window_view returns a strided, read-only view over `characters`, so
# the overlapping windows are neither copied nor assembled in a Python loop.
# It yields len - window_size + 1 windows; the last one has no following
# character to predict, so it is dropped.
X = np.lib.stride_tricks.sliding_window_view(characters, window_size)[:-1]
y = characters[window_size:]

print(X.shape)
print(y.shape)

In [None]:
# prompt: Split train, val and test sets from the X, y arrays. X is the input and y is the label.
# The ratio should be 80% / 10% / 10% and the data must not be shuffled when splitting.

import numpy as np

# Chronological 80/10/10 split: the boundaries are computed from the sample
# count and the arrays are cut in order, without shuffling.
train_split = int(0.8 * len(X))
val_split = int(0.9 * len(X))

# np.split cuts at the given indices: [:train_split], [train_split:val_split], [val_split:].
X_train, X_val, X_test = np.split(X, [train_split, val_split])
y_train, y_val, y_test = np.split(y, [train_split, val_split])

for caption, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

In [None]:
# Build the sorted character vocabulary once and derive both lookup
# directions (character -> index and index -> character) from it.
vocabulary = sorted(set(lord_of_the_rings_array))
char_to_index = {char: index for index, char in enumerate(vocabulary)}
index_to_char = dict(enumerate(vocabulary))


def _encode_windows(windows):
    """Map every character window to a row of vocabulary indices."""
    return np.array([[char_to_index[char] for char in window] for window in tqdm(windows)])


def _encode_labels(labels):
    """Map every label character to its vocabulary index."""
    return np.array([char_to_index[char] for char in tqdm(labels)])


# Numerical representation of the train, validation and test splits.
X_train_indices = _encode_windows(X_train)
y_train_indices = _encode_labels(y_train)

X_val_indices = _encode_windows(X_val)
y_val_indices = _encode_labels(y_val)

X_test_indices = _encode_windows(X_test)
y_test_indices = _encode_labels(y_test)

## Model

In [None]:
# prompt: Create a GRU network which can learn to generate the next character.
# Use the TensorFlow library. The training input is X_train and the label is y_train.
# The validation dataset is X_val and y_val. Generate a sample text in each epoch
# so that we can see the performance of the model.

import tensorflow as tf

# Hyperparameters of the character-level language model.
vocab_size = len(char_to_index)
embedding_dim = 128
rnn_units = 50

# Embedding -> two stacked GRUs -> softmax over the vocabulary.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
# The first GRU returns the whole sequence so the second GRU can consume it.
model.add(tf.keras.layers.GRU(rnn_units, return_sequences=True))
model.add(tf.keras.layers.GRU(rnn_units))
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

# Labels are integer class indices, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# ## Training
epochs = 2
batch_size = 32

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    # Train a single epoch per call so a sample can be generated after each one.
    model.fit(X_train_indices, y_train_indices, batch_size=batch_size, epochs=1, validation_data=(X_val_indices, y_val_indices))

    # Generate a sample text, seeded with a random window from the test set.
    # np.random.randint's upper bound is exclusive, so passing the length itself
    # makes every test window (including the last one) eligible and also works
    # when the test set holds a single window.
    start_index = np.random.randint(0, len(X_test_indices))
    # Convert the character window to a regular Python string
    generated_text = "".join(X_test[start_index])
    for _ in tqdm(range(100)):
        # Feed only the most recent `window_size` characters to the model.
        input_sequence = np.array([char_to_index[char] for char in generated_text[-window_size:]])
        input_sequence = np.expand_dims(input_sequence, axis=0)

        # Greedy decoding: always append the most probable next character.
        predicted_probabilities = model.predict(input_sequence, verbose=0)[0]
        predicted_index = np.argmax(predicted_probabilities)
        predicted_char = index_to_char[predicted_index]

        generated_text += predicted_char

    print(f"Generated text:{generated_text}")


In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

## Temperature

In [None]:
# prompt: Create sample code where I can set the temperature of the generation
# applied to the model prediction.

# ## Temperature

def generate_text_with_temperature(model, start_sequence, length, temperature=1.0):
    """Generate text character by character, sampling with a temperature.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns, for a batch
            holding one index sequence, a row of next-character probabilities.
        start_sequence: Seed text; every character must be a key of
            ``char_to_index``.
        length: Number of characters to append to the seed.
        temperature: Strictly positive divisor applied to the log-probabilities
            before re-normalising. Values below 1 sharpen the distribution,
            values above 1 flatten it.

    Returns:
        ``start_sequence`` followed by ``length`` sampled characters.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    # `not (t > 0)` also rejects NaN, which `t <= 0` would let through.
    if not temperature > 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    generated_text = start_sequence
    for _ in tqdm(range(length)):
        # Only the most recent `window_size` characters are fed to the model.
        input_sequence = np.array([char_to_index[char] for char in generated_text[-window_size:]])
        input_sequence = np.expand_dims(input_sequence, axis=0)

        # Work in float64: in float32 a small temperature underflows every
        # exponential to zero, and re-normalised float32 probabilities can miss
        # the sum-to-one tolerance enforced by np.random.choice.
        predicted_probabilities = np.asarray(
            model.predict(input_sequence, verbose=0)[0], dtype=np.float64
        )

        # Apply temperature in log space. Zero probabilities become -inf (and
        # stay at probability zero), so the divide warning is silenced.
        with np.errstate(divide="ignore"):
            scaled_logits = np.log(predicted_probabilities) / temperature
        # Subtract the maximum before exponentiating so the largest term is
        # exactly 1 and the normalising sum can never underflow to zero.
        scaled_logits = scaled_logits - np.max(scaled_logits)
        probabilities = np.exp(scaled_logits)
        probabilities = probabilities / np.sum(probabilities)

        predicted_index = np.random.choice(len(probabilities), p=probabilities)
        predicted_char = index_to_char[predicted_index]

        generated_text += predicted_char

    return generated_text

# Example usage with temperature:
# Seed the generation with a random window from the test set. The upper bound
# of np.random.randint is exclusive, so passing the length itself keeps the
# last window selectable and also works when only one window exists.
start_index = np.random.randint(0, len(X_test_indices))
start_sequence = "".join(X_test[start_index])
temp = 0.7  # Adjust temperature here
generated_text = generate_text_with_temperature(model, start_sequence, 100, temperature=temp)
print(f"Generated text with temperature ({temp}):{generated_text}")

# You can experiment with different temperature values (e.g., 0.2, 0.7, 1.2) to see how it affects the generated text.

In [None]:
# prompt: Create code which saves the model's embedding layer to 2 TSV files. The first contains the characters and the second contains the vectors of those characters. Use the '\t' character as the separator in the TSV files.

# ... (Your existing code) ...

# Export the learned character embeddings as two parallel files:
# chars.tsv holds one character per line, vectors.tsv holds the matching
# embedding vector per line with its components separated by tabs.
embedding_layer = model.layers[0]
embedding_weights = embedding_layer.get_weights()[0]

# NOTE(review): the first three vocabulary entries are skipped — presumably
# whitespace/control characters that would break the one-row-per-character
# layout; confirm against the actual vocabulary.
exported_entries = list(char_to_index.items())[3:]
chars = [char for char, index in exported_entries]
vectors = [embedding_weights[index] for char, index in exported_entries]

# Save characters to a TSV file
with open("chars.tsv", "w", encoding="utf-8") as f:
    f.writelines(f"{char}\n" for char in chars)

# Save vectors to a TSV file
with open("vectors.tsv", "w", encoding="utf-8") as f:
    f.writelines("\t".join(map(str, vector)) + "\n" for vector in vectors)

# ... (Rest of your code) ...