In [None]:
#!wget https://www.gutenberg.org/cache/epub/1041/pg1041.txt
!wget https://www.gutenberg.org/ebooks/100.txt.utf-8


--2025-11-18 09:37:26--  https://www.gutenberg.org/ebooks/100.txt.utf-8
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://www.gutenberg.org/cache/epub/100/pg100.txt [following]
--2025-11-18 09:37:27--  http://www.gutenberg.org/cache/epub/100/pg100.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.gutenberg.org/cache/epub/100/pg100.txt [following]
--2025-11-18 09:37:28--  https://www.gutenberg.org/cache/epub/100/pg100.txt
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5638525 (5.4M) [text/plain]
Saving to: ‘100.txt.utf-8’


2025-11-18 09:37:30 (2.97 MB/s) - ‘100.txt.utf-8’ saved [5638525/563

In [None]:
def extract_gutenberg_content(source_filename, target_filename):
    """
    Copy the body of a Project Gutenberg text file into a new file.

    The source is streamed line by line, so the whole file is never held in
    memory. Lines that come after the first line containing
    '*** START OF THE PROJECT GUTENBERG EBOOK' and before the next line
    containing '*** END OF THE PROJECT GUTENBERG EBOOK' are written to the
    target file. The marker lines themselves are not written.

    Nothing is raised to the caller; a status message is printed instead:
    success, a warning when the start marker never appears (the target file
    is then left empty), or an error description when reading/writing fails.

    Args:
        source_filename (str): The path to the input text file.
        target_filename (str): The path to the output text file.

    Returns:
        None
    """
    start_marker = '*** START OF THE PROJECT GUTENBERG EBOOK'
    end_marker = '*** END OF THE PROJECT GUTENBERG EBOOK'
    found_start = False

    try:
        with open(source_filename, 'r', encoding='utf-8') as infile, \
                open(target_filename, 'w', encoding='utf-8') as outfile:
            for line in infile:
                if not found_start:
                    # Still in the header: skip every line up to and including
                    # the start marker line itself.
                    found_start = start_marker in line
                    continue
                if end_marker in line:
                    # Everything after the end marker is licence boilerplate.
                    break
                outfile.write(line)
    except FileNotFoundError:
        print(f"Error: The file '{source_filename}' was not found.")
        return
    except Exception as e:
        print(f"An error occurred: {e}")
        return

    if not found_start:
        # Do not claim success when nothing could be extracted.
        print(f"Warning: start marker not found in '{source_filename}'; '{target_filename}' is empty.")
        return
    print(f"Successfully extracted content from '{source_filename}' to '{target_filename}'")




In [None]:
# Example usage with the downloaded file:
#source_file = 'palamarchuk.txt'
#source_file = 'pg1041.txt'
source_file = '100.txt.utf-8'
target_file = 'Shakespeare.txt'
extract_gutenberg_content(source_file, target_file)

Successfully extracted content from '100.txt.utf-8' to 'Shakespeare.txt'


In [None]:
import re

def create_vocabulary(filename):
    """
    Build a word-frequency table for a UTF-8 text file.

    Every line is lower-cased and split into runs of word characters
    (letters, digits and underscore); any other character acts as a
    separator, which is how punctuation is dropped.

    Args:
        filename (str): The path to the input text file.

    Returns:
        dict: Maps each word to its number of occurrences, in order of first
        appearance. None is returned (after printing a message) when the file
        does not exist or another error occurs while reading it.
    """
    word_pattern = re.compile(r'\b\w+\b')
    counts = {}
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                for token in word_pattern.findall(raw_line.lower()):
                    if token in counts:
                        counts[token] += 1
                    else:
                        counts[token] = 1
    except FileNotFoundError:
        print(f"Error: The file '{filename}' was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    return counts



In [None]:

# Build the vocabulary from the extracted text file
vocabulary_dict = create_vocabulary(target_file) # target_file was set to 'Shakespeare.txt' in an earlier cell

# create_vocabulary returns None on failure, so only report when a non-empty vocabulary came back
if vocabulary_dict:
    print(f"Vocabulary created successfully from '{target_file}'. Total unique words: {len(vocabulary_dict)}")
    # Show the ten most frequent words; sorted_words is reused by a later cell
    print("\nTop 10 most common words:")
    sorted_words = sorted(vocabulary_dict.items(), key=lambda item: item[1], reverse=True)
    for word, count in sorted_words[:10]:
        print(f"'{word}': {count}")

Vocabulary created successfully from 'Shakespeare.txt'. Total unique words: 25750

Top 10 most common words:
'the': 30334
'and': 28500
'i': 24188
'to': 20762
'of': 18756
'a': 16436
'you': 14653
'my': 13201
'in': 12525
'that': 12267


In [None]:
vocabulary_dict

{'the': 30334,
 'complete': 20,
 'works': 29,
 'of': 18756,
 'william': 85,
 'shakespeare': 4,
 'by': 4123,
 'contents': 67,
 'sonnets': 5,
 'all': 4343,
 's': 8621,
 'well': 2396,
 'that': 12267,
 'ends': 55,
 'tragedy': 28,
 'antony': 534,
 'and': 28500,
 'cleopatra': 275,
 'as': 6209,
 'you': 14653,
 'like': 2028,
 'it': 8244,
 'comedy': 12,
 'errors': 15,
 'coriolanus': 209,
 'cymbeline': 130,
 'hamlet': 473,
 'prince': 775,
 'denmark': 29,
 'first': 1393,
 'part': 633,
 'king': 3204,
 'henry': 684,
 'fourth': 65,
 'second': 631,
 'life': 967,
 'fifth': 36,
 'sixth': 35,
 'third': 280,
 'eighth': 6,
 'death': 986,
 'john': 516,
 'julius': 19,
 'caesar': 609,
 'lear': 234,
 'love': 2462,
 'labour': 113,
 'lost': 290,
 'macbeth': 288,
 'measure': 99,
 'for': 8278,
 'merchant': 69,
 'venice': 78,
 'merry': 183,
 'wives': 63,
 'windsor': 46,
 'a': 16436,
 'midsummer': 5,
 'night': 785,
 'dream': 139,
 'much': 1101,
 'ado': 21,
 'about': 430,
 'nothing': 681,
 'othello': 336,
 'moor': 8

In [None]:
def create_probability_dict(vocabulary_dict):
    """
    Converts a word count dictionary into a word probability dictionary.

    Each probability is the word's count divided by the sum of all counts,
    so the returned values add up to 1 for a non-empty vocabulary.

    Args:
        vocabulary_dict (dict or None): A dictionary with words as keys and
            their frequencies as values. None (what create_vocabulary returns
            when the file could not be read) is accepted and treated as empty.

    Returns:
        dict: A dictionary with words as keys and their probabilities as
        values; empty when the input is None or has no entries.
    """
    # Guard the failure path of the vocabulary builder instead of crashing
    # with AttributeError on None.values().
    if not vocabulary_dict:
        return {}

    total_words = sum(vocabulary_dict.values())
    return {word: count / total_words for word, count in vocabulary_dict.items()}


In [None]:

# Example usage:
probability_dict = create_probability_dict(vocabulary_dict)

if probability_dict:
    print(f"\nProbability dictionary created successfully. Showing probabilities for the top 10 most common words:")
    # Use the previously sorted_words for consistency
    for word, count in sorted_words[:10]:
        print(f"'{word}': {probability_dict[word]:.6f}")



Probability dictionary created successfully. Showing probabilities for the top 10 most common words:
'the': 0.030551
'and': 0.028704
'i': 0.024361
'to': 0.020910
'of': 0.018890
'a': 0.016553
'you': 0.014758
'my': 0.013295
'in': 0.012615
'that': 0.012355


In [None]:
probability_dict['murder']

0.0001480505668221701

In [None]:
def create_word2index(vocabulary):
    """
    Assign every vocabulary word a unique integer id.

    Ids are handed out in sorted word order, starting at 0, so the mapping is
    deterministic for a given set of words.

    Args:
        vocabulary (dict): A dictionary where keys are words.

    Returns:
        dict: A dictionary mapping words to their integer indices.
    """
    ordered_words = sorted(vocabulary.keys())
    return dict(zip(ordered_words, range(len(ordered_words))))

def create_index2word(word2index):
    """
    Invert a word-to-index mapping.

    Args:
        word2index (dict): A dictionary mapping words to their integer indices.

    Returns:
        dict: A dictionary mapping integer indices to words. If two words
        share an index, the one appearing later in word2index is kept.
    """
    index2word = {}
    for word, index in word2index.items():
        index2word[index] = word
    return index2word



In [None]:
# Example usage with the previously created vocabulary_dict
word2index = create_word2index(vocabulary_dict)
index2word = create_index2word(word2index)

print(f"\nWord2Index dictionary created. Sample: {list(word2index.items())[:5]}")
print(f"Index2Word dictionary created. Sample: {list(index2word.items())[:5]}")

# Test a word and its index
example_word = 'love'
if example_word in word2index:
    idx = word2index[example_word]
    retrieved_word = index2word[idx]
    print(f"\nTesting '{example_word}': Index = {idx}, Retrieved word = '{retrieved_word}'")
else:
    print(f"\nWord '{example_word}' not found in vocabulary.")



Word2Index dictionary created. Sample: [('1', 0), ('10', 1), ('100', 2), ('1000', 3), ('1004', 4)]
Index2Word dictionary created. Sample: [(0, '1'), (1, '10'), (2, '100'), (3, '1000'), (4, '1004')]

Testing 'love': Index = 13801, Retrieved word = 'love'


In [None]:
index2word[1801]

'anatomy'

In [None]:
index2word[100]

'146'

In [None]:
word2index['memory']

14412

In [None]:
# Register a special unknown-word token in both lookup tables (word2index and index2word).
# Real words occupy indices 0..len(vocabulary_dict)-1, so len(vocabulary_dict) is the first free index.

unk_word = '<unk>'
unk_index = len(vocabulary_dict) # First free index; vocabulary_dict itself is not modified here

# Re-running the cell must not create a second entry, so reuse the existing index if present
if unk_word not in word2index:
    word2index[unk_word] = unk_index
    index2word[unk_index] = unk_word
    print(f"Added '{unk_word}' to vocabulary with index {unk_index}")
else:
    unk_index = word2index[unk_word]
    print(f"'{unk_word}' already exists in vocabulary with index {unk_index}")

def split_to_tokens(input_string, word2index, unk_token='<unk>'):
    """
    Turn a string into a list of integer token ids.

    The string is lower-cased and split into runs of word characters with the
    same pattern used to build the vocabulary. Each word is replaced by its id
    from word2index; words missing from word2index get the id of unk_token.

    Args:
        input_string (str): The string to tokenize.
        word2index (dict): A dictionary mapping words to their integer indices.
        unk_token (str): The string representing the unknown token (e.g., 'unk').

    Returns:
        list: A list of integer tokens.

    Raises:
        ValueError: If unk_token is not a key of word2index.
    """
    # Fail fast: without an UNK entry there is no fallback id for unseen words.
    if unk_token not in word2index:
        raise ValueError(f"UNK token '{unk_token}' not found in word2index. Please add it.")

    fallback_id = word2index[unk_token]
    lowered = input_string.lower()
    return [word2index.get(word, fallback_id) for word in re.findall(r'\b\w+\b', lowered)]




Added '<unk>' to vocabulary with index 25750


In [None]:
# Example usage:
sentence1 = "Бути чи не бути - у цьому питання."
tokens1 = split_to_tokens(sentence1, word2index)
print(f"\nSentence: '{sentence1}'")
print(f"Tokens: {tokens1}")
print(f"Decoded: {[index2word[t] for t in tokens1]}")

sentence2 = "А врода юності більш не цвістиме."
tokens2 = split_to_tokens(sentence2, word2index)
print(f"\nSentence: '{sentence2}'")
print(f"Tokens: {tokens2}")
print(f"Decoded: {[index2word[t] for t in tokens2]}")

sentence3 = "Абсолютно нова фраза про глибинне навчання."
tokens3 = split_to_tokens(sentence3, word2index)
print(f"\nSentence: '{sentence3}'")
print(f"Tokens: {tokens3}")
print(f"Decoded: {[index2word[t] for t in tokens3]}")


Sentence: 'Бути чи не бути - у цьому питання.'
Tokens: [25750, 25750, 25750, 25750, 25750, 25750, 25750]
Decoded: ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']

Sentence: 'А врода юності більш не цвістиме.'
Tokens: [25750, 25750, 25750, 25750, 25750, 25750]
Decoded: ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']

Sentence: 'Абсолютно нова фраза про глибинне навчання.'
Tokens: [25750, 25750, 25750, 25750, 25750, 25750]
Decoded: ['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']


# Task
Prepare RNN training data by reading the 'sonnets.txt' file, tokenizing its entire content, and then generating input sequences and corresponding target tokens suitable for training a recurrent neural network.

## Prepare RNN Training Data

### Subtask:
Read the entire 'sonnets.txt' file, tokenize it, and then generate input sequences and corresponding target tokens suitable for training a recurrent neural network. This will involve creating sequences of a fixed length and preparing the target labels.


**Reasoning**:
First, I'll define the sequence length and read the content of the extracted text file (`target_file`, which is `Shakespeare.txt` in this run). Then, I'll tokenize the text using the previously defined `split_to_tokens` function to prepare for sequence generation.



In [None]:
# Number of consecutive tokens in each model input window
SEQUENCE_LENGTH = 20

# Load the whole extracted text into memory. If the file is missing, report it and
# continue with an empty string so the cell still runs (every length below is then 0).
try:
    with open(target_file, 'r', encoding='utf-8') as f:
        text_content = f.read()
except FileNotFoundError:
    print(f"Error: The file '{target_file}' was not found.")
    text_content = ""

# Tokenize the entire text content into one flat list of integer ids
text_as_int = split_to_tokens(text_content, word2index, unk_token='<unk>')

# Sanity check: sizes, plus a round trip of the first tokens back to words
print(f"Original text length (characters): {len(text_content)}")
print(f"Tokenized text length (tokens): {len(text_as_int)}")
print(f"First 10 tokens: {text_as_int[:10]}")
print(f"Decoded first 10 tokens: {[index2word[t] for t in text_as_int[:10]]}")

Original text length (characters): 5359350
Tokenized text length (tokens): 992904
First 10 tokens: [22517, 5325, 25498, 15671, 25287, 20131, 4020, 25287, 20131, 5673]
Decoded first 10 tokens: ['the', 'complete', 'works', 'of', 'william', 'shakespeare', 'by', 'william', 'shakespeare', 'contents']


**Reasoning**:
Now that the entire text is tokenized, I will iterate through the `text_as_int` list to create input sequences of `SEQUENCE_LENGTH` and their corresponding target tokens, as specified in the subtask instructions.



In [None]:
# Sliding-window dataset: each sample is SEQUENCE_LENGTH consecutive tokens and its
# label is the token that immediately follows the window.
input_sequences = []
target_tokens = []

# The window advances one token at a time, so neighbouring samples overlap in all but
# one token; the loop stops where a full window plus one following token still fits.
for i in range(len(text_as_int) - SEQUENCE_LENGTH):
    seq_in = text_as_int[i : i + SEQUENCE_LENGTH]
    seq_out = text_as_int[i + SEQUENCE_LENGTH]
    input_sequences.append(seq_in)
    target_tokens.append(seq_out)

# Inspect the first sample. Indexing [0] raises IndexError if the text has no more
# than SEQUENCE_LENGTH tokens (e.g. when the file could not be read).
print(f"Total number of sequences: {len(input_sequences)}")
print(f"Length of first input sequence: {len(input_sequences[0])}")
print(f"First input sequence: {input_sequences[0]}")
print(f"Decoded first input sequence: {[index2word[t] for t in input_sequences[0]]}")
print(f"First target token: {target_tokens[0]}")
print(f"Decoded first target token: {index2word[target_tokens[0]]}")


Total number of sequences: 992884
Length of first input sequence: 20
First input sequence: [22517, 5325, 25498, 15671, 25287, 20131, 4020, 25287, 20131, 5673, 22517, 20932, 1651, 19436, 25069, 22510, 8262, 25069, 22517, 22989]
Decoded first input sequence: ['the', 'complete', 'works', 'of', 'william', 'shakespeare', 'by', 'william', 'shakespeare', 'contents', 'the', 'sonnets', 'all', 's', 'well', 'that', 'ends', 'well', 'the', 'tragedy']
First target token: 15671
Decoded first target token: of


## Define and Compile Generic RNN Model

### Subtask:
Construct a basic RNN model using TensorFlow's Keras API. This model will typically include an Embedding layer, a SimpleRNN layer, and a Dense output layer with softmax activation for predicting the next word. The model will then be compiled with an optimizer and loss function.


**Reasoning**:
The subtask requires constructing and compiling a basic RNN model. This involves importing necessary Keras components, defining model parameters like vocabulary size and embedding dimensions, and then sequentially adding layers (Embedding, SimpleRNN, Dense) before compiling the model with an optimizer and loss function, all of which can be done in a single code block.



**Reasoning**:
The previous code block generated a UserWarning indicating that the `input_length` argument in the `Embedding` layer is deprecated. Removing this argument will align the code with current Keras practices and eliminate the warning.



**Reasoning**:
The previous model summary showed 0 trainable parameters because the model had not yet been built. To ensure the model is fully constructed and the summary reflects the correct parameter count, I need to explicitly build the model by providing the input shape to the `build` method. The input shape for the `Embedding` layer is `(None, SEQUENCE_LENGTH)`, where `None` allows for a flexible batch size.



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Vocabulary size: one id per known word plus the unknown-word token
vocab_size = len(word2index) # word2index already contains the <unk> token

# Size of the learned dense vector for each token id
embedding_dim = 256 # Can be tuned, common values are 100, 256, 512

# Layers are stacked in order: Embedding -> SimpleRNN -> Dense
model = Sequential()

# Embedding layer: maps each integer token id to an embedding_dim-sized vector
model.add(Embedding(vocab_size, embedding_dim))

# Recurrent layer: return_sequences=False keeps only the output after the last token
rnn_units = 512 # Number of RNN units, can be tuned
model.add(SimpleRNN(rnn_units, return_sequences=False))

# Output layer: one softmax unit per vocabulary entry
# The output layer should have vocab_size units for predicting the next word's probability
model.add(Dense(vocab_size, activation='softmax'))

# Explicitly build the model to see the parameter counts in the summary
# The input shape to the embedding layer will be (batch_size, SEQUENCE_LENGTH)
model.build(input_shape=(None, SEQUENCE_LENGTH))

# Compile the model
# Using 'adam' optimizer and 'sparse_categorical_crossentropy' for integer-encoded targets
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print a summary of the model to verify its architecture
model.summary()
print(f"RNN model constructed and compiled with vocab_size={vocab_size}, embedding_dim={embedding_dim}, and rnn_units={rnn_units}")

RNN model constructed and compiled with vocab_size=25751, embedding_dim=256, and rnn_units=512


## Train the RNN Model

### Subtask:
Train the defined RNN model using the prepared sequences for a few epochs to demonstrate the training process.


## Summary:

### Data Analysis Key Findings

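*   **Note:** the figures in this summary do not match the cell outputs shown in this notebook and appear to come from an earlier run on `sonnets.txt`. The run shown here processes `Shakespeare.txt`: 5,359,350 characters, 992,904 tokens, 992,884 sequences and `vocab_size=25751`.
*   The `sonnets.txt` file was successfully processed, containing 96,216 characters, which were tokenized into 18,297 numerical tokens.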
*   Using a sequence length of 20, 18,277 input sequences and corresponding target tokens were generated for RNN training. The first input sequence, when decoded, was confirmed to be `['the', 'sonnets', 'by', 'william', 'shakespeare', '<unk>', 'from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but']`.
*   A Sequential RNN model was constructed with an Embedding layer (vocab_size=18298, embedding\_dim=256), a SimpleRNN layer (rnn\_units=512), and a Dense output layer with softmax activation.
*   The model was successfully compiled using the 'adam' optimizer and 'sparse\_categorical\_crossentropy' loss function, with a total of 2,862,218 trainable parameters.
*   Explicitly calling `model.build(input_shape=(None, SEQUENCE_LENGTH))` was necessary to properly build the model and display accurate parameter counts in the model summary.

### Insights or Next Steps

*   The prepared `input_sequences` and `target_tokens` are now ready for training the defined RNN model.
*   Consider experimenting with different `SEQUENCE_LENGTH` values, `embedding_dim`, and `rnn_units` to optimize model performance, as well as exploring more advanced RNN architectures like GRUs or LSTMs.


# Task
**Train the RNN model** using the prepared `input_sequences` and `target_tokens` for a specified number of epochs, converting them into TensorFlow tensors or NumPy arrays before training.

## Train the RNN Model

### Subtask:
Train the defined RNN model using the prepared sequences (`input_sequences` and `target_tokens`). This will involve using the `model.fit()` method for a specified number of epochs. It will convert the lists of input sequences and target tokens into TensorFlow tensors or NumPy arrays before training.


**Reasoning**:
To train the RNN model, I need to convert the `input_sequences` and `target_tokens` into NumPy arrays, define the number of epochs, and then use the `model.fit()` method.



In [None]:
import numpy as np

# Convert lists to NumPy arrays
X = np.array(input_sequences)
y = np.array(target_tokens)



(992884, 20)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets: first 80% for training, last 20% for testing.
# The split is contiguous (shuffle=False) on purpose: the windows were generated with a
# stride of one token, so neighbouring samples share all but one token. A shuffled split
# would place near-duplicates of almost every test window in the training set and make
# the validation metrics look better than they are.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (794307, 20)
y_train shape: (794307,)
X_test shape: (198577, 20)
y_test shape: (198577,)


In [None]:
# Define the number of training epochs and batch size
epochs = 25  # You can adjust this value
batch_size = 64 # You can adjust this value

print(f"Training model for {epochs} epochs with batch size {batch_size}...")

# Train the model, including validation data; the returned history holds the per-epoch metrics
# NOTE(review): in the logged run val_loss rises after the first epoch (6.71 -> 6.98 by epoch 5)
# while the training loss barely moves — consider fewer epochs or an early-stopping callback.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, validation_data=(X_test, y_test))

print("Model training complete.")

Training model for 25 epochs with batch size 64...
Epoch 1/25
[1m12412/12412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 12ms/step - accuracy: 0.0471 - loss: 7.3005 - val_accuracy: 0.0772 - val_loss: 6.7106
Epoch 2/25
[1m12412/12412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 11ms/step - accuracy: 0.0766 - loss: 6.6017 - val_accuracy: 0.0755 - val_loss: 6.7923
Epoch 3/25
[1m12412/12412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 11ms/step - accuracy: 0.0765 - loss: 6.5671 - val_accuracy: 0.0747 - val_loss: 6.8460
Epoch 4/25
[1m12412/12412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 11ms/step - accuracy: 0.0739 - loss: 6.5792 - val_accuracy: 0.0680 - val_loss: 6.9467
Epoch 5/25
[1m12412/12412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 11ms/step - accuracy: 0.0712 - loss: 6.6018 - val_accuracy: 0.0666 - val_loss: 6.9832
Epoch 6/25
[1m12412/12412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 11ms/step - accuracy:

In [None]:
# Decode the integer windows and targets back to words for visual inspection.
# NOTE(review): this decodes every row of X and y into string arrays although only the
# first six are printed below; slicing X[:6] / y[:6] first would avoid the large copy —
# confirm nothing else relies on the full X_w / y_w before changing it.
X_w = np.array([[index2word[token_id] for token_id in seq] for seq in X])
y_w = np.array([index2word[token_id] for token_id in y])

print(f"First 6 word sequences from X_w:\n{X_w[:6]}")
print(f"First 6 target words from y_w:\n{y_w[:6]}")

First 6 word sequences from X_w:
[['the' 'complete' 'works' 'of' 'william' 'shakespeare' 'by' 'william'
  'shakespeare' 'contents' 'the' 'sonnets' 'all' 's' 'well' 'that' 'ends'
  'well' 'the' 'tragedy']
 ['complete' 'works' 'of' 'william' 'shakespeare' 'by' 'william'
  'shakespeare' 'contents' 'the' 'sonnets' 'all' 's' 'well' 'that' 'ends'
  'well' 'the' 'tragedy' 'of']
 ['works' 'of' 'william' 'shakespeare' 'by' 'william' 'shakespeare'
  'contents' 'the' 'sonnets' 'all' 's' 'well' 'that' 'ends' 'well' 'the'
  'tragedy' 'of' 'antony']
 ['of' 'william' 'shakespeare' 'by' 'william' 'shakespeare' 'contents'
  'the' 'sonnets' 'all' 's' 'well' 'that' 'ends' 'well' 'the' 'tragedy'
  'of' 'antony' 'and']
 ['william' 'shakespeare' 'by' 'william' 'shakespeare' 'contents' 'the'
  'sonnets' 'all' 's' 'well' 'that' 'ends' 'well' 'the' 'tragedy' 'of'
  'antony' 'and' 'cleopatra']
 ['shakespeare' 'by' 'william' 'shakespeare' 'contents' 'the' 'sonnets'
  'all' 's' 'well' 'that' 'ends' 'well' 'the' '

In [None]:
embedding_layer_weights = model.layers[0].get_weights()[0]

# Get the embedding vectors for a few example words
example_words = ['the', 'love', 'my', '<unk>'] # Include '<unk>' as it's part of the vocabulary

print("Dense vectors (embeddings) for example words:")
for word in example_words:
    if word in word2index:
        word_idx = word2index[word]
        embedding_vector = embedding_layer_weights[word_idx]
        print(f"Embedding for '{word}' (index {word_idx}):\n{embedding_vector[:10]}... (first 10 dimensions)\n")
    else:
        print(f"Warning: '{word}' not found in word2index.")

print(f"Shape of the full embedding matrix: {embedding_layer_weights.shape}")

Dense vectors (embeddings) for example words:
Embedding for 'the' (index 22517):
[-0.24563281  0.15910712  0.356715   -0.33681467 -0.15767288 -1.0483623
 -0.36054176 -0.17794757 -0.6443147   0.6904135 ]... (first 10 dimensions)

Embedding for 'love' (index 13801):
[ 0.07236487 -0.02925708 -0.8758572  -0.05929389  0.39786392 -0.20800595
 -0.12630692 -0.44916284  1.114932    0.21612057]... (first 10 dimensions)

Embedding for 'my' (index 15146):
[-0.13908058 -1.2326882   1.2233733   0.4714669  -0.44261453  0.4388383
 -0.15082264 -0.43022165 -0.16954488 -0.76495725]... (first 10 dimensions)

Embedding for '<unk>' (index 25750):
[ 0.01976272 -0.03745591 -0.04290277 -0.04686587 -0.01097163 -0.02025989
 -0.01092551  0.03548136  0.00417491  0.02820674]... (first 10 dimensions)

Shape of the full embedding matrix: (25751, 256)


In [None]:
def predict_next_word(model, input_text, word2index, index2word, sequence_length, unk_token='<unk>'):
    """
    Return the single most likely next word for a text prompt.

    The prompt is tokenized with split_to_tokens and fitted to exactly
    sequence_length tokens: longer prompts keep only their last
    sequence_length tokens, shorter ones are left-padded with the id of
    unk_token. The model is then queried with a batch of one, and the word
    with the highest predicted probability is returned (greedy decoding).

    Args:
        model (tf.keras.Model): The trained RNN model.
        input_text (str): The text prompt to generate the next word from.
        word2index (dict): Dictionary mapping words to their integer indices.
        index2word (dict): Dictionary mapping integer indices to words.
        sequence_length (int): The fixed length of input sequences for the model.
        unk_token (str): The string representing the unknown token.

    Returns:
        str: The predicted next word, or unk_token if the predicted index has
        no entry in index2word.
    """
    token_ids = split_to_tokens(input_text, word2index, unk_token=unk_token)

    # Fit the prompt to the model's fixed window size.
    shortfall = sequence_length - len(token_ids)
    if shortfall < 0:
        # Too long: the most recent tokens carry the relevant context.
        token_ids = token_ids[-sequence_length:]
    elif shortfall > 0:
        # Too short: pad on the left so the real tokens stay at the end.
        pad_id = word2index.get(unk_token, 0)
        token_ids = [pad_id] * shortfall + token_ids

    # Shape (1, sequence_length): a batch containing this single prompt.
    batch = np.array(token_ids).reshape(1, sequence_length)
    probabilities = model.predict(batch, verbose=0)[0]

    best_index = np.argmax(probabilities)
    return index2word.get(best_index, unk_token)



In [None]:
input_string = """Then are we all undone.
It is not possible, it cannot be,
The King should keep his word in loving us;
He will suspect us still, and find a time
To punish this offence in other faults.
Suspicion all our lives shall be stuck full of eyes,
For treason is but trusted like the fox,
Who, ne’er so tame, so cherish’d and lock’d up,
Will have a wild trick of his ancestors.
Look how we can, or sad or merrily,
Interpretation will misquote our looks,

"""

# Generate the next word
next_word = predict_next_word(model, input_string, word2index, index2word, SEQUENCE_LENGTH, unk_token='<unk>')

print(f"Input text:\n{input_string}")
print(f"Predicted next word: '{next_word}'")

Input text:
Then are we all undone.
It is not possible, it cannot be,
The King should keep his word in loving us;
He will suspect us still, and find a time
To punish this offence in other faults.
Suspicion all our lives shall be stuck full of eyes,
For treason is but trusted like the fox,
Who, ne’er so tame, so cherish’d and lock’d up,
Will have a wild trick of his ancestors.
Look how we can, or sad or merrily,
Interpretation will misquote our looks,


Predicted next word: 'i'


In [None]:
def generate_k_words(model, initial_prompt, word2index, index2word, sequence_length, num_words_to_generate, unk_token='<unk>'):
    """
    Generates a sequence of k words based on an initial prompt.

    Words are produced one at a time: each predicted word is appended to the
    text (separated by a single space) and the extended text becomes the
    context for the next prediction.

    Args:
        model (tf.keras.Model): The trained RNN model.
        initial_prompt (str): The starting text prompt.
        word2index (dict): Dictionary mapping words to their integer indices.
        index2word (dict): Dictionary mapping integer indices to words.
        sequence_length (int): The fixed length of input sequences for the model.
        num_words_to_generate (int): The number of words to generate.
        unk_token (str): The string representing the unknown token.

    Returns:
        str: The generated text, including the initial prompt and the predicted words.
    """
    generated_text = initial_prompt

    for _ in range(num_words_to_generate):
        # Pass the whole text: predict_next_word tokenizes it and keeps only the last
        # sequence_length tokens. Cutting the context by whitespace-separated words
        # here would under-fill that window whenever a "word" is pure punctuation
        # (it produces no token), forcing UNK padding although real context exists.
        # Trade-off: the full text is re-tokenized on every step.
        next_word = predict_next_word(model, generated_text, word2index, index2word, sequence_length, unk_token)
        generated_text += " " + next_word  # Append the predicted word

    return generated_text



In [None]:

# Example usage:
input_string = """Then are we all undone.
It is not possible, it cannot be,
The King should keep his word in loving us;
He will suspect us still, and find a time
To punish this offence in other faults.
Suspicion all our lives shall be stuck full of eyes,
For treason is but trusted like the fox,

"""
k_words_to_generate = 20

print(f"Generating {k_words_to_generate} words based on the prompt:")
print(input_string)

full_generated_text = generate_k_words(
    model,
    input_string,
    word2index,
    index2word,
    SEQUENCE_LENGTH,
    k_words_to_generate,
    unk_token='<unk>'
)

print("\nGenerated text:")
print(full_generated_text)

Generating 20 words based on the prompt:
Then are we all undone.
It is not possible, it cannot be,
The King should keep his word in loving us;
He will suspect us still, and find a time
To punish this offence in other faults.
Suspicion all our lives shall be stuck full of eyes,
For treason is but trusted like the fox,



Generated text:
Then are we all undone.
It is not possible, it cannot be,
The King should keep his word in loving us;
He will suspect us still, and find a time
To punish this offence in other faults.
Suspicion all our lives shall be stuck full of eyes,
For treason is but trusted like the fox,

 of the world of his kingdom and nourish the mere of a maid and jewels in her _ fiends and


In [None]:
# Example usage:
input_string = """
Tailor Google News to your interests and habits so you can quickly catch up on the news. Sign in. Learn more. Unlock the news you care about.
"""
k_words_to_generate = 50

print(f"Generating {k_words_to_generate} words based on the prompt:")
print(input_string)

full_generated_text = generate_k_words(
    model,
    input_string,
    word2index,
    index2word,
    SEQUENCE_LENGTH,
    k_words_to_generate,
    unk_token='<unk>'
)

print("\nGenerated text:")
print(full_generated_text)

Generating 50 words based on the prompt:

Tailor Google News to your interests and habits so you can quickly catch up on the news. Sign in. Learn more. Unlock the news you care about.


Generated text:

Tailor Google News to your interests and habits so you can quickly catch up on the news. Sign in. Learn more. Unlock the news you care about.
 to you and the world of his world and his prophecies to dover and amen to t into the ewer of the world of his kingdom and nourish the ewer of the world and his prophecies to dover bestow thorns of york and soldiers gardeners my lord alexas signior is


З курсу Л. Мороні

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = []
try:
    with open('sonnets.txt', 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line: # Only add non-empty lines
                sentences.append(line)
except FileNotFoundError:
    print("Error: 'sonnets.txt' not found.")

tokenizer = Tokenizer(num_words = 10000, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'і': 2, '—': 3, 'в': 4, 'не': 5, 'я': 6, 'ти': 7, 'що': 8, 'на': 9, 'й': 10, 'як': 11, 'та': 12, 'у': 13, 'до': 14, 'з': 15, 'мене': 16, 'тебе': 17, 'мені': 18, 'а': 19, 'ж': 20, 'мій': 21, 'за': 22, 'щоб': 23, 'любов': 24, 'коли': 25, 'від': 26, 'чи': 27, 'так': 28, 'б': 29, 'то': 30, 'тобі': 31, 'твій': 32, 'це': 33, 'вже': 34, 'моїх': 35, 'моя': 36, 'без': 37, 'вона': 38, 'для': 39, 'твоя': 40, 'твоїх': 41, 'час': 42, 'хто': 43, 'він': 44, 'бо': 45, 'лиш': 46, 'красу': 47, 'хай': 48, 'сам': 49, 'все': 50, 'знов': 51, 'про': 52, 'із': 53, 'краси': 54, 'їх': 55, 'зір': 56, 'всі': 57, 'твоєї': 58, 'них': 59, 'о': 60, 'ми': 61, 'твоє': 62, 'мої': 63, 'слова': 64, 'його': 65, 'ні': 66, 'її': 67, 'нехай': 68, 'серце': 69, 'по': 70, 'світ': 71, 'твого': 72, 'те': 73, 'дні': 74, 'той': 75, 'твою': 76, 'міг': 77, 'хоч': 78, 'гріх': 79, 'ті': 80, 'тоді': 81, 'себе': 82, 'серця': 83, 'вони': 84, 'нам': 85, 'життя': 86, 'над': 87, 'де': 88, 'колись': 89, 'там': 90, 'тільки': 91, 'д

In [None]:
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
sequences[:20]

[[406],
 [60, 1246, 1247, 1248],
 [22, 1249, 46, 407, 226],
 [47, 1250, 408, 1, 1251, 36, 1252, 2],
 [296, 1253, 630, 297],
 [18, 6, 3, 123, 298, 1254, 36, 409],
 [410, 92, 1255, 3, 171],
 [1, 14, 631, 227, 1256, 1257],
 [299, 632, 633, 172],
 [1258, 1259, 634],
 [1260, 228, 1261, 1262],
 [39, 1263, 173, 3, 1264],
 [635, 85, 300, 1265],
 [86, 411, 1266, 1, 194, 53],
 [636, 1267, 637, 1268],
 [1269],
 [10, 1270, 1271, 1272, 9, 1273],
 [153, 61, 1274, 1275],
 [18, 174, 412, 301, 4, 297],
 [1, 33, 1276, 1277, 1278, 2]]

In [None]:
text = tokenizer.sequences_to_texts(sequences[:20])

In [None]:
text

['1',
 'ми пагінці плекаємо кущів',
 'щоб відновить красу троянди ними',
 'хай кущ старий і всохне без дощів —',
 'весною пагін молодий цвістиме',
 'а ти в свій блиск закоханий без меж',
 'марнуєш скарб позичений в природи',
 'і з голоду серед достатку мреш',
 'жорстокий вороже своєї вроди',
 'провіснику весняної снаги',
 'коротких днів окрасо нетривала',
 'твоя самітність нині в ланцюги',
 'потік життя безжально закувала',
 'над світом змилуйся і дар краси',
 'новим вікам назустріч понеси',
 '2',
 'як сорок зим суворі й невмолимі',
 'чоло твоє поріжуть молоде',
 'а врода юності більш не цвістиме',
 'і вже зів’ялим листом опаде —']

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
pad_sequences?