## A. Dataset Loading & Preprocessing

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from datasets import load_dataset
import re
import string
import time
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [17]:
import urllib.request
from pathlib import Path

# Tiny Shakespeare corpus published with the char-rnn project.
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
corpus_path = Path('shakespeare.txt')

# Download only when there is no usable local copy, so that re-running the
# notebook does not hit the network again.
if not corpus_path.exists() or corpus_path.stat().st_size == 0:
    urllib.request.urlretrieve(url, str(corpus_path))

raw_text = corpus_path.read_text(encoding='utf-8')
len(raw_text)

1115394

In [18]:
# Normalise the corpus: lower-case it, drop ASCII punctuation and digit runs,
# then collapse every whitespace run into a single space.
punctuation_table = str.maketrans('', '', string.punctuation)
digit_runs = re.compile(r'\d+')
whitespace_runs = re.compile(r'\s+')

text = raw_text.lower().translate(punctuation_table)
text = whitespace_runs.sub(' ', digit_runs.sub('', text)).strip()
len(text)

1053696

## Tokenization & Sequence Creation

In [19]:
# Word-level tokenisation: the cleaned text is split on spaces.
words = text.split()

# Vocabulary ids follow the alphabetical order of the distinct words.
unique_words = sorted(set(words))
vocab_size = len(unique_words)

# Build the id -> word table first, then invert it for word -> id lookups.
index_to_word = dict(enumerate(unique_words))
word_to_index = {token: token_id for token_id, token in index_to_word.items()}

# The whole corpus as a flat list of vocabulary ids.
encoded_text = list(map(word_to_index.__getitem__, words))

f'Vocabulary size: {vocab_size}'

'Vocabulary size: 12847'

In [20]:
# Number of context words the models see before predicting the next one.
sequence_length = 10

# Every window of `sequence_length` consecutive ids becomes one input row;
# the id that immediately follows the window is its target.
window_starts = range(len(encoded_text) - sequence_length)

X = np.array([encoded_text[start:start + sequence_length] for start in window_starts])
y = np.array([encoded_text[start + sequence_length] for start in window_starts])

X.shape, y.shape

((202609, 10), (202609,))

In [21]:
# Chronological split without shuffling: the first 80% of the windows are
# used for training, the remainder for validation.
split_ratio = 0.8
split_idx = int(len(X) * split_ratio)

train_rows = slice(None, split_idx)
val_rows = slice(split_idx, None)

X_train, y_train = X[train_rows], y[train_rows]
X_val, y_val = X[val_rows], y[val_rows]

f'Training set: {X_train.shape}, Validation set: {X_val.shape}'

'Training set: (162087, 10), Validation set: (40522, 10)'

## B. LSTM-based Language Model

In [22]:
# LSTM next-word model: embedding -> LSTM -> dense -> softmax over the vocabulary.
lstm_model = Sequential([
    # An explicit input spec replaces the deprecated `input_length` argument of
    # Embedding and lets Keras build the model immediately, so no separate
    # build() call is needed before summary().
    keras.Input(shape=(sequence_length,)),
    Embedding(vocab_size, 128),
    # Only the final hidden state is passed on to the dense layers.
    LSTM(256, return_sequences=False),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

# Targets are integer word ids, hence the sparse variant of cross-entropy.
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.summary()

In [None]:
# Train the LSTM and record how long fitting takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
lstm_start_time = time.perf_counter()
lstm_history = lstm_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=25, batch_size=64, verbose=0)
lstm_training_time = time.perf_counter() - lstm_start_time

f'LSTM Training Time: {lstm_training_time:.2f} seconds'

In [None]:
# Training vs. validation curves for the LSTM, one panel per metric.
lstm_curve_fig, lstm_curve_axes = plt.subplots(1, 2, figsize=(12, 4))

for panel, metric_key, metric_name in zip(lstm_curve_axes, ('loss', 'accuracy'), ('Loss', 'Accuracy')):
    panel.plot(lstm_history.history[metric_key], label=f'Training {metric_name}')
    panel.plot(lstm_history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(metric_name)
    panel.set_title(f'LSTM Model - {metric_name} Curves')
    panel.legend()
    panel.grid()

lstm_curve_fig.tight_layout()
plt.show()

In [None]:
# Predicted distribution over the vocabulary for every validation window.
lstm_val_predictions = lstm_model.predict(X_val, verbose=0)

# Cross-entropy is the negative log of the probability assigned to the TRUE
# next word (y_val), not of the largest predicted probability. Using the
# maximum would score the model on its own confidence and under-report the loss.
lstm_true_word_probs = lstm_val_predictions[np.arange(len(y_val)), y_val].astype(np.float64)
lstm_val_loss = np.mean(-np.log(lstm_true_word_probs + 1e-10))  # epsilon guards log(0)

# Perplexity is the exponential of the mean cross-entropy.
lstm_perplexity = np.exp(lstm_val_loss)

# Print both lines: only the last bare expression of a cell is displayed.
print(f'LSTM Validation Loss: {lstm_val_loss:.4f}')
print(f'LSTM Perplexity: {lstm_perplexity:.4f}')

In [None]:
def generate_text_lstm(start_phrase, num_words=8, temperature=0.7):
    """Extend a prompt word by word by sampling from the LSTM model.

    Args:
        start_phrase: Prompt text; it is lower-cased and split on whitespace.
        num_words: Number of words to append to the prompt.
        temperature: Sampling temperature (> 0). Values below 1 sharpen the
            predicted distribution, values above 1 flatten it.

    Returns:
        The lower-cased prompt followed by the generated words, joined by
        single spaces.

    Raises:
        ValueError: If the prompt contains no words or temperature is not
            positive.
    """
    words = start_phrase.lower().split()
    if not words:
        raise ValueError('start_phrase must contain at least one word')
    if temperature <= 0:
        raise ValueError('temperature must be a positive number')

    for _ in range(num_words):
        input_words = words[-sequence_length:]

        # Left-pad short contexts by repeating their first word until the
        # window has the length the model expects.
        padding = [input_words[0]] * (sequence_length - len(input_words))
        input_words = padding + input_words

        # Words missing from the vocabulary fall back to index 0.
        encoded_input = np.array([[word_to_index.get(word, 0) for word in input_words]])

        predictions = lstm_model.predict(encoded_input, verbose=0)[0]

        # Temperature scaling p ** (1 / T), done in float64 log space so that
        # tiny probabilities cannot underflow to an all-zero distribution.
        scaled_logs = np.log(np.clip(np.asarray(predictions, dtype=np.float64), 1e-12, None)) / temperature
        scaled_logs -= np.max(scaled_logs)
        predictions = np.exp(scaled_logs)
        predictions = predictions / np.sum(predictions)

        next_word_idx = np.random.choice(len(predictions), p=predictions)
        next_word = index_to_word.get(next_word_idx, 'unknown')
        words.append(next_word)

    return ' '.join(words)

## C. GRU-based Language Model

In [None]:
# GRU next-word model: same layout as the LSTM model, with a GRU recurrent layer.
gru_model = Sequential([
    # An explicit input spec replaces the deprecated `input_length` argument of
    # Embedding and lets Keras build the model immediately, so no separate
    # build() call is needed before summary().
    keras.Input(shape=(sequence_length,)),
    Embedding(vocab_size, 128),
    # Only the final hidden state is passed on to the dense layers.
    GRU(256, return_sequences=False),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

# Targets are integer word ids, hence the sparse variant of cross-entropy.
gru_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
gru_model.summary()

In [None]:
# Train the GRU and record how long fitting takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
gru_start_time = time.perf_counter()
gru_history = gru_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=25, batch_size=64, verbose=0)
gru_training_time = time.perf_counter() - gru_start_time

f'GRU Training Time: {gru_training_time:.2f} seconds'

In [None]:
# Training vs. validation curves for the GRU, one panel per metric.
gru_curve_fig, gru_curve_axes = plt.subplots(1, 2, figsize=(12, 4))

for panel, metric_key, metric_name in zip(gru_curve_axes, ('loss', 'accuracy'), ('Loss', 'Accuracy')):
    panel.plot(gru_history.history[metric_key], label=f'Training {metric_name}')
    panel.plot(gru_history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(metric_name)
    panel.set_title(f'GRU Model - {metric_name} Curves')
    panel.legend()
    panel.grid()

gru_curve_fig.tight_layout()
plt.show()

In [None]:
# Predicted distribution over the vocabulary for every validation window.
gru_val_predictions = gru_model.predict(X_val, verbose=0)

# Cross-entropy is the negative log of the probability assigned to the TRUE
# next word (y_val), not of the largest predicted probability. Using the
# maximum would score the model on its own confidence and under-report the loss.
gru_true_word_probs = gru_val_predictions[np.arange(len(y_val)), y_val].astype(np.float64)
gru_val_loss = np.mean(-np.log(gru_true_word_probs + 1e-10))  # epsilon guards log(0)

# Perplexity is the exponential of the mean cross-entropy.
gru_perplexity = np.exp(gru_val_loss)

# Print both lines: only the last bare expression of a cell is displayed.
print(f'GRU Validation Loss: {gru_val_loss:.4f}')
print(f'GRU Perplexity: {gru_perplexity:.4f}')

In [None]:
def generate_text_gru(start_phrase, num_words=8, temperature=0.7):
    """Extend a prompt word by word by sampling from the GRU model.

    Args:
        start_phrase: Prompt text; it is lower-cased and split on whitespace.
        num_words: Number of words to append to the prompt.
        temperature: Sampling temperature (> 0). Values below 1 sharpen the
            predicted distribution, values above 1 flatten it.

    Returns:
        The lower-cased prompt followed by the generated words, joined by
        single spaces.

    Raises:
        ValueError: If the prompt contains no words or temperature is not
            positive.
    """
    words = start_phrase.lower().split()
    if not words:
        raise ValueError('start_phrase must contain at least one word')
    if temperature <= 0:
        raise ValueError('temperature must be a positive number')

    for _ in range(num_words):
        input_words = words[-sequence_length:]

        # Left-pad short contexts by repeating their first word until the
        # window has the length the model expects.
        padding = [input_words[0]] * (sequence_length - len(input_words))
        input_words = padding + input_words

        # Words missing from the vocabulary fall back to index 0.
        encoded_input = np.array([[word_to_index.get(word, 0) for word in input_words]])

        predictions = gru_model.predict(encoded_input, verbose=0)[0]

        # Temperature scaling p ** (1 / T), done in float64 log space so that
        # tiny probabilities cannot underflow to an all-zero distribution.
        scaled_logs = np.log(np.clip(np.asarray(predictions, dtype=np.float64), 1e-12, None)) / temperature
        scaled_logs -= np.max(scaled_logs)
        predictions = np.exp(scaled_logs)
        predictions = predictions / np.sum(predictions)

        next_word_idx = np.random.choice(len(predictions), p=predictions)
        next_word = index_to_word.get(next_word_idx, 'unknown')
        words.append(next_word)

    return ' '.join(words)

## D. Comparison & Analysis

In [None]:
# Model sizes as reported by Keras.
lstm_params, gru_params = lstm_model.count_params(), gru_model.count_params()

# Validation metrics recorded after the last training epoch.
lstm_final_val_loss, lstm_final_accuracy = (
    lstm_history.history[key][-1] for key in ('val_loss', 'val_accuracy')
)
gru_final_val_loss, gru_final_accuracy = (
    gru_history.history[key][-1] for key in ('val_loss', 'val_accuracy')
)

# One row per metric: (label, LSTM value, GRU value).
comparison_rows = [
    ('Training Time (seconds)', f'{lstm_training_time:.2f}', f'{gru_training_time:.2f}'),
    ('Model Parameters', lstm_params, gru_params),
    ('Final Validation Loss', f'{lstm_final_val_loss:.4f}', f'{gru_final_val_loss:.4f}'),
    ('Final Validation Accuracy', f'{lstm_final_accuracy:.4f}', f'{gru_final_accuracy:.4f}'),
    ('Perplexity', f'{lstm_perplexity:.4f}', f'{gru_perplexity:.4f}'),
]
comparison_df = pd.DataFrame(comparison_rows, columns=['Metric', 'LSTM', 'GRU'])

comparison_df

In [None]:
# 2x2 dashboard: validation curves on the top row, bar charts on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

models = ['LSTM', 'GRU']
training_times = [lstm_training_time, gru_training_time]
perplexities = [lstm_perplexity, gru_perplexity]

# Top row: per-epoch validation loss and accuracy of both models.
curve_panels = (('val_loss', 'Validation Loss'), ('val_accuracy', 'Validation Accuracy'))
for ax, (history_key, metric_label) in zip(axes[0], curve_panels):
    for model_name, model_history in zip(models, (lstm_history, gru_history)):
        ax.plot(model_history.history[history_key], label=model_name, linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.set_title(f'{metric_label} Comparison')
    ax.legend()
    ax.grid()

# Bottom row: a single number per model.
bar_panels = (
    (training_times, 'Time (seconds)', 'Training Time Comparison'),
    (perplexities, 'Perplexity', 'Perplexity Comparison'),
)
for ax, (bar_values, y_label, panel_title) in zip(axes[1], bar_panels):
    ax.bar(models, bar_values, color=['blue', 'orange'])
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(axis='y')

plt.tight_layout()
plt.show()

## Text Generation Examples

In [None]:
test_prompts = ['the king', 'love is', 'we must']

# Each prompt is completed by the LSTM first and then by the GRU, so the two
# models draw from the random generator in the same interleaved order.
sample_pairs = [
    (
        generate_text_lstm(prompt, num_words=10, temperature=0.8),
        generate_text_gru(prompt, num_words=10, temperature=0.8),
    )
    for prompt in test_prompts
]

lstm_samples = [lstm_text for lstm_text, _ in sample_pairs]
gru_samples = [gru_text for _, gru_text in sample_pairs]

# Side-by-side table of the generated continuations.
results_df = pd.DataFrame({
    'Prompt': test_prompts,
    'LSTM Generated': lstm_samples,
    'GRU Generated': gru_samples
})

results_df

## Analysis & Conclusions

In [None]:
# Plain-text summary of the comparison. The "Faster", "Simpler" and "Better"
# verdicts are computed from the measured values; the last two sentences of the
# conclusion are fixed wording and are not derived from the results.
analysis = f"""
LSTM vs GRU ANALYSIS:

1. TRAINING TIME:
   LSTM: {lstm_training_time:.2f}s
   GRU: {gru_training_time:.2f}s
   Faster Model: {'GRU' if gru_training_time < lstm_training_time else 'LSTM'}

2. MODEL COMPLEXITY:
   LSTM Parameters: {lstm_params:,}
   GRU Parameters: {gru_params:,}
   Simpler Model: {'GRU' if gru_params < lstm_params else 'LSTM'} ({abs(lstm_params - gru_params):,} fewer params)

3. VALIDATION LOSS:
   LSTM: {lstm_final_val_loss:.4f}
   GRU: {gru_final_val_loss:.4f}
   Better: {'GRU' if gru_final_val_loss < lstm_final_val_loss else 'LSTM'}

4. VALIDATION ACCURACY:
   LSTM: {lstm_final_accuracy:.4f}
   GRU: {gru_final_accuracy:.4f}
   Better: {'GRU' if gru_final_accuracy > lstm_final_accuracy else 'LSTM'}

5. PERPLEXITY:
   LSTM: {lstm_perplexity:.4f}
   GRU: {gru_perplexity:.4f}
   Better: {'GRU' if gru_perplexity < lstm_perplexity else 'LSTM'} (lower is better)

CONCLUSION:
{'GRU outperforms LSTM' if gru_final_val_loss < lstm_final_val_loss else 'LSTM outperforms GRU'} on the Tiny Shakespeare dataset.
The GRU model is more computationally efficient with fewer parameters while maintaining comparable performance.
Both models successfully learn next-word prediction patterns from Shakespeare text.
"""

# A bare multi-line string is displayed as its repr (one line with literal
# "\n" escapes); printing renders the report with real line breaks.
print(analysis)