In [1]:
import torch
import torch.nn as nn
import math
import nltk
from nltk.corpus import gutenberg

In [2]:
# Download the Gutenberg corpus if not already downloaded
try:
    gutenberg.fileids()
except LookupError:
    nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /Users/surya/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [3]:
# Load the complete works of Shakespeare and preprocess
def create_shakespeare_dataset():
    all_words = []
    for fileid in gutenberg.fileids():
        if fileid.startswith("shakespeare-"): #Correct way to load Shakespeare texts
            words = [word.lower() for word in nltk.word_tokenize(gutenberg.raw(fileid)) if word.isalnum()]
            all_words.extend(words)
    return all_words

In [4]:
# Create the dataset
dataset = create_shakespeare_dataset()

In [5]:
dataset

['the',
 'tragedie',
 'of',
 'julius',
 'caesar',
 'by',
 'william',
 'shakespeare',
 '1599',
 'actus',
 'primus',
 'scoena',
 'prima',
 'enter',
 'flauius',
 'murellus',
 'and',
 'certaine',
 'commoners',
 'ouer',
 'the',
 'stage',
 'flauius',
 'hence',
 'home',
 'you',
 'idle',
 'creatures',
 'get',
 'you',
 'home',
 'is',
 'this',
 'a',
 'holiday',
 'what',
 'know',
 'you',
 'not',
 'being',
 'mechanicall',
 'you',
 'ought',
 'not',
 'walke',
 'vpon',
 'a',
 'labouring',
 'day',
 'without',
 'the',
 'signe',
 'of',
 'your',
 'profession',
 'speake',
 'what',
 'trade',
 'art',
 'thou',
 'car',
 'why',
 'sir',
 'a',
 'carpenter',
 'mur',
 'where',
 'is',
 'thy',
 'leather',
 'apron',
 'and',
 'thy',
 'rule',
 'what',
 'dost',
 'thou',
 'with',
 'thy',
 'best',
 'apparrell',
 'on',
 'you',
 'sir',
 'what',
 'trade',
 'are',
 'you',
 'cobl',
 'truely',
 'sir',
 'in',
 'respect',
 'of',
 'a',
 'fine',
 'workman',
 'i',
 'am',
 'but',
 'as',
 'you',
 'would',
 'say',
 'a',
 'cobler',
 'mu

In [6]:
len(dataset)

67042

In [7]:
# Create a vocabulary and word-to-index mapping
vocab = sorted(list(set(dataset)))
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}
vocab_size = len(vocab)

In [8]:
vocab

['1',
 '1599',
 '1603',
 '2',
 '3',
 '4',
 'a',
 'abhominably',
 'abhorred',
 'abide',
 'abilitie',
 'abiure',
 'abler',
 'aboord',
 'aboue',
 'abound',
 'about',
 'abridgements',
 'abroad',
 'absence',
 'absent',
 'absolute',
 'abstinence',
 'abstracts',
 'absurd',
 'abus',
 'abuse',
 'abuses',
 'accent',
 'accents',
 'accepts',
 'accesse',
 'accident',
 'accidentall',
 'accompany',
 'accompt',
 'accord',
 'according',
 'account',
 'accounted',
 'accoutred',
 'accursed',
 'accurst',
 'accuse',
 'accust',
 'accustom',
 'acheron',
 'acquaint',
 'acquainted',
 'acquire',
 'acquittance',
 'acrosse',
 'act',
 'acte',
 'acted',
 'acting',
 'action',
 'actions',
 'actiuely',
 'actor',
 'actors',
 'acts',
 'actuall',
 'actus',
 'adaies',
 'adam',
 'adams',
 'addage',
 'adde',
 'added',
 'adder',
 'adders',
 'addicted',
 'addition',
 'addresse',
 'addrest',
 'adhere',
 'adheres',
 'adieu',
 'adiew',
 'adioyn',
 'admir',
 'admirable',
 'admiration',
 'admit',
 'admittance',
 'adoption',
 'adore

In [9]:
vocab_size

7319

In [10]:
# Example hyperparameters (adjust as needed)
embedding_dim = 512
sequence_length = 10  # Example sequence length

In [11]:
class WordEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(WordEmbedding, self).__init__()
        # Create an embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
    
    def forward(self, x):
        # Look up the embedding for each index in the input
        return self.embedding(x)


In [12]:
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Create a positional encoding matrix with shape (max_len, embedding_dim)
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)  # Apply sin to even indices in the embedding
        pe[:, 1::2] = torch.cos(position * div_term)  # Apply cos to odd indices in the embedding
        pe = pe.unsqueeze(1)  # Add a batch dimension
        self.register_buffer('pe', pe)  # Register pe as a buffer, so it is not updated during training

    def forward(self, x):
        # x: (seq_len, batch_size, embedding_dim)
        # Add the positional encoding to the input embeddings
        x = x + self.pe[:x.size(0), :]  # Only add the positional encodings up to the input length
        return x


In [13]:
class LearnedPositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len=5000):
        super(LearnedPositionalEncoding, self).__init__()
        # Create an embedding layer to learn positional encodings
        self.embedding = nn.Embedding(max_len, embedding_dim)

    def forward(self, x):
        # Get the position indices (0, 1, 2, ..., seq_len-1) for the input sequence
        position = torch.arange(0, x.size(0), dtype=torch.long, device=x.device).unsqueeze(1)
        # Add the learned positional encodings to the input embeddings
        x = x + self.embedding(position)
        return x


In [14]:
# --- Example Usage with a Single Sequence ---
# Take the first 'sequence_length' words as an example sequence
example_sequence_words = dataset[:sequence_length]
example_sequence_indices = [word_to_index[word] for word in example_sequence_words]
input_sequence = torch.tensor(example_sequence_indices).unsqueeze(1)  # Shape (seq_len, 1) - Single Batch


In [15]:
# Word Embedding
word_embedding = WordEmbedding(vocab_size, embedding_dim)
embedded_sequence = word_embedding(input_sequence)

In [16]:
word_embedding

WordEmbedding(
  (embedding): Embedding(7319, 512)
)

In [17]:
embedded_sequence

tensor([[[ 0.5109, -0.7915, -0.3427,  ..., -0.4239, -0.0114,  1.1026]],

        [[ 1.2033, -1.1024, -1.2348,  ...,  2.5744,  1.2959,  0.2264]],

        [[ 1.6201, -0.2408,  0.5797,  ...,  0.4017,  0.2863, -0.0556]],

        ...,

        [[-0.0035,  1.4774, -1.1439,  ...,  1.2800, -0.4082, -1.2408]],

        [[-0.1194, -1.2127,  0.4956,  ..., -1.4009, -1.8121, -0.8494]],

        [[-0.0925, -0.1867,  0.7836,  ..., -0.2156, -0.1570, -0.8315]]],
       grad_fn=<EmbeddingBackward0>)

In [18]:
embedded_sequence.shape

torch.Size([10, 1, 512])

In [19]:
# Sinusoidal Positional Encoding
positional_encoding = PositionalEncoding(embedding_dim)
encoded_sequence_sin = positional_encoding(embedded_sequence)

In [21]:
encoded_sequence_sin

tensor([[[ 0.5109,  0.2085, -0.3427,  ...,  0.5761, -0.0114,  2.1026]],

        [[ 2.0448, -0.5621, -0.4129,  ...,  3.5744,  1.2960,  1.2264]],

        [[ 2.5294, -0.6569,  1.5161,  ...,  1.4017,  0.2865,  0.9444]],

        ...,

        [[ 0.6535,  2.2313, -0.6915,  ...,  2.2800, -0.4075, -0.2408]],

        [[ 0.8700, -1.3582,  1.4863,  ..., -0.4009, -1.8113,  0.1506]],

        [[ 0.3196, -1.0978,  1.4600,  ...,  0.7844, -0.1561,  0.1685]]],
       grad_fn=<AddBackward0>)

In [22]:
encoded_sequence_sin.shape

torch.Size([10, 1, 512])

In [23]:
# Learned Positional Encoding
learned_positional_encoding = LearnedPositionalEncoding(embedding_dim)
encoded_sequence_learned = learned_positional_encoding(embedded_sequence)

In [24]:
encoded_sequence_learned

tensor([[[ 0.5274, -1.2473, -0.1996,  ..., -0.6185, -0.6949,  1.1423]],

        [[ 1.8848, -1.4726,  1.6219,  ...,  4.0031,  0.6432,  0.0667]],

        [[ 1.4360,  2.0776,  0.6666,  ...,  0.9790,  0.4055,  0.3919]],

        ...,

        [[ 0.9970, -0.5662, -0.0800,  ..., -0.2291,  1.1091, -1.0232]],

        [[-0.4327, -1.4443, -1.1171,  ..., -2.0791, -4.4312, -1.3945]],

        [[-1.1799,  0.3968,  0.6103,  ..., -0.3741, -0.0804, -0.6231]]],
       grad_fn=<AddBackward0>)

In [25]:
encoded_sequence_learned.shape

torch.Size([10, 1, 512])

In [26]:
print("Input Sequence Shape:", input_sequence.shape)
print("Embedded Sequence Shape:", embedded_sequence.shape)
print("Encoded Sequence (Sinusoidal) Shape:", encoded_sequence_sin.shape)
print("Encoded Sequence (Learned) Shape:", encoded_sequence_learned.shape)


Input Sequence Shape: torch.Size([10, 1])
Embedded Sequence Shape: torch.Size([10, 1, 512])
Encoded Sequence (Sinusoidal) Shape: torch.Size([10, 1, 512])
Encoded Sequence (Learned) Shape: torch.Size([10, 1, 512])


In [27]:
# Verify the positional encodings are different for different positions
print("\nVerifying Positional Encodings:")

# Sinusoidal
print("Sinusoidal PE for position 0:\n", positional_encoding.pe[:1, 0, :5])
print("Sinusoidal PE for position 1:\n", positional_encoding.pe[1:2, 0, :5])

# Learned
print("Learned PE for position 0:\n", learned_positional_encoding.embedding(torch.tensor([0])).detach().numpy()[:5])
print("Learned PE for position 1:\n", learned_positional_encoding.embedding(torch.tensor([1])).detach().numpy()[:5])

print("\nVocabulary Size:", vocab_size)

print("\nExample Input Sequence (Words):", example_sequence_words)
print("Example Input Sequence (Indices):", example_sequence_indices)


Verifying Positional Encodings:
Sinusoidal PE for position 0:
 tensor([[0., 1., 0., 1., 0.]])
Sinusoidal PE for position 1:
 tensor([[0.8415, 0.5403, 0.8219, 0.5697, 0.8020]])
Learned PE for position 0:
 [[ 1.65731534e-02 -4.55783874e-01  1.43099353e-01  3.69202763e-01
   1.24040949e+00  8.61730054e-02 -2.07117486e+00  8.82206380e-01
   1.22090900e+00 -4.27254707e-01 -1.36042908e-01  1.16018236e+00
  -3.45689863e-01 -1.59403726e-01 -1.25040090e+00 -3.69148880e-01
   2.60806352e-01  7.64791429e-01 -5.96708894e-01  4.34388638e-01
   2.69163460e-01  9.73690271e-01  1.40994883e+00 -1.16771746e+00
  -1.12309384e+00 -8.70886892e-02  2.88274795e-01  5.79026163e-01
   2.03697944e+00 -4.90067005e-01 -1.48748267e+00  1.39848423e+00
  -9.46574271e-01  1.60183012e+00  2.00410083e-01 -1.71269691e+00
  -1.30482507e+00 -8.08316693e-02  1.49743736e+00  5.52731812e-01
   1.34137428e+00  2.99143672e-01  1.33821452e+00  1.35253334e+00
  -4.68201667e-01  1.34804499e+00  4.29636329e-01 -3.59175764e-02
  -