In [None]:
import torch
import torch.nn as nn
import tiktoken

In [2]:
data_path = "dataset/data.txt"

# Read the whole corpus into memory as one string.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could decode differently (or
# fail to decode) on another machine.
# NOTE(review): assumes the corpus file is UTF-8 -- confirm.
with open(data_path, "r", encoding="utf-8") as handle:
    data = handle.read()

In [3]:
# Context window size: number of consecutive tokens that form one training input.
n_features = 5

In [4]:
# Load tiktoken's "gpt2" encoding and turn the raw text into a sequence of token ids.
tokenizer = tiktoken.get_encoding("gpt2")
encoded_data = tokenizer.encode(data)
# Bare last expression: the notebook displays the total number of tokens.
len(encoded_data)

41921

In [5]:
# Cut the token stream into non-overlapping windows of `n_features` tokens.
# The token immediately after each window is that window's prediction target,
# so a window is only kept when such a following token exists
# (start + n_features < len(encoded_data)).
window_starts = range(0, len(encoded_data) - n_features, n_features)

# data: one row of token ids per window; targets: the token that follows each window.
data = torch.tensor(
    [encoded_data[start:start + n_features] for start in window_starts]
)
targets = torch.tensor(
    [encoded_data[start + n_features] for start in window_starts]
)

In [6]:
# Sanity check: display the shapes of the input windows and of their targets.
data.shape, targets.shape

(torch.Size([8384, 5]), torch.Size([8384]))

In [7]:
class SimpleRNN(nn.Module):
    """Next-token predictor: embedding -> RNN -> linear projection.

    Args:
        vocab_size: Number of distinct token ids; sizes both the embedding
            table and the output layer.
        embedding_dim: Width of each token embedding vector.
        hidden_dim: Width of the RNN hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the RNN consumes and produces (batch, seq, feature) tensors.
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return scores over the vocabulary for the token following each sequence.

        Args:
            x: Token ids, shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, vocab_size).
        """
        embedded = self.embedding(x)          # (batch_size, seq_len, embedding_dim)
        rnn_out, _ = self.rnn(embedded)       # (batch_size, seq_len, hidden_dim)
        last_step = rnn_out[:, -1, :]         # keep only the final time step
        return self.fc(last_step)             # (batch_size, vocab_size)

In [8]:
# Model sizes.
vocab_size = tokenizer.n_vocab  # vocabulary size reported by the tokenizer
embedding_dim = 50
hidden_dim = 128

# Optimizer step size.
learning_rate = 0.001

In [9]:
# Seed PyTorch's RNG before building the model so that weight initialisation
# (and therefore the loss curve below) is repeatable across kernel restarts.
torch.manual_seed(42)

# Network, classification loss over the vocabulary, and optimizer.
model = SimpleRNN(vocab_size,embedding_dim,hidden_dim)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate)

In [17]:
epochs = 100

# Put the model in training mode explicitly. The inference cell calls
# model.eval(), so re-running this cell afterwards would otherwise train a
# model that is still in eval mode.
model.train()

for epoch in range(epochs):
    # Full-batch step: one forward pass over every window at once.
    y_pred = model(data)
    loss = loss_fn(y_pred, targets)

    # Clear stale gradients, backpropagate, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 5.5384
Epoch 2, Loss: 5.5289
Epoch 3, Loss: 5.5195
Epoch 4, Loss: 5.5102
Epoch 5, Loss: 5.5007
Epoch 6, Loss: 5.4912
Epoch 7, Loss: 5.4815
Epoch 8, Loss: 5.4716
Epoch 9, Loss: 5.4618
Epoch 10, Loss: 5.4520
Epoch 11, Loss: 5.4424
Epoch 12, Loss: 5.4330
Epoch 13, Loss: 5.4239
Epoch 14, Loss: 5.4151
Epoch 15, Loss: 5.4067
Epoch 16, Loss: 5.3986
Epoch 17, Loss: 5.3907
Epoch 18, Loss: 5.3828
Epoch 19, Loss: 5.3748
Epoch 20, Loss: 5.3667
Epoch 21, Loss: 5.3584
Epoch 22, Loss: 5.3498
Epoch 23, Loss: 5.3412
Epoch 24, Loss: 5.3323
Epoch 25, Loss: 5.3234
Epoch 26, Loss: 5.3144
Epoch 27, Loss: 5.3052
Epoch 28, Loss: 5.2960
Epoch 29, Loss: 5.2865
Epoch 30, Loss: 5.2767
Epoch 31, Loss: 5.2666
Epoch 32, Loss: 5.2561
Epoch 33, Loss: 5.2454
Epoch 34, Loss: 5.2344
Epoch 35, Loss: 5.2234
Epoch 36, Loss: 5.2123
Epoch 37, Loss: 5.2013
Epoch 38, Loss: 5.1905
Epoch 39, Loss: 5.1796
Epoch 40, Loss: 5.1687
Epoch 41, Loss: 5.1577
Epoch 42, Loss: 5.1463
Epoch 43, Loss: 5.1346
Epoch 44, Loss: 5.12

In [33]:
input_seq = ["weather"]

# Tokenize every prompt word and flatten the ids into a single sequence,
# then add a leading batch dimension -> shape (1, seq_len).
input_indices = [
    token_id
    for word in input_seq
    for token_id in tokenizer.encode(word)
]
input_indices = torch.tensor(input_indices).unsqueeze(0)

# Inference only: eval mode, and no gradient tracking.
model.eval()
with torch.no_grad():
    output = model(input_indices)
    # Greedy choice: the id with the highest score.
    predicted_index = output.argmax(dim=-1).item()
    print(predicted_index)

# Map the predicted id back to text.
predicted_word = tokenizer.decode([predicted_index])

print("Predicted next word:", predicted_word)

531
Predicted next word:  said
