# Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

In [2]:
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Data Processing

In [3]:
# Read the whole corpus into memory as a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("text.txt", "r", encoding="utf-8") as f:
    data = f.read()

In [4]:
# Truncate the corpus to its first 100,000 characters; every cell below works
# on this shortened text.
MAX_CHARS = 100_000
data = data[:MAX_CHARS]

In [5]:
# Character-level vocabulary: every distinct character of the corpus, sorted
# so that each character gets a stable position.
vocab = sorted(set(data))

print(f"Vocabulary -\n{vocab}\n")
print(f"Vocab Size - {len(vocab)}")

Vocabulary -
['\n', ' ', '!', '&', "'", ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

Vocab Size - 61


In [6]:
# Forward lookup: character -> its position in `vocab`.
char_to_idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> character, derived from the forward table.
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [7]:
# Contiguous split: the first 80% of the characters are used for training and
# the remaining 20% for validation.
TRAIN_FRACTION = 0.8
split_index = int(len(data) * TRAIN_FRACTION)

train_data, val_data = data[:split_index], data[split_index:]

# Dataset and DataLoader

In [8]:
class ShakespeareDataset(Dataset):
    """Character-level next-token dataset built from one string of text.

    Item ``i`` is the pair ``(context, target)``: ``context`` holds the encoded
    characters ``data[i : i + context_length]`` and ``target`` is the encoded
    character ``data[i + context_length]`` that immediately follows them.

    Args:
        data: Text to draw samples from.
        vocab: Ordered collection of every character that may occur in
            ``data``; a character's position in ``vocab`` is its integer id.
        context_length: Number of characters in each context window.

    Raises:
        KeyError: If ``data`` contains a character that is not in ``vocab``.
    """

    def __init__(self, data, vocab, context_length):

        self.data = data
        self.context_length = context_length

        self.vocab = vocab
        self.vocab_size = len(vocab)

        # Build the lookup from the vocab that was passed in, so the dataset
        # does not depend on a notebook-level `char_to_idx` global.
        self.char_to_idx = {char: i for i, char in enumerate(vocab)}

        # Encode the text once here instead of re-encoding a window on every
        # __getitem__ call.
        self.encoded = torch.tensor(
            [self.char_to_idx[char] for char in data], dtype=torch.long
        )


    def __len__(self):
        """Return the number of (context, target) pairs available."""
        # Valid start positions are 0 .. len(data) - context_length - 1 because
        # the target sits at index i + context_length. Clamp at zero so text
        # shorter than one window gives an empty dataset, not a negative length.
        return max(0, len(self.data) - self.context_length)


    def __getitem__(self, i):
        """Return ``(context, target)`` for the window starting at index ``i``.

        ``context`` is an int64 tensor with ``context_length`` entries and
        ``target`` is a 0-dim int64 tensor. An ``IndexError`` is raised when
        ``i + context_length`` is past the end of the text.
        """
        target_pos = i + self.context_length

        # Clone so callers receive independent tensors rather than views into
        # the shared encoded buffer.
        context = self.encoded[i:target_pos].clone()
        next_token = self.encoded[target_pos].clone()

        return context, next_token

In [9]:
# Window size and batch size shared by the training and validation pipelines.
context_length = 100
batch_size = 128

train_dataset = ShakespeareDataset(train_data, vocab, context_length=context_length)
val_dataset = ShakespeareDataset(val_data, vocab, context_length=context_length)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [10]:
# Pull a single batch from the training loader and display the shapes of the
# context batch `a` and the target batch `b`.
batch_iterator = iter(train_loader)
a, b = next(batch_iterator)

a.shape, b.shape

(torch.Size([128, 100]), torch.Size([128]))