# What happens before the DataLoader 
1.	Prepare Raw Data
2.	Define Tokenizer
3.	Build Vocabulary
4.	Create a Custom Dataset
5.	Write a Collate Function
6.	Then Create the DataLoader

## Raw Data and Custom Dataset


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader

# Toy corpus of three short sentences; the vocabulary and the dataset are
# both built from this list.
sentences = [
    "This is a correct sentence.",
    "This not grammar good.",
    "Is this sentence correct?",
]

class CustomDataset(Dataset):
    """Map-style dataset that serves stored sentences by position.

    No preprocessing happens here: each item is returned exactly as it was
    stored.
    """

    def __init__(self, sentences):
        # Keep a reference to the caller's collection; no copy is made.
        self.sentences = sentences

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        stored = self.sentences
        return stored[idx]

    def __len__(self):
        """Return the number of stored items."""
        stored = self.sentences
        return len(stored)

## Define Tokenizer + Build Vocab 
### Tokenization 
Convert raw text into a sequence of string tokens (words, subwords).

### Vocabulary Construction 
Convert each token into an integer index using a consistent mapping.
- `"basic_english"` lowercases the text and splits it into word and punctuation tokens
- `specials` includes `<pad>` for padding and `<unk>` for unknown tokens
- `set_default_index` ensures out-of-vocabulary (OOV) tokens map to `<unk>`

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Step 1: Create tokenizer
# "basic_english" selects the tokenizer by name; the result is a callable
# that is applied to one text at a time.
tokenizer = get_tokenizer(tokenizer="basic_english")

# Step 2: Define a generator that yields token lists
def yield_tokens(data_iter):
    """Lazily tokenize every text in ``data_iter``.

    Yields the result of calling the module-level ``tokenizer`` on each
    item, one item at a time and in iteration order.
    """
    for raw_text in data_iter:
        token_list = tokenizer(raw_text)
        yield token_list

# Step 3: Build vocab
# Register <pad> (padding) and <unk> (unknown token) as special entries next
# to the tokens collected from the corpus.
vocab = build_vocab_from_iterator(
    yield_tokens(sentences),
    specials=["<pad>", "<unk>"],
)
# Any token missing from the vocabulary falls back to the <unk> index.
vocab.set_default_index(vocab["<unk>"])


## Collate Function and Padding 
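The collate function tokenizes each sentence, converts the tokens to vocabulary indices, and pads the shorter sequences with the `<pad>` index so that all sequences in a batch have the same length and can be stacked into a single tensor.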


In [3]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Turn a batch of raw sentences into one padded tensor of token indices.

    Each sentence is tokenized with the module-level ``tokenizer`` and mapped
    to indices with the module-level ``vocab``. Shorter sequences are then
    right-padded with the ``<pad>`` index so the batch forms a single
    ``(batch, longest_sequence)`` tensor of dtype ``torch.long``.
    """
    sequences = []
    for sentence in batch:
        tokens = tokenizer(sentence)
        token_ids = vocab(tokens)
        sequences.append(torch.tensor(token_ids, dtype=torch.long))

    pad_index = vocab["<pad>"]
    # batch_first=True puts the batch dimension first in the result.
    return pad_sequence(sequences, batch_first=True, padding_value=pad_index)

## Data Loader 


In [4]:
# Wrap the raw sentences and serve them two at a time in shuffled order;
# collate_fn converts each batch of strings into one padded index tensor.
dataset = CustomDataset(sentences)
loader = DataLoader(
    dataset=dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)

# Print every batch produced by a single pass over the loader.
for batch in loader:
    print(batch)

tensor([[ 2, 11, 10,  9,  3,  0],
        [ 2,  5,  8,  4,  6,  3]])
tensor([[5, 2, 6, 4, 7]])
