In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Step 1: Toy corpus of French sentences (one string per sentence)
french_sentences = [
    "je suis étudiant",
    "bonjour",
    "j'aime apprendre le français",
    "le ciel est bleu",
    "c'est une belle journée",
    "merci beaucoup",
    "je vais bien",
    "où est la bibliothèque",
    "il fait froid aujourd'hui",
    "nous aimons voyager"
]

# Step 2: Order the corpus from fewest to most whitespace-separated words.
# The sort is stable, so sentences with equal word counts keep their
# original relative order.
french_sentences = sorted(
    french_sentences,
    key=lambda sentence: len(sentence.split()),
)

# Step 3: Build vocabulary (map word → index)
def build_vocab(sentences):
    """Map every whitespace-separated word in *sentences* to an integer index.

    Regular words are numbered from 2 upward in sorted order. Index 0 is
    reserved for ``"<PAD>"`` and index 1 for ``"<UNK>"``; both entries are
    inserted after the regular words, so they come last when the dict is
    iterated or printed.

    A word that is literally ``"<PAD>"`` or ``"<UNK>"`` is treated as the
    reserved token rather than as a regular word. Without that, it would
    consume a regular index that is then overwritten, leaving a hole in the
    index range and making the largest index exceed ``len(vocab) - 1``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        dict mapping each word (plus the two reserved tokens) to a unique
        index in ``range(len(vocab))``.
    """
    specials = {"<PAD>": 0, "<UNK>": 1}
    words = {word for sentence in sentences for word in sentence.split()}
    # Reserved tokens must not take part in the regular numbering.
    words.difference_update(specials)
    vocab = {
        word: idx
        for idx, word in enumerate(sorted(words), start=len(specials))
    }
    vocab.update(specials)
    return vocab

# Module-level word → index mapping built from the length-sorted corpus.
# It is passed to FrenchDataset and also read as a global by collate_fn
# to look up the padding index.
vocab = build_vocab(french_sentences)

# Step 4: Dataset class
class FrenchDataset(Dataset):
    """Dataset that serves each sentence as a 1-D tensor of vocabulary indices."""

    def __init__(self, sentences, vocab):
        # Only references are stored; tokenization happens in __getitem__.
        self.vocab = vocab
        self.sentences = sentences

    def __len__(self):
        """Return the number of sentences held by the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return sentence *idx* as a ``torch.long`` tensor of word indices.

        The sentence is split on whitespace; any word that is missing from
        the vocabulary is replaced by the ``"<UNK>"`` index.
        """
        word_to_id = self.vocab
        ids = []
        for word in self.sentences[idx].split():
            ids.append(word_to_id.get(word, word_to_id["<UNK>"]))
        return torch.tensor(ids, dtype=torch.long)

# Step 5: Custom collate function for padding
def collate_fn(batch):
    """Combine variable-length index tensors into one padded batch tensor.

    *batch* is the list of per-sentence tensors gathered by the DataLoader.
    Shorter sequences are filled up to the longest one with the ``"<PAD>"``
    index taken from the module-level ``vocab``; the batch dimension comes
    first in the result.
    """
    pad_id = vocab["<PAD>"]
    return pad_sequence(batch, padding_value=pad_id, batch_first=True)

# Step 6: Wrap the corpus in a Dataset and batch it with a DataLoader.
# No shuffle argument is given, so batches follow the length-sorted order
# of the sentences; collate_fn pads each batch to its own longest sentence.
dataset = FrenchDataset(french_sentences, vocab)
dataloader = DataLoader(
    dataset,
    batch_size=4,
    collate_fn=collate_fn,
)

# Step 7: Show the vocabulary, then every padded batch in turn
print("Vocabulary:", vocab)
print("\nBatches:")
for i, padded in enumerate(dataloader):
    print(f"\nBatch {i+1}:")
    print(padded)


Vocabulary: {'aimons': 2, 'apprendre': 3, "aujourd'hui": 4, 'beaucoup': 5, 'belle': 6, 'bibliothèque': 7, 'bien': 8, 'bleu': 9, 'bonjour': 10, "c'est": 11, 'ciel': 12, 'est': 13, 'fait': 14, 'français': 15, 'froid': 16, 'il': 17, "j'aime": 18, 'je': 19, 'journée': 20, 'la': 21, 'le': 22, 'merci': 23, 'nous': 24, 'où': 25, 'suis': 26, 'une': 27, 'vais': 28, 'voyager': 29, 'étudiant': 30, '<PAD>': 0, '<UNK>': 1}

Batches:

Batch 1:
tensor([[10,  0,  0],
        [23,  5,  0],
        [19, 26, 30],
        [19, 28,  8]])

Batch 2:
tensor([[24,  2, 29,  0],
        [18,  3, 22, 15],
        [22, 12, 13,  9],
        [11, 27,  6, 20]])

Batch 3:
tensor([[25, 13, 21,  7],
        [17, 14, 16,  4]])
