In [None]:
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import Multi30k
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Sample English sentence; a later cell tokenizes it and maps it to vocabulary ids.
src = "An apple a day keeps the doctor away"
# Placeholder: no visible cell assigns or reads this again — NOTE(review): confirm it is still needed.
src_tensor = None


In [None]:
# spaCy-backed tokenizers, one per language of the translation pair.
# The named spaCy models (en_core_web_sm / de_core_news_sm) must be installed beforehand.
tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')
tokenizer_de = get_tokenizer('spacy', language='de_core_news_sm')

In [None]:
# Sanity check: tokenize a short English phrase and display the result.
tokenizer_en("a bag on the desk")

In [None]:
# Training split of Multi30k as (German, English) sentence pairs, with ".data" as the dataset root.
train_iter = Multi30k(root=".data", split='train', language_pair=('de', 'en'))


In [None]:
def yield_tokens(data_iter, tokenizer, language):
    """Lazily yield one tokenized sentence per sample of ``data_iter``.

    Args:
        data_iter: Iterable of samples; each sample is indexed with ``language``.
        tokenizer: Callable applied to the selected sentence.
        language: Index (or key) selecting the sentence inside a sample, e.g.
            0 for German and 1 for English when samples are (de, en) tuples.

    Yields:
        Whatever ``tokenizer`` returns for each selected sentence.
    """
    yield from (tokenizer(sample[language]) for sample in data_iter)

In [25]:
# Peek at the first training pair without walking the rest of the split
# (None if the split is empty).
first = next(iter(train_iter), None)
first



('Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'Two young, White males are outside near many bushes.')

In [None]:
# German (source) sentence of the sample pair.
first[0]

In [None]:
# English (target) sentence of the sample pair.
first[1]

In [None]:
# Tokenize the German sample sentence with the German tokenizer and display the tokens.
tokenizer_de(first[0])

In [None]:
# Build the vocabularies.
# Read the training split into a list first instead of reusing `train_iter`:
#   * the preview cell above has already advanced `train_iter`, and
#   * on torchtext releases where the dataset is a one-shot iterator, the German
#     pass would exhaust it and the English vocabulary would hold only the specials.
# A list can be walked once per language and always starts at the first pair.
SPECIAL_TOKENS = ['<unk>', '<pad>', '<bos>', '<eos>']
vocab_pairs = list(Multi30k(root=".data", split='train', language_pair=('de', 'en')))

# Index 0 of each pair is the German sentence, index 1 the English one.
# min_freq=2: tokens seen fewer than twice are left out of the vocabulary.
vocab_de = build_vocab_from_iterator(
    yield_tokens(vocab_pairs, tokenizer_de, language=0),
    specials=SPECIAL_TOKENS,
    min_freq=2
)
vocab_en = build_vocab_from_iterator(
    yield_tokens(vocab_pairs, tokenizer_en, language=1),
    specials=SPECIAL_TOKENS,
    min_freq=2
)

# Out-of-vocabulary tokens map to <unk>.
vocab_de.set_default_index(vocab_de['<unk>'])
vocab_en.set_default_index(vocab_en['<unk>'])

In [None]:
# Numericalize the sample sentence with the English vocabulary.
# Use the GPU only when one is present so this cell also runs on CPU-only machines.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
src_list = tokenizer_en(src)
src_ids = torch.tensor([vocab_en[token] for token in src_list], device=device)
src_ids

In [None]:
# List-comprehension refresher: double each of 0..4.
[2 * n for n in range(5)]

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Turn a list of (German, English) sentence pairs into two padded id tensors.

    Each sentence is tokenized, mapped to vocabulary ids and wrapped in
    <bos> ... <eos>. Sentences of one language are then padded with that
    language's <pad> id to the longest sentence in the batch.

    Args:
        batch: Iterable of ``(de_sentence, en_sentence)`` string pairs.

    Returns:
        ``(de_padded, en_padded)`` int64 tensors. ``pad_sequence`` is called
        with its default layout, so each is shaped (max_seq_len, batch_size).
    """
    def encode(sentence, tokenizer, vocab):
        # <bos> + token ids + <eos> as a 1-D int64 tensor.
        ids = [vocab['<bos>'], *vocab(tokenizer(sentence)), vocab['<eos>']]
        return torch.tensor(ids, dtype=torch.long)

    encoded_pairs = [
        (encode(de_text, tokenizer_de, vocab_de), encode(en_text, tokenizer_en, vocab_en))
        for de_text, en_text in batch
    ]
    de_sequences = [de_ids for de_ids, _ in encoded_pairs]
    en_sequences = [en_ids for _, en_ids in encoded_pairs]

    # Pad every sequence to the longest one in its language.
    return (
        pad_sequence(de_sequences, padding_value=vocab_de['<pad>']),
        pad_sequence(en_sequences, padding_value=vocab_en['<pad>']),
    )

In [None]:
BATCH_SIZE = 64
# Same dataset root as the first Multi30k(...) call in this notebook, so the
# data is not fetched and cached a second time under the library's default root.
DATA_ROOT = ".data"

# Reload the splits (the dataset iterator may only be traversable once).
train_iter = Multi30k(root=DATA_ROOT, split='train', language_pair=('de', 'en'))
valid_iter = Multi30k(root=DATA_ROOT, split='valid', language_pair=('de', 'en'))

train_loader = DataLoader(
    list(train_iter),  # convert to a list (Multi30k is an iterator), which shuffle=True needs
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch
)

# Validation data keeps its original order (no shuffling).
valid_loader = DataLoader(
    list(valid_iter),
    batch_size=BATCH_SIZE,
    collate_fn=collate_batch
)

In [26]:
# Inspect the tensor shapes of the first batch only.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    de, en = first_batch
    print(f"德语张量形状: {de.shape}")  # (seq_len, batch_size)
    print(f"英语张量形状: {en.shape}")

德语张量形状: torch.Size([24, 64])
英语张量形状: torch.Size([23, 64])
