In [1]:
import torch
import math
import spacy
from torch import nn

from torch import optim
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence

In [2]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine transform.

    Args:
        d_model: size of the last (feature) dimension; ``gamma`` and ``beta``
            are vectors of this length.
        eps: constant added to the variance before taking the square root, to
            avoid division by zero.
    """

    def __init__(self, d_model, eps=1e-12):
        super().__init__()
        # Scale starts at one and shift at zero, so the affine part is initially the identity.
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        """Return ``gamma * (x - mean) / sqrt(var + eps) + beta`` computed along dim -1."""
        # Statistics are taken over the last dimension only; the biased
        # (population) variance is used.
        centered = x - x.mean(-1, keepdim=True)
        variance = x.var(-1, unbiased=False, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.eps)
        return self.gamma * normalized + self.beta

In [3]:
class ScaleDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k)) @ v``."""

    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        """Attend from ``q`` to ``k``/``v``.

        Args:
            q, k, v: query, key and value tensors. ``k`` must be 4-D; its last
                size is the per-head width used for scaling, and its last two
                dimensions are swapped to form the score matrix.
            mask: optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are filled with -10000 before the softmax.
            e: unused; kept so existing call sites remain valid.

        Returns:
            ``(context, weights)``: the attention-weighted values and the
            softmax-normalized attention weights.
        """
        # Unpacking four sizes enforces that `k` is 4-D; only the last one is needed.
        _, _, _, d_tensor = k.size()

        weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(d_tensor)
        if mask is not None:
            # A large negative score drives the softmax weight of masked positions to ~0.
            weights = weights.masked_fill(mask == 0, -10000)
        weights = self.softmax(weights)

        context = torch.matmul(weights, v)
        return context, weights

In [4]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature size.
        hidden: width of the intermediate layer.
        drop_prob: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Expand to ``hidden`` features, apply ReLU and dropout, project back to ``d_model``."""
        expanded = self.relu(self.linear1(x))
        return self.linear2(self.dropout(expanded))

In [5]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``ScaleDotProductAttention``.

    Args:
        n_head: number of attention heads.
        d_model: feature size of the inputs and of the output; it is divided
            evenly among the heads in ``split``.
    """

    def __init__(self, n_head, d_model):
        super().__init__()
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        # Output projection applied after the heads are merged back together.
        self.w_concat = nn.Linear(d_model, d_model)

    def split(self, tensor):
        """Reshape ``(batch, length, d_model)`` into ``(batch, n_head, length, d_model // n_head)``."""
        batch_size, length, d_model = tensor.size()
        per_head = d_model // self.n_head
        heads = tensor.view(batch_size, length, self.n_head, per_head)
        return heads.transpose(1, 2)

    def concat(self, tensor):
        """Inverse of ``split``: merge ``(batch, head, length, d_tensor)`` into ``(batch, length, head * d_tensor)``."""
        batch_size, head, length, d_tensor = tensor.size()
        # transpose() returns a non-contiguous view; view() needs contiguous memory.
        merged = tensor.transpose(1, 2).contiguous()
        return merged.view(batch_size, length, head * d_tensor)

    def forward(self, q, k, v, mask=None):
        """Project q/k/v, attend per head, merge the heads and apply the output projection.

        The attention weights returned by the inner attention module are discarded.
        """
        q = self.split(self.w_q(q))
        k = self.split(self.w_k(k))
        v = self.split(self.w_v(v))

        context, _ = self.attention(q, k, v, mask=mask)

        return self.w_concat(self.concat(context))

In [6]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding table.

    Even feature columns hold ``sin(pos / 10000^(2i / d_model))`` and odd
    columns hold the matching cosine.

    Args:
        d_model: feature size of the encoding (odd values are supported).
        max_len: number of positions to precompute.
        device: device on which the table is initially created.
    """

    def __init__(self, d_model, max_len, device):
        super().__init__()

        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        # One angle column per even feature index: (max_len, ceil(d_model / 2)).
        angles = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros((max_len, d_model), device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # the angle table is trimmed to match instead of raising a shape error.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a non-persistent buffer: it follows the module through
        # .to()/.cuda()/dtype casts but is neither trained nor added to state_dict.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the first ``seq_len`` rows of the table for a 2-D ``(batch, seq_len)`` input.

        The result has shape ``(seq_len, d_model)`` and is meant to be
        broadcast-added to the token embeddings by the caller.
        """
        batch_size, seq_len = x.size()
        return self.encoding[:seq_len, :]

In [7]:
class TokenEmbedding(nn.Embedding):
    """Token embedding table whose padding row is fixed at zero.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding dimension.
        padding_idx: index of the padding token. Defaults to 1, the value that
            was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, vocab_size, d_model, padding_idx=1):
        super().__init__(vocab_size, d_model, padding_idx=padding_idx)

In [8]:
class TransformerEmbedding(nn.Module):
    """Token embedding plus positional encoding, followed by dropout.

    Args:
        vocab_size: vocabulary size handed to ``TokenEmbedding``.
        d_model: embedding dimension.
        max_len: number of positions handed to ``PositionalEncoding``.
        drop_prob: dropout probability applied to the summed embedding.
        device: device handed to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Embed the token ids in ``x``, add the positional encoding and apply dropout."""
        # Both sub-modules receive the raw ids; the positional module only uses their shape.
        token_part = self.tok_emb(x)
        position_part = self.pos_emb(x)
        summed = token_part + position_part
        return self.drop_out(summed)

In [9]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each with dropout, residual and LayerNorm.

    Args:
        d_model: feature size of the block.
        ffn_hidden: hidden width of the feed-forward sub-layer.
        n_head: number of attention heads.
        drop_prob: dropout probability used throughout the block.
    """

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, src_mask):
        """Apply the block to ``x``; ``src_mask`` is passed to self-attention as its mask."""
        # Sub-layer 1: self-attention (q = k = v = x), dropout, residual add, norm.
        attended = self.dropout1(self.attention(q=x, k=x, v=x, mask=src_mask))
        x = self.norm1(attended + x)

        # Sub-layer 2: position-wise feed-forward, dropout, residual add, norm.
        transformed = self.dropout2(self.ffn(x))
        return self.norm2(transformed + x)
         
        

In [10]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention and feed-forward.

    Every sub-layer is followed by dropout, a residual connection and LayerNorm.

    Args:
        d_model: feature size of the block.
        ffn_hidden: hidden width of the feed-forward sub-layer.
        n_head: number of attention heads.
        drop_prob: dropout probability used throughout the block.
    """

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.norm1 = LayerNorm(d_model=d_model)

        self.dec_enc_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.dropout2 = nn.Dropout(p=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.dropout3 = nn.Dropout(p=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)

    def forward(self, dec, enc, trg_mask, src_mask):
        """Run the block on decoder state ``dec``.

        Args:
            dec: decoder input / previous decoder layer output.
            enc: encoder output, or ``None`` to skip encoder-decoder attention.
            trg_mask: mask passed to the self-attention sub-layer.
            src_mask: mask passed to the encoder-decoder attention sub-layer.
        """
        # Sub-layer 1: self-attention over the decoder sequence.
        self_attended = self.dropout1(self.attention(q=dec, k=dec, v=dec, mask=trg_mask))
        x = self.norm1(self_attended + dec)

        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        if enc is not None:
            cross_attended = self.dropout2(
                self.dec_enc_attention(q=x, k=enc, v=enc, mask=src_mask)
            )
            x = self.norm2(cross_attended + x)

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.dropout3(self.ffn(x))
        return self.norm3(transformed + x)
            
        

In [11]:
class Encoder(nn.Module):
    """Transformer encoder: an embedding followed by a stack of ``EncoderLayer`` blocks.

    Args:
        enc_voc_size: source vocabulary size.
        max_len: maximum sequence length for the positional encoding.
        d_model: model feature size.
        ffn_hidden: hidden width of each layer's feed-forward sub-layer.
        n_head: number of attention heads per layer.
        n_layers: number of stacked encoder layers.
        drop_prob: dropout probability.
        device: device handed to the embedding's positional encoding.
    """

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(
            vocab_size=enc_voc_size,
            d_model=d_model,
            max_len=max_len,
            drop_prob=drop_prob,
            device=device,
        )
        self.layers = nn.ModuleList(
            [
                EncoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob)
                for _ in range(n_layers)
            ]
        )

    def forward(self, x, src_mask):
        """Embed ``x`` and pass it through every encoder layer in order.

        Args:
            x: source token ids.
            src_mask: mask forwarded unchanged to each layer.

        Returns:
            The output of the last layer (or the embedding itself when there
            are no layers).
        """
        x = self.emb(x)
        for layer in self.layers:
            x = layer(x, src_mask)
        # The return must sit outside the loop: returning inside it stopped
        # after the first layer and yielded None when the stack was empty.
        return x

In [12]:
class Decoder(nn.Module):
    """Transformer decoder: embedding, a stack of ``DecoderLayer`` blocks and a vocabulary projection.

    Args:
        dec_voc_size: target vocabulary size (also the output width).
        max_len: maximum sequence length for the positional encoding.
        d_model: model feature size.
        ffn_hidden: hidden width of each layer's feed-forward sub-layer.
        n_head: number of attention heads per layer.
        n_layers: number of stacked decoder layers.
        drop_prob: dropout probability.
        device: device handed to the embedding's positional encoding.
    """

    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super().__init__()
        self.emb = TransformerEmbedding(
            d_model=d_model,
            drop_prob=drop_prob,
            max_len=max_len,
            vocab_size=dec_voc_size,
            device=device,
        )

        blocks = [
            DecoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob)
            for _ in range(n_layers)
        ]
        self.layers = nn.ModuleList(blocks)

        # Final projection from model features to one score per target-vocabulary entry.
        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Embed ``trg``, run every decoder layer against ``enc_src`` and project to vocabulary scores."""
        hidden = self.emb(trg)
        for block in self.layers:
            hidden = block(hidden, enc_src, trg_mask, src_mask)

        return self.linear(hidden)

In [13]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence tasks.

    Args:
        src_pad_idx: padding token id in the source vocabulary.
        trg_pad_idx: padding token id in the target vocabulary.
        trg_sos_idx: start-of-sequence token id in the target vocabulary
            (stored on the instance; not used by the methods below).
        enc_voc_size: source vocabulary size.
        dec_voc_size: target vocabulary size.
        d_model: model feature size.
        n_head: number of attention heads.
        max_len: maximum sequence length for the positional encodings.
        ffn_hidden: hidden width of the feed-forward sub-layers.
        n_layers: number of encoder layers and of decoder layers.
        drop_prob: dropout probability.
        device: device handed to the encoder and decoder.
    """

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len, ffn_hidden, n_layers, drop_prob, device):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.device = device
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers,
                               device=device)

    def forward(self, src, trg):
        """Encode ``src`` and decode ``trg`` against it.

        Args:
            src: source token ids, ``(batch, src_len)``.
            trg: target token ids, ``(batch, trg_len)``.

        Returns:
            Whatever the decoder returns for these inputs.
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output

    def make_src_mask(self, src):
        """Return a bool mask of shape ``(batch, 1, 1, src_len)`` that is False at padding tokens."""
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """Return a bool mask of shape ``(batch, 1, trg_len, trg_len)``.

        An entry is True only when its row position is not padding and its
        column is at or before its row (lower-triangular, i.e. no look-ahead).
        """
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]
        # Build the causal mask directly on the input's device as bool. The
        # previous version went through the legacy CPU ByteTensor type and then
        # copied to self.device, which could differ from the input's device.
        trg_sub_mask = torch.ones(trg_len, trg_len, device=trg.device).tril().bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

In [14]:
# Model hyper-parameters. max_len, d_model, n_layers, n_heads, ffn_hidden and
# drop_prob are passed to Transformer(...) when the model is instantiated.
# NOTE(review): batch_size is not read by the data-loader cell, which defines
# its own BATCH_SIZE constant.
batch_size = 128
max_len = 256
d_model = 512
n_layers = 6
n_heads = 8
ffn_hidden = 2048
drop_prob = 0.1

# Optimizer / scheduler / training-loop settings. None of these are referenced
# in the cells shown here; presumably they are intended for a training loop
# that is not part of this notebook yet. TODO confirm.
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 1000
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"device:{device}")

device:cuda:0


In [15]:
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

In [16]:
# spaCy-backed tokenizers for English and German, loaded by pipeline name
# ('en_core_web_sm' / 'de_core_news_sm'); both spaCy pipelines must be installed.
tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')
tokenizer_de = get_tokenizer('spacy', language='de_core_news_sm')

  _C._set_default_tensor_type(t)


In [17]:
# Multi30k training split requested as ('de', 'en') pairs, with ".data" as the root directory.
train_iter = Multi30k(root=".data", split='train', language_pair=('de', 'en'))

In [18]:
def yield_tokens(data_iter, tokenizer, language):
    """Lazily tokenize one field of every sample in ``data_iter``.

    Args:
        data_iter: iterable of indexable samples (e.g. ``(de, en)`` tuples).
        tokenizer: callable applied to the selected field.
        language: index (or key) of the field to tokenize in each sample.

    Yields:
        ``tokenizer(sample[language])`` for each sample, in order.
    """
    yield from (tokenizer(sample[language]) for sample in data_iter)

In [19]:
# Build the vocabularies: German from field 0 and English from field 1 of the
# training pairs. Tokens must occur at least twice (min_freq=2) to be included,
# and the four special tokens are listed explicitly.
vocab_de = build_vocab_from_iterator(
    yield_tokens(train_iter, tokenizer_de, language=0),
    specials=['<unk>', '<pad>', '<bos>', '<eos>'],
    min_freq=2
)
vocab_en = build_vocab_from_iterator(
    yield_tokens(train_iter, tokenizer_en, language=1),
    specials=['<unk>', '<pad>', '<bos>', '<eos>'],
    min_freq=2
)

# Map out-of-vocabulary tokens to the '<unk>' index instead of raising.
vocab_de.set_default_index(vocab_de['<unk>'])
vocab_en.set_default_index(vocab_en['<unk>'])



In [20]:
# Special-token ids, hard-coded. They match the positions of '<pad>' (1) and
# '<bos>' (2) in the `specials` list used to build the vocabularies, assuming
# the specials are inserted at the front of the vocabulary. TODO confirm.
src_pad_idx = 1
trg_pad_idx = 1
trg_sos_idx = 2
# The encoder is sized for the English vocabulary and the decoder for the
# German one, i.e. the model is set up for English -> German.
enc_voc_size = len(vocab_en)
dec_voc_size = len(vocab_de)
model = Transformer(src_pad_idx=src_pad_idx,trg_pad_idx=trg_pad_idx,trg_sos_idx=trg_sos_idx,
                    d_model=d_model,enc_voc_size=enc_voc_size, 
                    dec_voc_size=dec_voc_size,max_len=max_len,ffn_hidden=ffn_hidden,n_head=n_heads,
                    n_layers=n_layers, drop_prob=drop_prob,device=device).to(device)

# come here

In [21]:
# The loss is computed against target-side (decoder) tokens, so the ignored
# label must be the target padding index. It currently equals src_pad_idx, but
# using the source constant here would silently break if the two ever diverged.
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

In [32]:
def collate_batch(batch):
    """Turn a list of ``(de, en)`` sentence pairs into two padded id tensors.

    Each sentence is tokenized, mapped to vocabulary ids and wrapped in
    ``<bos>`` / ``<eos>``. Sentences are then right-padded with ``<pad>`` to the
    longest sentence in the batch.

    Args:
        batch: iterable of ``(german_text, english_text)`` pairs.

    Returns:
        ``(de_padded, en_padded)``: long tensors shaped ``(batch, max_len_in_batch)``.
    """

    def _encode(text, vocab, tokenizer):
        # <bos> + token ids + <eos>, as a 1-D long tensor.
        ids = [vocab['<bos>']] + vocab(tokenizer(text)) + [vocab['<eos>']]
        return torch.tensor(ids, dtype=torch.long)

    de_batch, en_batch = [], []
    for de_text, en_text in batch:
        de_batch.append(_encode(de_text, vocab_de, tokenizer_de))
        en_batch.append(_encode(en_text, vocab_en, tokenizer_en))

    # Pad every sequence to the same length, batch dimension first.
    de_padded = pad_sequence(de_batch, padding_value=vocab_de['<pad>'], batch_first=True)
    en_padded = pad_sequence(en_batch, padding_value=vocab_en['<pad>'], batch_first=True)

    return de_padded, en_padded

In [33]:
# Batch size used by both loaders below.
BATCH_SIZE = 128

# Reload the dataset (the original author's note: the iterator can only be traversed once).
train_iter = Multi30k(split='train', language_pair=('de', 'en'))
valid_iter = Multi30k(split='valid', language_pair=('de', 'en'))

# Training loader: shuffled, batches assembled by collate_batch.
train_loader = DataLoader(
    list(train_iter),  # materialize as a list (Multi30k is an iterator)
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch
)

# Validation loader: same batching, original order.
valid_loader = DataLoader(
    list(valid_iter),
    batch_size=BATCH_SIZE,
    collate_fn=collate_batch
)

In [44]:
# Average cross-entropy of the current model over the training set.
# No backward pass or optimizer step happens here, so the loop runs under
# torch.no_grad() to avoid building an autograd graph for every batch.
# NOTE(review): the model is left in its current mode; if it is in training
# mode, dropout is active and the reported loss is stochastic.
losssum=0
with torch.no_grad():
    for de, en in train_loader:
        # Both tensors are (batch_size, seq_len): collate_batch pads with batch_first=True.
        print(f"德语张量形状: {de.shape}")
        print(f"英语张量形状: {en.shape}")
        # English is the source and German the target.
        trg=de.to(device)
        src=en.to(device)
        print(trg[:, :-1].shape)
        # The decoder input drops the last token; the labels drop the first one.
        output = model(src, trg[:, :-1])
        #print(output)
        loss = criterion(
                    output.reshape(-1, output.shape[2]),
                    trg[:, 1:].reshape(-1)
                )
        #print(loss)
        losssum = losssum+loss.item()
print(losssum/len(train_loader))

德语张量形状: torch.Size([128, 27])
英语张量形状: torch.Size([128, 24])
torch.Size([128, 26])
德语张量形状: torch.Size([128, 28])
英语张量形状: torch.Size([128, 29])
torch.Size([128, 27])
德语张量形状: torch.Size([128, 29])
英语张量形状: torch.Size([128, 26])
torch.Size([128, 28])
德语张量形状: torch.Size([128, 27])
英语张量形状: torch.Size([128, 28])
torch.Size([128, 26])
德语张量形状: torch.Size([128, 30])
英语张量形状: torch.Size([128, 30])
torch.Size([128, 29])
德语张量形状: torch.Size([128, 32])
英语张量形状: torch.Size([128, 31])
torch.Size([128, 31])
德语张量形状: torch.Size([128, 28])
英语张量形状: torch.Size([128, 24])
torch.Size([128, 27])
德语张量形状: torch.Size([128, 24])
英语张量形状: torch.Size([128, 25])
torch.Size([128, 23])
德语张量形状: torch.Size([128, 30])
英语张量形状: torch.Size([128, 29])
torch.Size([128, 29])
德语张量形状: torch.Size([128, 28])
英语张量形状: torch.Size([128, 28])
torch.Size([128, 27])
德语张量形状: torch.Size([128, 26])
英语张量形状: torch.Size([128, 26])
torch.Size([128, 25])
德语张量形状: torch.Size([128, 30])
英语张量形状: torch.Size([128, 30])
torch.Size([128, 29])
德语张量形状: torch.Si

In [None]:
# Hand-made single example: tokenize an English source sentence and its German
# reference, map tokens to vocabulary ids and add a batch dimension of 1.
# NOTE(review): unlike collate_batch, no <bos>/<eos> tokens are added here.
src = "Two young, White males are outside near many bushes."
trg = "Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche."
src_list =tokenizer_en(src)
src_ids = [vocab_en[i] for i in src_list]

src_ids = torch.tensor(src_ids, device=device).unsqueeze(0)


trg_list =tokenizer_de(trg)
trg_ids = [vocab_de[i] for i in trg_list]

trg_ids = torch.tensor(trg_ids, device=device).unsqueeze(0)
trg_ids.shape

torch.Size([1, 13])

In [None]:
# Forward pass on the single example; the decoder input is the target without its last token.
output=model(src_ids,trg_ids[:, :-1])

In [None]:
# Display the raw model output (one score per target-vocabulary entry at each position).
output

tensor([[[-0.0028, -0.3290, -0.6634,  ..., -0.5386,  0.6408, -0.1188],
         [-0.3230,  0.7530,  0.1407,  ..., -0.1449,  0.4592,  0.4360],
         [-0.7534,  0.8151,  0.2188,  ...,  0.7032,  1.0184, -0.0790],
         ...,
         [-0.0218, -0.9069,  0.2163,  ...,  0.0677,  0.9946,  0.1321],
         [-0.0149,  0.8431, -0.5896,  ..., -0.2367,  0.2445, -0.2734],
         [-0.1857,  0.1189, -0.6281,  ...,  0.1665,  0.5034, -1.0686]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [None]:
# Collapse all leading dimensions so each row is one position's vocabulary scores.
output_reshape = output.contiguous().view(-1, output.shape[-1])

In [None]:
# Check the flattened shape: (number of positions, vocabulary size).
output_reshape.shape

torch.Size([12, 8014])

In [None]:
# Labels are the target ids without the first token, flattened to 1-D so they
# line up with the rows of output_reshape.
# NOTE(review): this overwrites trg_ids, so the cell is not idempotent;
# re-running it (or the forward-pass cell after it) fails on the 1-D tensor.
trg_ids = trg_ids[:, 1:].contiguous().view(-1)
trg_ids.shape

torch.Size([12])

In [None]:



# Cross-entropy of the flattened scores against the flattened labels for the single example.
loss = criterion(output_reshape, trg_ids)
loss


tensor(8.9104, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
import torch
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import VocabTransform, Sequential, ToTensor, AddToken
from torch.utils.data import DataLoader

class DataLoader:
    """Builds Multi30k datasets, vocabularies and batched loaders for a language pair.

    NOTE: this class has the same name as ``torch.utils.data.DataLoader`` and
    shadows that import once it is defined. Inside the class the PyTorch
    loader is therefore always referenced by its fully qualified name.

    Args:
        ext: ``(source_language, target_language)`` pair, e.g. ``('de', 'en')``.
        tokenize_en: callable turning an English string into a list of tokens.
        tokenize_de: callable turning a German string into a list of tokens.
        init_token: token prepended to every sentence (e.g. ``'<sos>'``).
        eos_token: token appended to every sentence (e.g. ``'<eos>'``).
    """

    def __init__(self, ext, tokenize_en, tokenize_de, init_token, eos_token):
        self.ext = ext
        self.tokenize_en = tokenize_en
        self.tokenize_de = tokenize_de
        self.init_token = init_token
        self.eos_token = eos_token
        # Populated by build_vocab().
        self.source_vocab = None
        self.target_vocab = None
        print('dataset initializing start')

    def make_dataset(self):
        """Return the ``(train, valid, test)`` Multi30k splits for ``self.ext``."""
        train_data, valid_data, test_data = Multi30k(split=('train', 'valid', 'test'), language_pair=self.ext)
        return train_data, valid_data, test_data

    def _source_tokenizer(self):
        # German tokenizer when the source language is 'de', English otherwise.
        return self.tokenize_de if self.ext[0] == 'de' else self.tokenize_en

    def _target_tokenizer(self):
        # English tokenizer when the target language is 'en', German otherwise.
        return self.tokenize_en if self.ext[1] == 'en' else self.tokenize_de

    def build_vocab(self, train_data, min_freq):
        """Build ``source_vocab`` and ``target_vocab`` from the training pairs.

        Args:
            train_data: iterable of ``(source_text, target_text)`` pairs; it is
                iterated once per vocabulary.
            min_freq: minimum token frequency for inclusion in a vocabulary.
        """
        def yield_tokens(data_iter, tokenizer, index):
            for data_pair in data_iter:
                yield tokenizer(data_pair[index])

        self.source_vocab = build_vocab_from_iterator(
            yield_tokens(train_data, self._source_tokenizer(), 0),
            min_freq=min_freq,
            specials=['<unk>', '<pad>', '<sos>', '<eos>']
        )
        # Unknown tokens map to '<unk>' instead of raising.
        self.source_vocab.set_default_index(self.source_vocab['<unk>'])

        self.target_vocab = build_vocab_from_iterator(
            yield_tokens(train_data, self._target_tokenizer(), 1),
            min_freq=min_freq,
            specials=['<unk>', '<pad>', '<sos>', '<eos>']
        )
        self.target_vocab.set_default_index(self.target_vocab['<unk>'])

    def make_iter(self, train_data, valid_data, test_data, batch_size, device):
        """Wrap the three splits in PyTorch data loaders that yield padded id tensors.

        Each batch is a ``(src, tgt)`` pair of long tensors shaped
        ``(batch, max_len_in_batch)``, right-padded with the ``'<pad>'`` index
        of the respective vocabulary and moved to ``device``.

        Args:
            train_data, valid_data, test_data: datasets of ``(source_text, target_text)`` pairs.
            batch_size: number of pairs per batch.
            device: device the batch tensors are moved to.

        Returns:
            ``(train_loader, valid_loader, test_loader)``; only the training loader shuffles.

        Raises:
            RuntimeError: if ``build_vocab`` has not been called yet.
        """
        if self.source_vocab is None or self.target_vocab is None:
            raise RuntimeError('build_vocab() must be called before make_iter()')

        def make_encoder(vocab, tokenizer):
            # text -> [init] + tokens + [eos] -> 1-D tensor of vocabulary ids.
            def encode(text):
                tokens = [self.init_token] + list(tokenizer(text)) + [self.eos_token]
                return torch.tensor([vocab[token] for token in tokens], dtype=torch.long)
            return encode

        src_encode = make_encoder(self.source_vocab, self._source_tokenizer())
        tgt_encode = make_encoder(self.target_vocab, self._target_tokenizer())
        src_pad = self.source_vocab['<pad>']
        tgt_pad = self.target_vocab['<pad>']
        pad_sequence = torch.nn.utils.rnn.pad_sequence

        def collate_fn(batch):
            src_batch, tgt_batch = zip(*batch)
            # Sentences differ in length, so they are padded rather than
            # stacked; torch.stack requires equally sized tensors.
            src_tensor = pad_sequence([src_encode(text) for text in src_batch],
                                      batch_first=True, padding_value=src_pad)
            tgt_tensor = pad_sequence([tgt_encode(text) for text in tgt_batch],
                                      batch_first=True, padding_value=tgt_pad)
            return src_tensor.to(device), tgt_tensor.to(device)

        # Fully qualified on purpose: the bare name `DataLoader` is this class.
        torch_loader = torch.utils.data.DataLoader
        train_loader = torch_loader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
        valid_loader = torch_loader(valid_data, batch_size=batch_size, collate_fn=collate_fn)
        test_loader = torch_loader(test_data, batch_size=batch_size, collate_fn=collate_fn)

        print('dataset initializing done')
        return train_loader, valid_loader, test_loader

In [None]:
# Smoke test: a batch of 10 random 3-token sequences through the model.
# torch.randint's upper bound is exclusive, so the former randint(0, 1, ...)
# always produced all-zero ids; ids are now drawn from the full vocabularies.
# Tensors are created on the configured `device` instead of a hard-coded
# "cuda:0", so the cell also runs on CPU-only machines.
src = torch.randint(0, enc_voc_size, (10, 3), device=device)
trg = torch.randint(0, dec_voc_size, (10, 3), device=device)
model(src, trg)

tensor([[[-1.0453e+00, -1.1709e-01, -1.6439e-01,  ...,  1.0379e+00,
          -9.8775e-02,  5.0622e-01],
         [-9.2150e-01, -3.3230e-01, -3.8646e-01,  ...,  7.3773e-01,
          -3.2671e-01,  6.5908e-01],
         [-1.0421e+00, -7.7217e-02, -3.3679e-01,  ...,  9.3421e-01,
          -2.3224e-01,  3.3822e-01]],

        [[-1.0624e+00,  1.1289e-01, -5.4295e-01,  ...,  7.3971e-01,
          -8.1724e-01,  9.7228e-02],
         [-6.6536e-01, -2.0220e-01, -3.4827e-01,  ...,  7.0489e-01,
           3.0030e-01,  4.9444e-01],
         [-7.0036e-01,  4.1444e-02, -1.9336e-01,  ...,  7.2447e-01,
          -3.1552e-01,  5.3214e-01]],

        [[-8.7925e-01, -1.1344e-03, -7.9653e-01,  ...,  1.2170e+00,
          -5.8409e-02,  4.8703e-01],
         [-3.7978e-01,  5.6242e-02, -9.7089e-01,  ...,  1.4455e+00,
          -7.0644e-01,  6.7805e-01],
         [-7.8774e-01, -3.1136e-01, -9.4371e-01,  ...,  1.5208e+00,
          -1.7221e-01,  6.7768e-01]],

        ...,

        [[-5.8559e-01, -5.2885e-01,