# Seq2Seq To Reverse List of Numbers

In [33]:
import torch
import torch.nn as nn
import torch.optim as optim

# Step 1: Define the Seq2Seq Model
class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Seq2Seq, self).__init__()
        self.encoder = nn.GRU(input_size, hidden_size, batch_first=True)
        self.decoder = nn.GRU(output_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, src, tgt):
        # Encoder
        _, hidden = self.encoder(src)
        
        # Decoder
        outputs, _ = self.decoder(tgt, hidden)
        predictions = self.fc(outputs)
        
        return predictions

# Step 2: Prepare Toy Data
# Example: Reverse a sequence of numbers
src_data = torch.tensor([[1, 2, 3, 4, 5]], dtype=torch.float32).unsqueeze(-1)  # Input sequence
tgt_data = torch.tensor([[5, 4, 3, 2, 1]], dtype=torch.float32).unsqueeze(-1)  # Target sequence (reversed)

# Step 3: Initialize Model, Loss, and Optimizer
model = Seq2Seq(input_size=1, hidden_size=10, output_size=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Step 4: Train the Model
for epoch in range(1000):
    model.zero_grad()
    output = model(src_data, tgt_data[:, :-1, :])  # Teacher forcing: use tgt as input to decoder
    loss = criterion(output, tgt_data[:, 1:, :])   # Compare predictions to shifted tgt
    loss.backward()
    optimizer.step()
    
    if epoch % 100 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

# Step 5: Test the Model
with torch.no_grad():
    test_output = model(src_data, tgt_data[:, :-1, :])
    print("Predicted:", test_output.squeeze().numpy())
    print("Actual:", tgt_data[:, 1:, :].squeeze().numpy())

Epoch 0, Loss: 7.199328899383545
Epoch 100, Loss: 0.0003412325750105083
Epoch 200, Loss: 7.838086276024114e-06
Epoch 300, Loss: 7.689288850087905e-07
Epoch 400, Loss: 4.234293982108284e-08
Epoch 500, Loss: 1.2916174796373525e-09
Epoch 600, Loss: 2.155786660296144e-11
Epoch 700, Loss: 3.197442310920451e-13
Epoch 800, Loss: 3.552713678800501e-14
Epoch 900, Loss: 6.039613253960852e-14
Predicted: [4.0000005 3.        1.9999999 1.0000001]
Actual: [4. 3. 2. 1.]


In [23]:
import csv
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

path = '/Users/pranjal/code/'

##############################################################################
# 1) LOAD & FILTER TATOEBA DATA (English-French)
##############################################################################
lang1, lang2 = "eng", "fra"
id2sent = {}

with open(path + "sentences.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    for row in reader:
        if len(row) == 3:
            sid, slang, stext = row
            if slang in [lang1, lang2]:
                id2sent[sid] = (slang, stext)

pairs = []
with open(path + "links.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    for row in reader:
        if len(row) < 2:
            continue
        id1, id2 = row
        if id1 in id2sent and id2 in id2sent:
            la, ta = id2sent[id1]
            lb, tb = id2sent[id2]
            if la == lang1 and lb == lang2:
                pairs.append((ta, tb))
            elif la == lang2 and lb == lang1:
                pairs.append((tb, ta))

# Filter to short sentences only (e.g., <= 50 chars)
pairs = [(src, tgt) for (src, tgt) in pairs if len(src) <= 50 and len(tgt) <= 50]

# Subsample for quick demo
pairs = pairs[:5000]  
print(f"Total short pairs: {len(pairs)}")

Total short pairs: 5000


In [25]:
pairs[0]

("Let's try something.", 'Essayons quelque chose\u202f!')

In [31]:
import re
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

##############################################################################
# 2) BUILD VOCAB (Word-level)
##############################################################################
def preprocess(text):
    """Separate punctuation and lowercase."""
    text = re.sub(r"([.!?,])", r" \1", text.lower())  # Separate punctuation
    return text.strip()

def build_vocab(sentences):
    """Create word-level vocabulary with proper preprocessing."""
    words = set()
    for s in sentences:
        for w in preprocess(s).split():
            words.add(w)
    return {
        '<pad>': 0, '<unk>': 1, '<start>': 2, '<end>': 3,
        **{w: i+4 for i, w in enumerate(sorted(words))}
    }

# Build separate vocabs for source (English) and target (French)
src_vocab = build_vocab([p[0] for p in pairs])  # English
tgt_vocab = build_vocab([p[1] for p in pairs])  # French

print(f"[DEBUG] src_vocab size = {len(src_vocab)} | tgt_vocab size = {len(tgt_vocab)}")

##############################################################################
# 3) DATASET & DATALOADER (Word-level)
##############################################################################
class TranslationDataset(Dataset):
    def __init__(self, pairs, max_len=15):
        self.pairs = pairs
        self.max_len = max_len
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        src_text, tgt_text = self.pairs[idx]
        
        def tokenize(text, vocab):
            """Tokenize with preprocessing and padding."""
            words = preprocess(text).split()
            tokens = ['<start>'] + words + ['<end>']
            tokens = tokens[:self.max_len] + ['<pad>']*(self.max_len - len(tokens))
            return [vocab.get(w, 1) for w in tokens]  # Use <unk> for unknown words
        
        src_indices = tokenize(src_text, src_vocab)
        tgt_indices = tokenize(tgt_text, tgt_vocab)
        return torch.tensor(src_indices), torch.tensor(tgt_indices)

dataset = TranslationDataset(pairs, max_len=15)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

##############################################################################
# 4) MODEL - Seq2Seq with Attention
##############################################################################
class Seq2Seq(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim=128, hidden_size=256):
        super().__init__()
        # Encoder
        self.enc_emb = nn.Embedding(src_vocab_size, emb_dim)
        self.encoder = nn.GRU(emb_dim, hidden_size, bidirectional=True)
        
        # Decoder
        self.dec_emb = nn.Embedding(tgt_vocab_size, emb_dim)
        self.decoder = nn.GRU(emb_dim + 2*hidden_size, hidden_size)  # +2*hidden_size for attention
        
        # Attention
        self.attn = nn.Linear(2*hidden_size + hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)
        
        # Output layer
        self.fc = nn.Linear(hidden_size, tgt_vocab_size)
    
    def forward(self, src, tgt_in):
        # Encode
        src_emb = self.enc_emb(src)  # (seq_len, batch, emb_dim)
        enc_output, hidden = self.encoder(src_emb)  # enc_output: (seq_len, batch, 2*hidden_size)
        
        # Decode with attention
        tgt_emb = self.dec_emb(tgt_in)  # (seq_len, batch, emb_dim)
        outputs = []
        for t in range(tgt_emb.size(0)):
            # Attention
            energy = torch.tanh(self.attn(torch.cat((hidden[-1], enc_output), dim=2)))  # (seq_len, batch, hidden_size)
            attention = F.softmax(self.v(energy), dim=0)  # (seq_len, batch, 1)
            context = (attention * enc_output).sum(dim=0)  # (batch, 2*hidden_size)
            
            # Decoder step
            dec_input = torch.cat((tgt_emb[t], context), dim=1).unsqueeze(0)  # (1, batch, emb_dim + 2*hidden_size)
            out, hidden = self.decoder(dec_input, hidden)
            outputs.append(self.fc(out))
        
        return torch.stack(outputs)  # (seq_len, batch, tgt_vocab_size)

model = Seq2Seq(len(src_vocab), len(tgt_vocab), emb_dim=128, hidden_size=256)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
criterion = nn.CrossEntropyLoss(ignore_index=0)

##############################################################################
# 5) TRANSLATE FUNCTION (Word-level)
##############################################################################
def translate(model, text, max_len=15):
    """Greedy decoding with attention."""
    model.eval()
    rev_tgt = {v: k for k, v in tgt_vocab.items()}

    # Tokenize source
    words = preprocess(text).split()
    tokens = ['<start>'] + words + ['<end>']
    tokens = tokens[:max_len] + ['<pad>']*(max_len - len(tokens))
    src_ids = [src_vocab.get(w, 1) for w in tokens]
    src_tensor = torch.tensor(src_ids).unsqueeze(1)  # (seq_len, 1)
    
    # Encode
    with torch.no_grad():
        src_emb = model.enc_emb(src_tensor)  # (seq_len, 1, emb_dim)
        enc_output, hidden = model.encoder(src_emb)  # enc_output: (seq_len, 1, 2*hidden_size)
        
        # Decoder init
        dec_input = torch.tensor([[tgt_vocab['<start>']]])  # (1, 1)
        output_words = []
        
        for step in range(max_len):
            # Attention
            energy = torch.tanh(model.attn(torch.cat((hidden[-1], enc_output), dim=2)))  # (seq_len, 1, hidden_size)
            attention = F.softmax(model.v(energy), dim=0)  # (seq_len, 1, 1)
            context = (attention * enc_output).sum(dim=0)  # (1, 2*hidden_size)
            
            # Decoder step
            dec_emb = model.dec_emb(dec_input)  # (1, 1, emb_dim)
            dec_input = torch.cat((dec_emb.squeeze(0), context), dim=1).unsqueeze(0)  # (1, 1, emb_dim + 2*hidden_size)
            out, hidden = model.decoder(dec_input, hidden)
            logits = model.fc(out).squeeze(0)  # (1, vocab_size) -> (vocab_size,)
            
            pred_id = logits.argmax(dim=-1).item()
            if pred_id == tgt_vocab['<end>'] and step > 3:  # Minimum 3 words
                break
            output_words.append(rev_tgt.get(pred_id, '<unk>'))
            
            # Next input is the predicted token
            dec_input = torch.tensor([[pred_id]])
    
    return ' '.join(output_words)

##############################################################################
# 6) TRAIN LOOP (with debug prints)
##############################################################################
EPOCHS = 100
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    
    for i, (src_batch, tgt_batch) in enumerate(tqdm(dataloader)):
        optimizer.zero_grad()
        
        # Reshape to (seq_len, batch)
        src_batch = src_batch.permute(1, 0)
        tgt_in = tgt_batch[:, :-1].permute(1, 0)
        tgt_out = tgt_batch[:, 1:]
        
        # Forward
        outputs = model(src_batch, tgt_in)  # (seq_len, batch, vocab_size)
        
        # Compute loss
        loss = criterion(outputs.reshape(-1, len(tgt_vocab)), tgt_out.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        
        total_loss += loss.item()
        
        # Debug on first batch
        if i == 0:
            print(f"[DEBUG] EPOCH {epoch+1}, BATCH=0, loss={loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    scheduler.step(avg_loss)  # Update learning rate
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {avg_loss:.4f}")
    
    # Quick test
    sample_src, sample_tgt = random.choice(pairs)
    prediction = translate(model, sample_src)
    print(f"[DEBUG] Sample Source: {sample_src}")
    print(f"[DEBUG] True Target:   {sample_tgt}")
    print(f"[DEBUG] Prediction:    {prediction}")
    print("-"*30)

[DEBUG] src_vocab size = 2828 | tgt_vocab size = 3547


  0%|          | 0/157 [00:00<?, ?it/s]


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)