## LSTM

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import os

In [35]:
# Paths to tokenized XLCoST files
train_py_file = "train-Python-Javascript-tok.py"
train_js_file = "train-Python-Javascript-tok.js"
val_py_file = "val-Python-Javascript-tok.py"
val_js_file = "val-Python-Javascript-tok.js"

# Load tokenized lines (space-separated tokens per line)
def load_tokenized_code(py_path, js_path):
    with open(py_path, encoding='utf-8') as f_py, open(js_path, encoding='utf-8') as f_js:
        py_lines = [line.strip().split() for line in f_py if line.strip()]
        js_lines = [line.strip().split() for line in f_js if line.strip()]
    assert len(py_lines) == len(js_lines), "Mismatch between Python and JavaScript lines"
    return list(zip(py_lines, js_lines))

train_pairs = load_tokenized_code(train_py_file, train_js_file)
val_pairs   = load_tokenized_code(val_py_file, val_js_file)


## Data Preprocesing

In [36]:
SPECIAL_TOKENS = ['<pad>', '<sos>', '<eos>', '<unk>']

def build_vocab(token_lists, max_vocab_size=10000):
    all_tokens = list(chain.from_iterable(token_lists))
    token_counts = Counter(all_tokens)
    most_common = token_counts.most_common(max_vocab_size - len(SPECIAL_TOKENS))
    vocab = SPECIAL_TOKENS + [token for token, _ in most_common]
    token_to_id = {token: idx for idx, token in enumerate(vocab)}
    id_to_token = {idx: token for token, idx in token_to_id.items()}
    return token_to_id, id_to_token

def tokens_to_ids(tokens, token_to_id, add_sos=False, add_eos=False):
    ids = [token_to_id.get(token, token_to_id['<unk>']) for token in tokens]
    if add_sos:
        ids = [token_to_id['<sos>']] + ids
    if add_eos:
        ids += [token_to_id['<eos>']]
    return ids

py_token_to_id, py_id_to_token = build_vocab([py for py, _ in train_pairs])
js_token_to_id, js_id_to_token = build_vocab([js for _, js in train_pairs])


### Dataset and Dataloader

In [37]:
class CodeTranslationDataset(Dataset):
    def __init__(self, df):
        self.encoder_inputs = df['py_tensor'].tolist()     # Python input
        self.decoder_inputs = df['js_in_tensor'].tolist()  # JavaScript input (with <sos>)
        self.decoder_outputs = df['js_out_tensor'].tolist()  # JavaScript output (with <eos>)

    def __len__(self):
        return len(self.encoder_inputs)

    def __getitem__(self, idx):
        return self.encoder_inputs[idx], self.decoder_inputs[idx], self.decoder_outputs[idx]

def collate_fn(batch):
    enc_inputs, dec_inputs, dec_outputs = zip(*batch)
    enc_inputs_pad = pad_sequence(enc_inputs, batch_first=True, padding_value=py_token_to_id['<pad>'])
    dec_inputs_pad = pad_sequence(dec_inputs, batch_first=True, padding_value=js_token_to_id['<pad>'])
    dec_outputs_pad = pad_sequence(dec_outputs, batch_first=True, padding_value=js_token_to_id['<pad>'])
    return enc_inputs_pad, dec_inputs_pad, dec_outputs_pad


### Attention-based LSTM Model

In [38]:
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Parameter(torch.rand(hid_dim))

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.size(1)
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = torch.sum(self.v * energy, dim=2)
        return torch.softmax(attention, dim=1)


class AttnDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(hid_dim + emb_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attn = Attention(hid_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        a = self.attn(hidden[-1], encoder_outputs).unsqueeze(1)
        weighted = torch.bmm(a, encoder_outputs)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
        prediction = self.fc_out(torch.cat((output.squeeze(1), weighted.squeeze(1)), dim=1))
        return prediction, hidden, cell


class AttnEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell


class AttnSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.embedding.num_embeddings

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs


## Training Setup

In [39]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def step(self, val_loss):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

def train(model, dataloader, optimizer, criterion, clip, teacher_forcing_ratio=0.5):
    model.train()
    epoch_loss = 0
    for src, trg_in, trg_out in tqdm(dataloader, desc="Training", leave=False):
        src, trg_in, trg_out = src.to(DEVICE), trg_in.to(DEVICE), trg_out.to(DEVICE)
        optimizer.zero_grad()
        
        # Pass the teacher forcing ratio to the model
        output = model(src, trg_in, teacher_forcing_ratio)
        
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg_out = trg_out[:, 1:].reshape(-1)
        loss = criterion(output, trg_out)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        torch.cuda.empty_cache()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg_in, trg_out in dataloader:
            src, trg_in, trg_out = src.to(DEVICE), trg_in.to(DEVICE), trg_out.to(DEVICE)
            output = model(src, trg_in, teacher_forcing_ratio=0.0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg_out = trg_out[:, 1:].reshape(-1)
            loss = criterion(output, trg_out)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)


### Beam Search & BLEU

In [40]:
def beam_search(model, src_tensor, beam_width=3, max_len=300):
    model.eval()
    src_tensor = src_tensor.unsqueeze(0).to(DEVICE)

    encoder_outputs, hidden, cell = model.encoder(src_tensor)

    # Start with <sos> token
    sequences = [[[], 0.0, torch.tensor([js_token_to_id['<sos>']], device=DEVICE), hidden, cell]]

    for _ in range(max_len):
        all_candidates = []
        for seq, score, last_token, hidden, cell in sequences:
            with torch.no_grad():
                output, hidden, cell = model.decoder(last_token, hidden, cell, encoder_outputs)

            probs = torch.log_softmax(output, dim=1)
            topk = torch.topk(probs, beam_width)

            for i in range(beam_width):
                token = topk.indices[0][i]
                prob = topk.values[0][i].item()
                candidate = [seq + [token.item()], score + prob, token.unsqueeze(0), hidden, cell]
                all_candidates.append(candidate)

        sequences = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:beam_width]

        if all(seq[0][-1] == js_token_to_id['<eos>'] for seq in sequences):
            break

    return [js_id_to_token.get(idx, '<unk>') for idx in sequences[0][0] if idx != js_token_to_id['<eos>']]


def compute_bleu(pred_tokens, ref_tokens):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothie)

def evaluate_bleu(model, val_df, num_samples=5):
    total_bleu = 0
    for i in range(min(num_samples, len(val_df))):
        row = val_df.iloc[i]
        src_tensor = torch.tensor(row['py_input_ids'], dtype=torch.long)
        pred = beam_search(model, src_tensor)
        ref = row['js_tokens']
        bleu = compute_bleu(pred, ref)
        total_bleu += bleu
        print(f"Example {i+1} BLEU: {bleu:.4f}")
    avg_bleu = total_bleu / num_samples
    print(f"\nAvg BLEU over {num_samples} samples: {avg_bleu:.4f}")
    return avg_bleu

def save_model(model, path="best_model.pt"):
    torch.save(model.state_dict(), path)

def load_model(model, path="best_model.pt"):
    model.load_state_dict(torch.load(path))
    model.eval()

## Training

In [41]:
# Convert to PyTorch tensors
def make_tensor_dataset(pairs, src_vocab, tgt_vocab, max_len=300):
    data = []
    for src_tokens, tgt_tokens in pairs:
        src_ids = tokens_to_ids(src_tokens, src_vocab)
        tgt_in_ids = tokens_to_ids(tgt_tokens, tgt_vocab, add_sos=True)
        tgt_out_ids = tokens_to_ids(tgt_tokens, tgt_vocab, add_eos=True)

        # Skip overly long examples
        if len(src_ids) > max_len or len(tgt_in_ids) > max_len or len(tgt_out_ids) > max_len:
            continue

        data.append({
            "py_tensor": torch.tensor(src_ids, dtype=torch.long),
            "js_in_tensor": torch.tensor(tgt_in_ids, dtype=torch.long),
            "js_out_tensor": torch.tensor(tgt_out_ids, dtype=torch.long),
            "py_input_ids": src_ids,     # for beam search
            "js_tokens": tgt_tokens      # for BLEU evaluation
        })
    return pd.DataFrame(data)

# Convert the XLCoST pairs into DataFrames
train_df = make_tensor_dataset(train_pairs, py_token_to_id, js_token_to_id)
val_df   = make_tensor_dataset(val_pairs, py_token_to_id, js_token_to_id)


#### Train-validation split

In [42]:
train_loader = DataLoader(CodeTranslationDataset(train_df), batch_size=1, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(CodeTranslationDataset(val_df), batch_size=1, shuffle=False, collate_fn=collate_fn)

In [43]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_DIM = len(py_token_to_id)
OUTPUT_DIM = len(js_token_to_id)
EMB_DIM = 128
HID_DIM = 256

# Use the LSTM-based encoder and decoder defined earlier
encoder = AttnEncoder(INPUT_DIM, EMB_DIM, HID_DIM).to(DEVICE)
decoder = AttnDecoder(OUTPUT_DIM, EMB_DIM, HID_DIM).to(DEVICE)
model = AttnSeq2Seq(encoder, decoder, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=js_token_to_id['<pad>'])

In [44]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [45]:
N_EPOCHS = 30
CLIP = 1.0
best_bleu = 0
early_stopper = EarlyStopping(patience=5)

for epoch in range(N_EPOCHS):
    teacher_forcing_ratio = 0.5

    train_loss = train(model, train_loader, optimizer, criterion, CLIP, teacher_forcing_ratio)
    val_loss = evaluate(model, val_loader, criterion)
    bleu = evaluate_bleu(model, val_df, num_samples=5)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | BLEU: {bleu:.4f}")

    if bleu > best_bleu:
        best_bleu = bleu
        torch.save(model.state_dict(), "best_model.pt")  # <- fix here
        print(" Best model saved")

    early_stopper.step(val_loss)  # you are using val_loss to decide stopping
    if early_stopper.early_stop:
        print(" Early stopping triggered.")
        break


                                                                 

Example 1 BLEU: 0.0337
Example 2 BLEU: 0.0052
Example 3 BLEU: 0.8155
Example 4 BLEU: 0.3605
Example 5 BLEU: 0.7286

Avg BLEU over 5 samples: 0.3887
Epoch 1 | Train Loss: 1.7279 | Val Loss: 2.2130 | BLEU: 0.3887
 Best model saved


                                                                 

Example 1 BLEU: 0.0444
Example 2 BLEU: 0.1642
Example 3 BLEU: 0.8155
Example 4 BLEU: 0.9122
Example 5 BLEU: 0.0688

Avg BLEU over 5 samples: 0.4010
Epoch 2 | Train Loss: 1.3909 | Val Loss: 2.1552 | BLEU: 0.4010
 Best model saved


                                                                 

Example 1 BLEU: 0.0438
Example 2 BLEU: 0.1642
Example 3 BLEU: 0.6263
Example 4 BLEU: 0.9200
Example 5 BLEU: 0.9311

Avg BLEU over 5 samples: 0.5371
Epoch 3 | Train Loss: 1.3015 | Val Loss: 2.1880 | BLEU: 0.5371
 Best model saved


                                                                 

Example 1 BLEU: 0.0796
Example 2 BLEU: 0.0945
Example 3 BLEU: 0.8155
Example 4 BLEU: 0.9200
Example 5 BLEU: 0.5393

Avg BLEU over 5 samples: 0.4898
Epoch 4 | Train Loss: 1.2472 | Val Loss: 2.1961 | BLEU: 0.4898


                                                                 

Example 1 BLEU: 0.0530
Example 2 BLEU: 0.1553
Example 3 BLEU: 0.8155
Example 4 BLEU: 0.7539
Example 5 BLEU: 0.7368

Avg BLEU over 5 samples: 0.5029
Epoch 5 | Train Loss: 1.2205 | Val Loss: 2.2902 | BLEU: 0.5029


                                                                 

Example 1 BLEU: 0.0841
Example 2 BLEU: 0.1642
Example 3 BLEU: 0.8155
Example 4 BLEU: 0.9200
Example 5 BLEU: 0.4903

Avg BLEU over 5 samples: 0.4948
Epoch 6 | Train Loss: 1.2054 | Val Loss: 2.2300 | BLEU: 0.4948


                                                                 

Example 1 BLEU: 0.2234
Example 2 BLEU: 0.1642
Example 3 BLEU: 0.8155
Example 4 BLEU: 0.9122
Example 5 BLEU: 0.5533

Avg BLEU over 5 samples: 0.5337
Epoch 7 | Train Loss: 1.1966 | Val Loss: 2.2832 | BLEU: 0.5337
 Early stopping triggered.


In [47]:
load_model(model, "best_model.pt")

# Try inference
test_row = val_df.iloc[0]
src_tensor = torch.tensor(test_row['py_input_ids'], dtype=torch.long)
predicted_tokens = beam_search(model, src_tensor)
print("Predicted:", " ".join(predicted_tokens))
print("Reference:", " ".join(test_row['js_tokens']))

  model.load_state_dict(torch.load(path))


Predicted: <unk> ( str1 ) { <unk> <unk> = ( * <unk> <unk> / <unk> <unk> <unk> <unk> <unk> <unk> ; . . ( <unk> , 2 ) ) ; ;
Reference: function Conversion ( centi ) { let pixels = ( 96 * centi ) / 2.54 ; document . write ( pixels ) ; return 0 ; }
