# Load Data

In [4]:
from pathlib import Path

BASE_DIR = Path("data/generation/pair_data_tok_1/Python-Javascript")
TRAIN_SRC = BASE_DIR / "train-Python-Javascript-tok.py"
TRAIN_TGT = BASE_DIR / "train-Python-Javascript-tok.js"
TRAIN_SRC_MAP = BASE_DIR / "train-Python-map.jsonl"
TRAIN_TGT_MAP = BASE_DIR / "train-Javascript-map.jsonl"

VAL_SRC = BASE_DIR / "val-Python-Javascript-tok.py"
VAL_TGT = BASE_DIR / "val-Python-Javascript-tok.js"
VAL_SRC_MAP = BASE_DIR / "val-Python-map.jsonl"
VAL_TGT_MAP = BASE_DIR / "val-Javascript-map.jsonl"

In [5]:
def load_pairs(src_file, tgt_file, src_map_file, tgt_map_file):
    src_lines = [ln.strip() for ln in open(src_file, encoding="utf-8") if ln.strip()]
    tgt_lines = [ln.strip() for ln in open(tgt_file, encoding="utf-8") if ln.strip()]
    # maps are one-to-one but not used for modeling; kept for reference
    src_ids = [ln.strip() for ln in open(src_map_file, encoding="utf-8")]
    tgt_ids = [ln.strip() for ln in open(tgt_map_file, encoding="utf-8")]
    assert len(src_lines) == len(tgt_lines)
    return list(zip(src_lines, tgt_lines))

train_pairs = load_pairs(TRAIN_SRC, TRAIN_TGT, TRAIN_SRC_MAP, TRAIN_TGT_MAP)
val_pairs = load_pairs(VAL_SRC, VAL_TGT, VAL_SRC_MAP, VAL_TGT_MAP)

In [6]:
from torch.utils.data import Dataset, DataLoader

class CodeTranslationDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_len=256):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        enc = self.tokenizer(src, truncation=True,
                             padding="max_length",
                             max_length=self.max_len,
                             return_tensors="pt")
        dec = self.tokenizer(tgt, truncation=True,
                             padding="max_length",
                             max_length=self.max_len,
                             return_tensors="pt")
        input_ids = enc.input_ids.squeeze()
        attn_mask = enc.attention_mask.squeeze()
        labels = dec.input_ids.squeeze()
        # replace pad token id in labels by -100 for CE ignore
        labels[labels == tokenizer.pad_token_id] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "labels": labels
        }


In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-small")
# ensure padding token exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

train_ds = CodeTranslationDataset(train_pairs, tokenizer)
val_ds = CodeTranslationDataset(val_pairs, tokenizer)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

In [9]:
# Cell 7 — DataLoaders
BATCH_SIZE = 8

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds,   batch_size=BATCH_SIZE)


# Baseline Pretrained Transformer

In [8]:
from torch.optim import AdamW
from transformers import T5ForConditionalGeneration
import torch

LR = 5e-5
EPOCHS = 3
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-small")
model.resize_token_embeddings(len(tokenizer))  # in case we added PAD
model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=LR)


2025-05-25 00:29:38.417540: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-25 00:29:38.591443: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748150978.654669   85444 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748150978.673894   85444 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748150978.820336   85444 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

## Train

In [11]:
# Cell 8 — Training Loop with tqdm
from tqdm import tqdm

for epoch in range(EPOCHS):
    # —— Training
    model.train()
    train_loss = 0.0
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [train]")
    for batch in train_bar:
        optimizer.zero_grad()
        input_ids      = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels         = batch["labels"].to(DEVICE)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        avg_train = train_loss / (train_bar.n + 1)
        train_bar.set_postfix(loss=f"{avg_train:.4f}")

    # —— Validation
    model.eval()
    val_loss = 0.0
    val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [val] ")
    with torch.no_grad():
        for batch in val_bar:
            input_ids      = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels         = batch["labels"].to(DEVICE)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            val_loss += loss.item()
            avg_val = val_loss / (val_bar.n + 1)
            val_bar.set_postfix(val_loss=f"{avg_val:.4f}")


Epoch 1/3 [train]: 100%|██████████| 8403/8403 [10:37<00:00, 13.18it/s, loss=0.2078]
Epoch 1/3 [val] : 100%|██████████| 469/469 [00:15<00:00, 30.34it/s, val_loss=0.2094]
Epoch 2/3 [train]: 100%|██████████| 8403/8403 [10:42<00:00, 13.08it/s, loss=0.1815]
Epoch 2/3 [val] : 100%|██████████| 469/469 [00:15<00:00, 30.33it/s, val_loss=0.1996]
Epoch 3/3 [train]: 100%|██████████| 8403/8403 [10:45<00:00, 13.02it/s, loss=0.1647]
Epoch 3/3 [val] : 100%|██████████| 469/469 [00:15<00:00, 30.06it/s, val_loss=0.1935]


In [12]:
import os

output_dir = "codet5-python-to-js"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to codet5-python-to-js


In [14]:
model.eval()
example_py = """
def debug(x: int):
    print("Printing: ", x):
"""
inputs = tokenizer(example_py, return_tensors="pt").to(DEVICE)
gen = model.generate(**inputs, max_length=64)
print("JS translation:\n", tokenizer.decode(gen[0], skip_special_tokens=True))

JS translation:
 function debug ( x ) { document . write ( " " + x ) ; }


# Baseline Untrained Transformer

In [19]:
# Cell 7 — Build a BERT-style encoder–decoder from scratch
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

# 1) Encoder configuration
encoder_cfg = BertConfig(
    vocab_size=len(tokenizer),
    hidden_size=512,
    num_hidden_layers=6,
    num_attention_heads=8,
    intermediate_size=2048,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    pad_token_id=tokenizer.pad_token_id,
)

# 2) Decoder configuration (enable cross-attention & decoder mode)
decoder_cfg = BertConfig(
    vocab_size=len(tokenizer),
    hidden_size=512,
    num_hidden_layers=6,
    num_attention_heads=8,
    intermediate_size=2048,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    is_decoder=True,
    add_cross_attention=True,
    pad_token_id=tokenizer.pad_token_id,
)

# 3) Combine into an EncoderDecoderConfig
config = EncoderDecoderConfig.from_encoder_decoder_configs(
    encoder_cfg,
    decoder_cfg,
)

# … after EncoderDecoderConfig.from_encoder_decoder_configs(...)
config.decoder_start_token_id = tokenizer.cls_token_id
config.eos_token_id           = tokenizer.sep_token_id
config.pad_token_id           = tokenizer.pad_token_id

model = EncoderDecoderModel(config)
model.to(DEVICE)


# 5) Optimizer
optimizer = AdamW(model.parameters(), lr=LR)


Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32100
}

Config of the decoder: <class 'transformers.models.bert.modeling_bert.BertLMHeadModel'> is overwritten by shared decoder config: BertConfig {
  "add_cross_attention": true,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "inte

In [23]:
from tqdm import tqdm

for epoch in range(EPOCHS * 3):
    # —— Training
    model.train()
    train_loss = 0.0
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [train]")
    for batch in train_bar:
        optimizer.zero_grad()
        input_ids      = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels         = batch["labels"].to(DEVICE)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        avg_train = train_loss / (train_bar.n + 1)
        train_bar.set_postfix(loss=f"{avg_train:.4f}")

    # —— Validation
    model.eval()
    val_loss = 0.0
    val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [val] ")
    with torch.no_grad():
        for batch in val_bar:
            input_ids      = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels         = batch["labels"].to(DEVICE)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            val_loss += loss.item()
            avg_val = val_loss / (val_bar.n + 1)
            val_bar.set_postfix(val_loss=f"{avg_val:.4f}")

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
Epoch 1/3 [train]: 100%|██████████| 8403/8403 [11:22<00:00, 12.31it/s, loss=0.3432]
Epoch 1/3 [val] : 100%|██████████| 469/469 [00:18<00:00, 24.78it/s, val_loss=0.4209]
Epoch 2/3 [train]: 100%|██████████| 8403/8403 [11:12<00:00, 12.49it/s, loss=0.2897]
Epoch 2/3 [val] : 100%|██████████| 469/469 [00:18<00:00, 24.69it/s, val_loss=0.3916]
Epoch 3/3 [train]: 100%|██████████| 8403/8403 [11:14<00:00, 12.46it/s, loss=0.2509]
Epoch 3/3 [val] : 100%|██████████| 469/469 [00:19<00:00, 24.57it/s, val_loss=0.3889]
Epoch 4/3 [train]: 100%|██████████| 8403/8403 [11:16<00:00, 12.42it/s, loss=0.2220]
Epoch 4/3 [val] : 100%|██████████| 469/469 [00:19<00:00, 24.45it/s, val_loss=0.3795]
Epoch 5/3 [train]: 100%|██████████| 8403/8403 [11:19<00:00, 12.37it/s, loss=0.1987]
Epoch 5/3 [val] : 100%|██████████| 469/469 [00:19<00:00, 24.27it/s, val_loss=0.3741]
Epoch 6/3 [train]: 100%|██████████| 8403/8403 [11:20

In [29]:
import os

output_dir = "bert-python-to-js_2"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to bert-python-to-js_2


In [42]:

model.eval()
example_py = "def debug(x): print(x)"
inputs = tokenizer(example_py, return_tensors="pt").to(DEVICE)
gen = model.generate(**inputs, max_length=64)
print("JS translation:\n", tokenizer.decode(gen[0], skip_special_tokens=True))

JS translation:
 function panx(1 ) { let temx = 0 ; let temx = 0 ; let m = 0 ; for ( let i = 0 ; i < n ; i ++ ) { temx = m ; } document . write ( temx + " " + temx + " " ) ;
