## RNN

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import os

df = pd.read_csv('python_to_valid_cpp.csv')

## Data Preprocesing

In [25]:
SPECIAL_TOKENS = ['<pad>', '<sos>', '<eos>', '<unk>']

def tokenize_code(code):
    return re.findall(r"[\w]+|[^\s\w]", code)

def build_vocab(token_lists, max_vocab_size=10000):
    all_tokens = list(chain.from_iterable(token_lists))
    token_counts = Counter(all_tokens)
    most_common = token_counts.most_common(max_vocab_size - len(SPECIAL_TOKENS))
    vocab = SPECIAL_TOKENS + [token for token, _ in most_common]
    token_to_id = {token: idx for idx, token in enumerate(vocab)}
    id_to_token = {idx: token for token, idx in token_to_id.items()}
    return token_to_id, id_to_token

def tokens_to_ids(tokens, token_to_id, add_sos=False, add_eos=False):
    ids = [token_to_id.get(token, token_to_id['<unk>']) for token in tokens]
    if add_sos:
        ids = [token_to_id['<sos>']] + ids
    if add_eos:
        ids += [token_to_id['<eos>']]
    return ids

### Dataset and Dataloader

In [26]:
class CodeTranslationDataset(Dataset):
    def __init__(self, df):
        self.encoder_inputs = df['py_tensor'].tolist()
        self.decoder_inputs = df['cpp_in_tensor'].tolist()
        self.decoder_outputs = df['cpp_out_tensor'].tolist()

    def __len__(self):
        return len(self.encoder_inputs)

    def __getitem__(self, idx):
        return self.encoder_inputs[idx], self.decoder_inputs[idx], self.decoder_outputs[idx]

def collate_fn(batch):
    enc_inputs, dec_inputs, dec_outputs = zip(*batch)
    enc_inputs_pad = pad_sequence(enc_inputs, batch_first=True, padding_value=py_token_to_id['<pad>'])
    dec_inputs_pad = pad_sequence(dec_inputs, batch_first=True, padding_value=cpp_token_to_id['<pad>'])
    dec_outputs_pad = pad_sequence(dec_outputs, batch_first=True, padding_value=cpp_token_to_id['<pad>'])
    return enc_inputs_pad, dec_inputs_pad, dec_outputs_pad

### Attention-based Model

In [27]:
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Parameter(torch.rand(hid_dim))

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.size(1)
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = torch.sum(self.v * energy, dim=2)
        return torch.softmax(attention, dim=1)

class AttnDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.gru = nn.GRU(hid_dim + emb_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attn = Attention(hid_dim)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        a = self.attn(hidden.squeeze(0), encoder_outputs).unsqueeze(1)
        weighted = torch.bmm(a, encoder_outputs)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.gru(rnn_input, hidden)
        prediction = self.fc_out(torch.cat((output.squeeze(1), weighted.squeeze(1)), dim=1))
        return prediction, hidden

class AttnEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.gru = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.gru(embedded)
        return outputs, hidden

class AttnSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.embedding.num_embeddings
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs


## Training Setup

In [38]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def step(self, val_loss):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

def train(model, dataloader, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for src, trg_in, trg_out in tqdm(dataloader, desc="Training", leave=False):
        src, trg_in, trg_out = src.to(DEVICE), trg_in.to(DEVICE), trg_out.to(DEVICE)
        optimizer.zero_grad()
        output = model(src, trg_in)
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg_out = trg_out[:, 1:].reshape(-1)
        loss = criterion(output, trg_out)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        torch.cuda.empty_cache()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg_in, trg_out in dataloader:
            src, trg_in, trg_out = src.to(DEVICE), trg_in.to(DEVICE), trg_out.to(DEVICE)
            output = model(src, trg_in)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg_out = trg_out[:, 1:].reshape(-1)
            loss = criterion(output, trg_out)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)


### Beam Search & BLEU

In [39]:
def beam_search(model, src_tensor, beam_width=3, max_len=300):
    model.eval()
    src_tensor = src_tensor.unsqueeze(0).to(DEVICE)
    encoder_outputs, hidden = model.encoder(src_tensor)
    sequences = [[[], 0.0, torch.tensor([cpp_token_to_id['<sos>']], device=DEVICE), hidden]]
    for _ in range(max_len):
        all_candidates = []
        for seq, score, last_token, hidden in sequences:
            with torch.no_grad():
                output, hidden = model.decoder(last_token, hidden, encoder_outputs)
            probs = torch.log_softmax(output, dim=1)
            topk = torch.topk(probs, beam_width)
            for i in range(beam_width):
                token = topk.indices[0][i]
                prob = topk.values[0][i].item()
                candidate = [seq + [token.item()], score + prob, token.unsqueeze(0), hidden]
                all_candidates.append(candidate)
        sequences = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:beam_width]
        if all(seq[0][-1] == cpp_token_to_id['<eos>'] for seq in sequences):
            break
    return [cpp_id_to_token.get(idx, '<unk>') for idx in sequences[0][0] if idx != cpp_token_to_id['<eos>']]

def compute_bleu(pred_tokens, ref_tokens):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothie)

def evaluate_bleu(model, val_df, num_samples=5):
    total_bleu = 0
    for i in range(min(num_samples, len(val_df))):
        row = val_df.iloc[i]
        src_tensor = torch.tensor(row['py_input_ids'], dtype=torch.long)
        pred = beam_search(model, src_tensor)
        ref = row['cpp_tokens']
        bleu = compute_bleu(pred, ref)
        total_bleu += bleu
        print(f"Example {i+1} BLEU: {bleu:.4f}")
    avg_bleu = total_bleu / num_samples
    print(f"\nAvg BLEU over {num_samples} samples: {avg_bleu:.4f}")
    return avg_bleu

def save_model(model, path="best_model.pt"):
    torch.save(model.state_dict(), path)

def load_model(model, path="best_model.pt"):
    model.load_state_dict(torch.load(path))
    model.eval()

## Training

In [40]:
df = pd.read_csv("python_to_valid_cpp.csv")

df['py_tokens'] = df['code'].apply(tokenize_code)
df['cpp_tokens'] = df['cpp_code'].apply(tokenize_code)

py_token_to_id, py_id_to_token = build_vocab(df['py_tokens'])
cpp_token_to_id, cpp_id_to_token = build_vocab(df['cpp_tokens'])

df['py_input_ids'] = df['py_tokens'].apply(lambda tokens: tokens_to_ids(tokens, py_token_to_id))
df['cpp_input_ids'] = df['cpp_tokens'].apply(lambda tokens: tokens_to_ids(tokens, cpp_token_to_id, add_sos=True))
df['cpp_output_ids'] = df['cpp_tokens'].apply(lambda tokens: tokens_to_ids(tokens, cpp_token_to_id, add_eos=True))

df['py_tensor'] = df['py_input_ids'].apply(lambda x: torch.tensor(x, dtype=torch.long))
df['cpp_in_tensor'] = df['cpp_input_ids'].apply(lambda x: torch.tensor(x, dtype=torch.long))
df['cpp_out_tensor'] = df['cpp_output_ids'].apply(lambda x: torch.tensor(x, dtype=torch.long))

MAX_LEN = 300
df = df[df['py_input_ids'].apply(len) <= MAX_LEN]
df = df[df['cpp_input_ids'].apply(len) <= MAX_LEN]
df = df[df['cpp_output_ids'].apply(len) <= MAX_LEN]


In [50]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_loader = DataLoader(CodeTranslationDataset(train_df), batch_size=1, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(CodeTranslationDataset(val_df), batch_size=1, shuffle=False, collate_fn=collate_fn)


In [53]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_DIM = len(py_token_to_id)
OUTPUT_DIM = len(cpp_token_to_id)
EMB_DIM = 128
HID_DIM = 256

encoder = AttnEncoder(INPUT_DIM, EMB_DIM, HID_DIM).to(DEVICE)
decoder = AttnDecoder(OUTPUT_DIM, EMB_DIM, HID_DIM).to(DEVICE)
model = AttnSeq2Seq(encoder, decoder, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=cpp_token_to_id['<pad>'])


In [54]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [55]:
N_EPOCHS = 30
CLIP = 1.0
best_bleu = 0
early_stopper = EarlyStopping(patience=5)

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    val_loss = evaluate(model, val_loader, criterion)
    bleu = evaluate_bleu(model, val_df, num_samples=5)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | BLEU: {bleu:.4f}")

    if bleu > best_bleu:
        best_bleu = bleu
        save_model(model, "best_model.pt")
        print(" Best model saved")

    early_stopper.step(val_loss)
    if early_stopper.early_stop:
        print(" Early stopping triggered.")
        break




Example 1 BLEU: 0.0252
Example 2 BLEU: 0.0195
Example 3 BLEU: 0.0089
Example 4 BLEU: 0.0071
Example 5 BLEU: 0.0095

Avg BLEU over 5 samples: 0.0140
Epoch 1 | Train Loss: 4.1988 | Val Loss: 3.7532 | BLEU: 0.0140
 Best model saved!




Example 1 BLEU: 0.0643
Example 2 BLEU: 0.0662
Example 3 BLEU: 0.0189
Example 4 BLEU: 0.0909
Example 5 BLEU: 0.0287

Avg BLEU over 5 samples: 0.0538
Epoch 2 | Train Loss: 3.3745 | Val Loss: 3.3667 | BLEU: 0.0538
 Best model saved!




Example 1 BLEU: 0.0747
Example 2 BLEU: 0.0779
Example 3 BLEU: 0.0570
Example 4 BLEU: 0.0615
Example 5 BLEU: 0.0388

Avg BLEU over 5 samples: 0.0620
Epoch 3 | Train Loss: 2.8607 | Val Loss: 3.1166 | BLEU: 0.0620
 Best model saved!




Example 1 BLEU: 0.1149
Example 2 BLEU: 0.1021
Example 3 BLEU: 0.0575
Example 4 BLEU: 0.3992
Example 5 BLEU: 0.0594

Avg BLEU over 5 samples: 0.1466
Epoch 4 | Train Loss: 2.5055 | Val Loss: 2.8949 | BLEU: 0.1466
 Best model saved!




Example 1 BLEU: 0.2612
Example 2 BLEU: 0.1348
Example 3 BLEU: 0.1025
Example 4 BLEU: 0.4146
Example 5 BLEU: 0.0533

Avg BLEU over 5 samples: 0.1933
Epoch 5 | Train Loss: 2.2148 | Val Loss: 2.7866 | BLEU: 0.1933
 Best model saved!




Example 1 BLEU: 0.1706
Example 2 BLEU: 0.3756
Example 3 BLEU: 0.1022
Example 4 BLEU: 0.3694
Example 5 BLEU: 0.0684

Avg BLEU over 5 samples: 0.2172
Epoch 6 | Train Loss: 2.0026 | Val Loss: 2.6750 | BLEU: 0.2172
 Best model saved!




Example 1 BLEU: 0.2080
Example 2 BLEU: 0.3667
Example 3 BLEU: 0.1215
Example 4 BLEU: 0.3820
Example 5 BLEU: 0.1898

Avg BLEU over 5 samples: 0.2536
Epoch 7 | Train Loss: 1.8373 | Val Loss: 2.5011 | BLEU: 0.2536
 Best model saved!




Example 1 BLEU: 0.3413
Example 2 BLEU: 0.3438
Example 3 BLEU: 0.2906
Example 4 BLEU: 0.5048
Example 5 BLEU: 0.1719

Avg BLEU over 5 samples: 0.3305
Epoch 8 | Train Loss: 1.6913 | Val Loss: 2.5115 | BLEU: 0.3305
 Best model saved!




Example 1 BLEU: 0.3674
Example 2 BLEU: 0.4085
Example 3 BLEU: 0.3818
Example 4 BLEU: 0.5050
Example 5 BLEU: 0.2597

Avg BLEU over 5 samples: 0.3845
Epoch 9 | Train Loss: 1.6071 | Val Loss: 2.3815 | BLEU: 0.3845
 Best model saved!




Example 1 BLEU: 0.4738
Example 2 BLEU: 0.4918
Example 3 BLEU: 0.3172
Example 4 BLEU: 0.4895
Example 5 BLEU: 0.1749

Avg BLEU over 5 samples: 0.3894
Epoch 10 | Train Loss: 1.5095 | Val Loss: 2.3067 | BLEU: 0.3894
 Best model saved!




Example 1 BLEU: 0.4707
Example 2 BLEU: 0.4792
Example 3 BLEU: 0.4023
Example 4 BLEU: 0.6541
Example 5 BLEU: 0.3012

Avg BLEU over 5 samples: 0.4615
Epoch 11 | Train Loss: 1.4468 | Val Loss: 2.2540 | BLEU: 0.4615
 Best model saved!




Example 1 BLEU: 0.5151
Example 2 BLEU: 0.5617
Example 3 BLEU: 0.4563
Example 4 BLEU: 0.6470
Example 5 BLEU: 0.4423

Avg BLEU over 5 samples: 0.5245
Epoch 12 | Train Loss: 1.3804 | Val Loss: 2.3051 | BLEU: 0.5245
 Best model saved!




Example 1 BLEU: 0.4302
Example 2 BLEU: 0.7230
Example 3 BLEU: 0.4842
Example 4 BLEU: 0.6393
Example 5 BLEU: 0.4341

Avg BLEU over 5 samples: 0.5422
Epoch 13 | Train Loss: 1.3254 | Val Loss: 2.1484 | BLEU: 0.5422
 Best model saved!




Example 1 BLEU: 0.5233
Example 2 BLEU: 0.6029
Example 3 BLEU: 0.3759
Example 4 BLEU: 0.5931
Example 5 BLEU: 0.3702

Avg BLEU over 5 samples: 0.4931
Epoch 14 | Train Loss: 1.3156 | Val Loss: 2.1338 | BLEU: 0.4931




Example 1 BLEU: 0.4480
Example 2 BLEU: 0.4797
Example 3 BLEU: 0.4130
Example 4 BLEU: 0.7064
Example 5 BLEU: 0.2221

Avg BLEU over 5 samples: 0.4539
Epoch 15 | Train Loss: 1.2908 | Val Loss: 2.1397 | BLEU: 0.4539




Example 1 BLEU: 0.5207
Example 2 BLEU: 0.5175
Example 3 BLEU: 0.5042
Example 4 BLEU: 0.8562
Example 5 BLEU: 0.1843

Avg BLEU over 5 samples: 0.5166
Epoch 16 | Train Loss: 1.2595 | Val Loss: 2.1922 | BLEU: 0.5166




Example 1 BLEU: 0.4714
Example 2 BLEU: 0.6701
Example 3 BLEU: 0.4489
Example 4 BLEU: 0.7991
Example 5 BLEU: 0.4577

Avg BLEU over 5 samples: 0.5694
Epoch 17 | Train Loss: 1.2866 | Val Loss: 2.2562 | BLEU: 0.5694
 Best model saved!




Example 1 BLEU: 0.4686
Example 2 BLEU: 0.5550
Example 3 BLEU: 0.2141
Example 4 BLEU: 0.8036
Example 5 BLEU: 0.3311

Avg BLEU over 5 samples: 0.4745
Epoch 18 | Train Loss: 1.2365 | Val Loss: 2.0532 | BLEU: 0.4745




Example 1 BLEU: 0.5351
Example 2 BLEU: 0.5009
Example 3 BLEU: 0.4559
Example 4 BLEU: 0.9086
Example 5 BLEU: 0.5536

Avg BLEU over 5 samples: 0.5908
Epoch 19 | Train Loss: 1.2340 | Val Loss: 2.1994 | BLEU: 0.5908
 Best model saved!




Example 1 BLEU: 0.6240
Example 2 BLEU: 0.6504
Example 3 BLEU: 0.2393
Example 4 BLEU: 0.8522
Example 5 BLEU: 0.2588

Avg BLEU over 5 samples: 0.5249
Epoch 20 | Train Loss: 1.2207 | Val Loss: 2.2909 | BLEU: 0.5249




Example 1 BLEU: 0.6671
Example 2 BLEU: 0.5888
Example 3 BLEU: 0.2534
Example 4 BLEU: 0.8508
Example 5 BLEU: 0.3851

Avg BLEU over 5 samples: 0.5490
Epoch 21 | Train Loss: 1.2904 | Val Loss: 2.4089 | BLEU: 0.5490




Example 1 BLEU: 0.4820
Example 2 BLEU: 0.2683
Example 3 BLEU: 0.4793
Example 4 BLEU: 0.7308
Example 5 BLEU: 0.1408

Avg BLEU over 5 samples: 0.4203
Epoch 22 | Train Loss: 1.3359 | Val Loss: 2.1708 | BLEU: 0.4203




Example 1 BLEU: 0.5800
Example 2 BLEU: 0.6591
Example 3 BLEU: 0.4852
Example 4 BLEU: 0.7998
Example 5 BLEU: 0.3080

Avg BLEU over 5 samples: 0.5664
Epoch 23 | Train Loss: 1.2586 | Val Loss: 2.0495 | BLEU: 0.5664




Example 1 BLEU: 0.3947
Example 2 BLEU: 0.5643
Example 3 BLEU: 0.4679
Example 4 BLEU: 0.8204
Example 5 BLEU: 0.6732

Avg BLEU over 5 samples: 0.5841
Epoch 24 | Train Loss: 1.2374 | Val Loss: 2.1887 | BLEU: 0.5841




Example 1 BLEU: 0.4800
Example 2 BLEU: 0.7080
Example 3 BLEU: 0.3780
Example 4 BLEU: 0.7982
Example 5 BLEU: 0.5595

Avg BLEU over 5 samples: 0.5847
Epoch 25 | Train Loss: 1.3231 | Val Loss: 2.3065 | BLEU: 0.5847




Example 1 BLEU: 0.6138
Example 2 BLEU: 0.6414
Example 3 BLEU: 0.4360
Example 4 BLEU: 0.8880
Example 5 BLEU: 0.1625

Avg BLEU over 5 samples: 0.5483
Epoch 26 | Train Loss: 1.3028 | Val Loss: 2.3478 | BLEU: 0.5483




Example 1 BLEU: 0.5312
Example 2 BLEU: 0.6677
Example 3 BLEU: 0.4108
Example 4 BLEU: 0.8192
Example 5 BLEU: 0.1515

Avg BLEU over 5 samples: 0.5161
Epoch 27 | Train Loss: 1.3266 | Val Loss: 2.2137 | BLEU: 0.5161




Example 1 BLEU: 0.6639
Example 2 BLEU: 0.7565
Example 3 BLEU: 0.5006
Example 4 BLEU: 0.8100
Example 5 BLEU: 0.1718

Avg BLEU over 5 samples: 0.5806
Epoch 28 | Train Loss: 1.2355 | Val Loss: 2.3721 | BLEU: 0.5806
 Early stopping triggered.


In [56]:
load_model(model, "best_model.pt")

# Try inference
test_row = val_df.iloc[0]
src_tensor = torch.tensor(test_row['py_input_ids'], dtype=torch.long)
predicted_tokens = beam_search(model, src_tensor)
print("Predicted:", " ".join(predicted_tokens))
print("Reference:", " ".join(test_row['cpp_tokens']))


Predicted: maxChunksToSorted ( auto A ) { { , gcd , mx , mn = [ 0 ] * len ( A ) , [ 0 ] * len ( A ) , - 0 float ( inf " ) for i , in enumerate num enumerate ( ) : mx = max ( mx , num ) ) [ i ] = mx for i in range ( len ( A A ) i 1 ) : = = ( mx , , [ i ] ) mx [ i ] = mn for in range ( len ( ( A A ) i 1 ) : : = min ( mn , [ i [ ] ) mx [ i ] = [ for in range ( len ( ( A A ) i 1 ) : if = [ i ] < = [ [ i + 1 ] return i + 1 }
Reference: auto partitionDisjoint ( auto A ) { rMin , lMax , mx , mn = [ 0 ] * len ( A ) , [ 0 ] * len ( A ) , - float ( " inf " ) , float ( " inf " ) for i , num in enumerate ( A ) : mx = max ( mx , num ) lMax [ i ] = mx for i in range ( len ( A ) - 1 , - 1 , - 1 ) : mn = min ( mn , A [ i ] ) rMin [ i ] = mn for i in range ( len ( A ) - 1 ) : if lMax [ i ] < = rMin [ i + 1 ] : return i + 1 }
