## LSTM

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import os

# Load dataset
df = pd.read_csv('python_to_valid_cpp.csv')

## Data Preprocesing using the Tree_sitter tokenizer

In [2]:
# Load Tree-sitter tokenizer
from tree_sitter import Language, Parser

# Load precompiled language library (make sure it's already built)
PY_LANGUAGE = Language("build/my-languages.dll", "python")
CPP_LANGUAGE = Language("build/my-languages.dll", "cpp")

def tree_sitter_tokenize(code: str, language: str = "python") -> list:
    parser = Parser()
    parser.set_language(PY_LANGUAGE if language == "python" else CPP_LANGUAGE)
    tree = parser.parse(bytes(code, "utf8"))
    root = tree.root_node

    tokens = []
    def extract_tokens(node):
        if node.child_count == 0:
            tokens.append(code[node.start_byte:node.end_byte])
        for child in node.children:
            extract_tokens(child)
    extract_tokens(root)
    return tokens

In [3]:
SPECIAL_TOKENS = ['<pad>', '<sos>', '<eos>', '<unk>']

# Wrapper to make tokenizer API consistent
def tokenize_code(code, lang="python"):
    return tree_sitter_tokenize(code, language=lang)

def build_vocab(token_lists, max_vocab_size=10000):
    all_tokens = list(chain.from_iterable(token_lists))
    token_counts = Counter(all_tokens)
    most_common = token_counts.most_common(max_vocab_size - len(SPECIAL_TOKENS))
    vocab = SPECIAL_TOKENS + [token for token, _ in most_common]
    token_to_id = {token: idx for idx, token in enumerate(vocab)}
    id_to_token = {idx: token for token, idx in token_to_id.items()}
    return token_to_id, id_to_token

def tokens_to_ids(tokens, token_to_id, add_sos=False, add_eos=False):
    ids = [token_to_id.get(token, token_to_id['<unk>']) for token in tokens]
    if add_sos:
        ids = [token_to_id['<sos>']] + ids
    if add_eos:
        ids += [token_to_id['<eos>']]
    return ids

### Dataset and Dataloader

In [4]:
class CodeTranslationDataset(Dataset):
    def __init__(self, df):
        self.encoder_inputs = df['py_tensor'].tolist()
        self.decoder_inputs = df['cpp_in_tensor'].tolist()
        self.decoder_outputs = df['cpp_out_tensor'].tolist()

    def __len__(self):
        return len(self.encoder_inputs)

    def __getitem__(self, idx):
        return self.encoder_inputs[idx], self.decoder_inputs[idx], self.decoder_outputs[idx]

def collate_fn(batch):
    enc_inputs, dec_inputs, dec_outputs = zip(*batch)
    enc_inputs_pad = pad_sequence(enc_inputs, batch_first=True, padding_value=py_token_to_id['<pad>'])
    dec_inputs_pad = pad_sequence(dec_inputs, batch_first=True, padding_value=cpp_token_to_id['<pad>'])
    dec_outputs_pad = pad_sequence(dec_outputs, batch_first=True, padding_value=cpp_token_to_id['<pad>'])
    return enc_inputs_pad, dec_inputs_pad, dec_outputs_pad

### Attention-based LSTM Model

In [5]:
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Parameter(torch.rand(hid_dim))

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.size(1)
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = torch.sum(self.v * energy, dim=2)
        return torch.softmax(attention, dim=1)


class AttnDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(hid_dim + emb_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attn = Attention(hid_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        a = self.attn(hidden[-1], encoder_outputs).unsqueeze(1)
        weighted = torch.bmm(a, encoder_outputs)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))
        prediction = self.fc_out(torch.cat((output.squeeze(1), weighted.squeeze(1)), dim=1))
        return prediction, hidden, cell


class AttnEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell


class AttnSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.embedding.num_embeddings

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs


## Training Setup

In [6]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def step(self, val_loss):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

def train(model, dataloader, optimizer, criterion, clip, teacher_forcing_ratio=0.5):
    model.train()
    epoch_loss = 0
    for src, trg_in, trg_out in tqdm(dataloader, desc="Training", leave=False):
        src, trg_in, trg_out = src.to(DEVICE), trg_in.to(DEVICE), trg_out.to(DEVICE)
        optimizer.zero_grad()
        
        # Pass the teacher forcing ratio to the model
        output = model(src, trg_in, teacher_forcing_ratio)
        
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg_out = trg_out[:, 1:].reshape(-1)
        loss = criterion(output, trg_out)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        torch.cuda.empty_cache()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, trg_in, trg_out in dataloader:
            src, trg_in, trg_out = src.to(DEVICE), trg_in.to(DEVICE), trg_out.to(DEVICE)
            output = model(src, trg_in, teacher_forcing_ratio=0.0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg_out = trg_out[:, 1:].reshape(-1)
            loss = criterion(output, trg_out)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)


### Beam Search & BLEU

In [7]:
def beam_search(model, src_tensor, beam_width=3, max_len=300):
    model.eval()
    src_tensor = src_tensor.unsqueeze(0).to(DEVICE)

    # Encode the source
    encoder_outputs, hidden, cell = model.encoder(src_tensor)

    # Start with <sos> token
    sequences = [[[], 0.0, torch.tensor([cpp_token_to_id['<sos>']], device=DEVICE), hidden, cell]]

    for _ in range(max_len):
        all_candidates = []

        for seq, score, last_token, hidden, cell in sequences:
            with torch.no_grad():
                output, hidden, cell = model.decoder(last_token, hidden, cell, encoder_outputs)

            probs = torch.log_softmax(output, dim=1)
            topk = torch.topk(probs, beam_width)

            for i in range(beam_width):
                token = topk.indices[0][i]
                prob = topk.values[0][i].item()

                candidate = [seq + [token.item()], score + prob, token.unsqueeze(0), hidden, cell]
                all_candidates.append(candidate)

        sequences = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:beam_width]

        if all(seq[0][-1] == cpp_token_to_id['<eos>'] for seq in sequences):
            break

    return [cpp_id_to_token.get(idx, '<unk>') for idx in sequences[0][0] if idx != cpp_token_to_id['<eos>']]


def compute_bleu(pred_tokens, ref_tokens):
    smoothie = SmoothingFunction().method4
    return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothie)

def evaluate_bleu(model, val_df, num_samples=5):
    total_bleu = 0
    for i in range(min(num_samples, len(val_df))):
        row = val_df.iloc[i]
        src_tensor = torch.tensor(row['py_input_ids'], dtype=torch.long)
        pred = beam_search(model, src_tensor)
        ref = row['cpp_tokens']
        bleu = compute_bleu(pred, ref)
        total_bleu += bleu
        print(f"Example {i+1} BLEU: {bleu:.4f}")
    avg_bleu = total_bleu / num_samples
    print(f"\nAvg BLEU over {num_samples} samples: {avg_bleu:.4f}")
    return avg_bleu

def save_model(model, path="best_model.pt"):
    torch.save(model.state_dict(), path)

def load_model(model, path="best_model.pt"):
    model.load_state_dict(torch.load(path))
    model.eval()

## Training

In [8]:
df = pd.read_csv("python_to_valid_cpp.csv")

# Use Tree-sitter tokenizer explicitly for each language
df['py_tokens'] = df['code'].apply(lambda x: tokenize_code(x, lang="python"))
df['cpp_tokens'] = df['cpp_code'].apply(lambda x: tokenize_code(x, lang="cpp"))

# Build vocabularies
py_token_to_id, py_id_to_token = build_vocab(df['py_tokens'])
cpp_token_to_id, cpp_id_to_token = build_vocab(df['cpp_tokens'])

# Convert tokens to input/output ID sequences
df['py_input_ids'] = df['py_tokens'].apply(lambda tokens: tokens_to_ids(tokens, py_token_to_id))
df['cpp_input_ids'] = df['cpp_tokens'].apply(lambda tokens: tokens_to_ids(tokens, cpp_token_to_id, add_sos=True))
df['cpp_output_ids'] = df['cpp_tokens'].apply(lambda tokens: tokens_to_ids(tokens, cpp_token_to_id, add_eos=True))

# Convert to PyTorch tensors
df['py_tensor'] = df['py_input_ids'].apply(lambda x: torch.tensor(x, dtype=torch.long))
df['cpp_in_tensor'] = df['cpp_input_ids'].apply(lambda x: torch.tensor(x, dtype=torch.long))
df['cpp_out_tensor'] = df['cpp_output_ids'].apply(lambda x: torch.tensor(x, dtype=torch.long))

# Filter out long sequences
MAX_LEN = 300
df = df[df['py_input_ids'].apply(len) <= MAX_LEN]
df = df[df['cpp_input_ids'].apply(len) <= MAX_LEN]
df = df[df['cpp_output_ids'].apply(len) <= MAX_LEN]

#### Train-validation split

In [9]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_loader = DataLoader(CodeTranslationDataset(train_df), batch_size=1, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(CodeTranslationDataset(val_df), batch_size=1, shuffle=False, collate_fn=collate_fn)


In [10]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INPUT_DIM = len(py_token_to_id)
OUTPUT_DIM = len(cpp_token_to_id)
EMB_DIM = 128
HID_DIM = 256

# Use the LSTM-based encoder and decoder defined earlier
encoder = AttnEncoder(INPUT_DIM, EMB_DIM, HID_DIM).to(DEVICE)
decoder = AttnDecoder(OUTPUT_DIM, EMB_DIM, HID_DIM).to(DEVICE)
model = AttnSeq2Seq(encoder, decoder, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=cpp_token_to_id['<pad>'])


In [11]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [12]:
N_EPOCHS = 30
CLIP = 1.0
best_bleu = 0
early_stopper = EarlyStopping(patience=5)

for epoch in range(N_EPOCHS):
    teacher_forcing_ratio = 0.5  # (optional: decay this over epochs if you want)

    train_loss = train(model, train_loader, optimizer, criterion, CLIP, teacher_forcing_ratio)
    val_loss = evaluate(model, val_loader, criterion)
    bleu = evaluate_bleu(model, val_df, num_samples=5)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | BLEU: {bleu:.4f}")

    if bleu > best_bleu:
        best_bleu = bleu
        save_model(model, "best_model.pt")
        print(" Best model saved")

    early_stopper.step(val_loss)
    if early_stopper.early_stop:
        print(" Early stopping triggered.")
        break

                                                           

Example 1 BLEU: 0.0264
Example 2 BLEU: 0.0067
Example 3 BLEU: 0.0308
Example 4 BLEU: 0.0172
Example 5 BLEU: 0.0074

Avg BLEU over 5 samples: 0.0177
Epoch 1 | Train Loss: 4.3369 | Val Loss: 4.2380 | BLEU: 0.0177
 Best model saved


                                                           

Example 1 BLEU: 0.0432
Example 2 BLEU: 0.0317
Example 3 BLEU: 0.0390
Example 4 BLEU: 0.0122
Example 5 BLEU: 0.0408

Avg BLEU over 5 samples: 0.0334
Epoch 2 | Train Loss: 3.5952 | Val Loss: 3.9785 | BLEU: 0.0334
 Best model saved


                                                           

Example 1 BLEU: 0.0587
Example 2 BLEU: 0.0391
Example 3 BLEU: 0.1107
Example 4 BLEU: 0.0635
Example 5 BLEU: 0.0568

Avg BLEU over 5 samples: 0.0657
Epoch 3 | Train Loss: 3.1286 | Val Loss: 3.8210 | BLEU: 0.0657
 Best model saved


                                                           

Example 1 BLEU: 0.0321
Example 2 BLEU: 0.0554
Example 3 BLEU: 0.0612
Example 4 BLEU: 0.0647
Example 5 BLEU: 0.0820

Avg BLEU over 5 samples: 0.0591
Epoch 4 | Train Loss: 2.7764 | Val Loss: 3.7559 | BLEU: 0.0591


                                                           

Example 1 BLEU: 0.0963
Example 2 BLEU: 0.0752
Example 3 BLEU: 0.0690
Example 4 BLEU: 0.0837
Example 5 BLEU: 0.0873

Avg BLEU over 5 samples: 0.0823
Epoch 5 | Train Loss: 2.4771 | Val Loss: 3.7515 | BLEU: 0.0823
 Best model saved


                                                           

Example 1 BLEU: 0.1114
Example 2 BLEU: 0.1220
Example 3 BLEU: 0.0886
Example 4 BLEU: 0.1045
Example 5 BLEU: 0.2868

Avg BLEU over 5 samples: 0.1427
Epoch 6 | Train Loss: 2.2656 | Val Loss: 3.7614 | BLEU: 0.1427
 Best model saved


                                                           

Example 1 BLEU: 0.1044
Example 2 BLEU: 0.1267
Example 3 BLEU: 0.2625
Example 4 BLEU: 0.1125
Example 5 BLEU: 0.3317

Avg BLEU over 5 samples: 0.1876
Epoch 7 | Train Loss: 2.0743 | Val Loss: 3.7730 | BLEU: 0.1876
 Best model saved


                                                           

Example 1 BLEU: 0.2713
Example 2 BLEU: 0.2021
Example 3 BLEU: 0.0849
Example 4 BLEU: 0.1253
Example 5 BLEU: 0.2917

Avg BLEU over 5 samples: 0.1951
Epoch 8 | Train Loss: 1.9072 | Val Loss: 3.7342 | BLEU: 0.1951
 Best model saved


                                                           

Example 1 BLEU: 0.1876
Example 2 BLEU: 0.0601
Example 3 BLEU: 0.2036
Example 4 BLEU: 0.0796
Example 5 BLEU: 0.2405

Avg BLEU over 5 samples: 0.1543
Epoch 9 | Train Loss: 1.7901 | Val Loss: 3.8436 | BLEU: 0.1543


                                                           

Example 1 BLEU: 0.2551
Example 2 BLEU: 0.1850
Example 3 BLEU: 0.2704
Example 4 BLEU: 0.1152
Example 5 BLEU: 0.3128

Avg BLEU over 5 samples: 0.2277
Epoch 10 | Train Loss: 1.7041 | Val Loss: 3.7451 | BLEU: 0.2277
 Best model saved


                                                           

Example 1 BLEU: 0.3892
Example 2 BLEU: 0.1713
Example 3 BLEU: 0.2396
Example 4 BLEU: 0.3625
Example 5 BLEU: 0.4914

Avg BLEU over 5 samples: 0.3308
Epoch 11 | Train Loss: 1.5646 | Val Loss: 3.8167 | BLEU: 0.3308
 Best model saved


                                                           

Example 1 BLEU: 0.2134
Example 2 BLEU: 0.2949
Example 3 BLEU: 0.3331
Example 4 BLEU: 0.1528
Example 5 BLEU: 0.4336

Avg BLEU over 5 samples: 0.2856
Epoch 12 | Train Loss: 1.4973 | Val Loss: 3.7422 | BLEU: 0.2856


                                                           

Example 1 BLEU: 0.3668
Example 2 BLEU: 0.2618
Example 3 BLEU: 0.1029
Example 4 BLEU: 0.1617
Example 5 BLEU: 0.5012

Avg BLEU over 5 samples: 0.2789
Epoch 13 | Train Loss: 1.4348 | Val Loss: 3.9821 | BLEU: 0.2789
 Early stopping triggered.


In [13]:
load_model(model, "best_model.pt")

# Try inference
test_row = val_df.iloc[0]
src_tensor = torch.tensor(test_row['py_input_ids'], dtype=torch.long)
predicted_tokens = beam_search(model, src_tensor)
print("Predicted:", " ".join(predicted_tokens))
print("Reference:", " ".join(test_row['cpp_tokens']))


  model.load_state_dict(torch.load(path))


Predicted: removeElement ( auto A ) { " , " , mx , mn = [ 0 ] len ( A ) , [ 0 ] len len A ) , [ 0 ] len ( A ) , float ( " inf ) ) i , i in ( enumerate A A ) : mx = max ( mx mx num , ) ) [ i ] i for in range ( len ( A ) - 1 ) : if in [ i ] <= <= [ i + <= <= [ i + <= : return i + 1 }
Reference: auto partitionDisjoint ( auto A ) { rMin , lMax , mx , mn = [ 0 ] * len ( A ) , [ 0 ] * len ( A ) , - float ( " inf " ) , float ( " inf " ) for i , num in enumerate ( A ) : mx = max ( mx , num ) lMax [ i ] = mx for i in range ( len ( A ) - 1 , -1 , -1 ) : mn = min ( mn , A [ i ] ) rMin [ i ] = mn for i in range ( len ( A ) - 1 ) : if lMax [ i ] <= rMin [ i + 1 ] : return i + 1 } 
