**IMPORTING LIBRARIES**

In [4]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import nltk

# Download the 'punkt' tokenizer data
nltk.download('punkt')

# If 'punkt_tab' is still missing, try:
nltk.download('punkt_tab')


# Download punkt tokenizer
nltk.download('punkt')


# Load dataset
file_path = r"/content/spoc-train.csv" # Change to your dataset's path
df = pd.read_csv(file_path)

# Display dataset information
print(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                             text                 code  workerid probid  \
0                             NaN         int main() {         1     3A   
1                 create string s            string s;         1     3A   
2  create integers x1, y1, x2, y2  int x1, y1, x2, y2;         1     3A   
3                          read s            cin >> s;         1     3A   
4             set x1 to s[0] - 96      x1 = s[0] - 96;         1     3A   

      subid  line  indent  
0  41470897     0       0  
1  41470897     1       1  
2  41470897     2       1  
3  41470897     3       1  
4  41470897     4       1  


**PREPROCESSING**

In [5]:
from nltk.tokenize import word_tokenize

df["text"] = df["text"].astype(str)
df["text_tokens"] = df["text"].apply(word_tokenize)

In [3]:
df["text"] = df["text"].fillna("")  # Replace NaN with empty strings
df["text_tokens"] = df["text"].apply(word_tokenize)

In [6]:
df["text_tokens"] = df["text"].apply(lambda x: word_tokenize(x) if isinstance(x, str) else [])

In [7]:
print(df["text"].dtype)  # Check the column's data type
print(df["text"].isna().sum())  # Count missing values
print(df[df["text"].apply(lambda x: not isinstance(x, str))])  # Show non-string values

object
0
Empty DataFrame
Columns: [text, code, workerid, probid, subid, line, indent, text_tokens]
Index: []


In [8]:
import nltk
from nltk.tokenize import word_tokenize

# Download tokenizer if not available
nltk.download('punkt')

# Tokenizing pseudocode and code
df["text_tokens"] = df["text"].apply(word_tokenize)
df["code_tokens"] = df["code"].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**TOKENIZATION**

In [10]:
# Print samples from index 1-10
print("Samples from index 1-10:")
for i in range(1, 11):
    print(f"Index {i}:")
    print("Tokenized Pseudocode:", df["text_tokens"].iloc[i])
    print("Tokenized C++ Code:", df["code_tokens"].iloc[i])
    print("-" * 50)

# Print samples from index 20-30
print("\nSamples from index 20-30:")
for i in range(20, 31):
    print(f"Index {i}:")
    print("Tokenized Pseudocode:", df["text_tokens"].iloc[i])
    print("Tokenized C++ Code:", df["code_tokens"].iloc[i])
    print("-" * 50)

Samples from index 1-10:
Index 1:
Tokenized Pseudocode: ['create', 'string', 's']
Tokenized C++ Code: ['string', 's', ';']
--------------------------------------------------
Index 2:
Tokenized Pseudocode: ['create', 'integers', 'x1', ',', 'y1', ',', 'x2', ',', 'y2']
Tokenized C++ Code: ['int', 'x1', ',', 'y1', ',', 'x2', ',', 'y2', ';']
--------------------------------------------------
Index 3:
Tokenized Pseudocode: ['read', 's']
Tokenized C++ Code: ['cin', '>', '>', 's', ';']
--------------------------------------------------
Index 4:
Tokenized Pseudocode: ['set', 'x1', 'to', 's', '[', '0', ']', '-', '96']
Tokenized C++ Code: ['x1', '=', 's', '[', '0', ']', '-', '96', ';']
--------------------------------------------------
Index 5:
Tokenized Pseudocode: ['set', 'y1', 'to', 's', '[', '1', ']', '-', "'", '0', "'"]
Tokenized C++ Code: ['y1', '=', 's', '[', '1', ']', '-', "'", '0', "'", ';']
--------------------------------------------------
Index 6:
Tokenized Pseudocode: ['read', 's']
T

In [11]:
# Save tokenized pseudocode and C++ code to CSV
output_file = "tokenized_spoc.csv"
df[["text_tokens", "code_tokens"]].to_csv(output_file, index=False)

print(f"Tokenized data saved to {output_file}")

Tokenized data saved to tokenized_spoc.csv


In [12]:
# Add start and end tokens to tokenized C++ code
df["code_tokens"] = df["code_tokens"].apply(lambda tokens: ["<start>"] + tokens + ["<end>"])

# Save updated tokenized data to CSV
output_file = "tokenized_spoc_with_tokens.csv"
df[["text_tokens", "code_tokens"]].to_csv(output_file, index=False)

print(f"Updated tokenized data saved to {output_file}")

Updated tokenized data saved to tokenized_spoc_with_tokens.csv


In [13]:
# Make "text_tokens" and "code_tokens" length same by padding with "<pad>"
max_len = max(df["text_tokens"].apply(len).max(), df["code_tokens"].apply(len).max())

df["text_tokens"] = df["text_tokens"].apply(lambda tokens: tokens + ["<pad>"] * (max_len - len(tokens)))
df["code_tokens"] = df["code_tokens"].apply(lambda tokens: tokens + ["<pad>"] * (max_len - len(tokens)))

# Save padded tokenized data to CSV
output_file = "tokenized_spoc_padded.csv"
df[["text_tokens", "code_tokens"]].to_csv(output_file, index=False)

print(f"Padded tokenized data saved to {output_file}")

Padded tokenized data saved to tokenized_spoc_padded.csv


In [14]:
import json

# Define special tokens with fixed indices
vocab = {
    "<unk>": 0,
    "<pad>": 1,
    "<start>": 2,
    "<end>": 3
}

# Assign indices to other tokens
for column in ["text_tokens", "code_tokens"]:
    for tokens in df[column]:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

# Save vocabulary to JSON
vocab_file = "vocabulary.json"
with open(vocab_file, "w") as f:
    json.dump(vocab, f, indent=4)

print(f"Vocabulary saved to {vocab_file}")

Vocabulary saved to vocabulary.json


In [15]:
# Load vocabulary
with open("vocabulary.json", "r") as f:
    vocab = json.load(f)

# Load tokenized data
df = pd.read_csv("tokenized_spoc_padded.csv")

# Convert string tokens to lists
df["text_tokens"] = df["text_tokens"].apply(eval)
df["code_tokens"] = df["code_tokens"].apply(eval)

# Convert tokens to sequences using vocabulary
df["text_sequences"] = df["text_tokens"].apply(lambda tokens: [vocab.get(token, vocab["<unk>"]) for token in tokens])
df["code_sequences"] = df["code_tokens"].apply(lambda tokens: [vocab.get(token, vocab["<unk>"]) for token in tokens])

# Save sequences to CSV
output_file = "tokenized_sequences.csv"
df[["text_sequences", "code_sequences"]].to_csv(output_file, index=False)

print(f"Tokenized sequences saved to {output_file}")

Tokenized sequences saved to tokenized_sequences.csv


In [16]:
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torch
import ast
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

class DataLoad(Dataset):
    def __init__(self, file_path):
        df = pd.read_csv(file_path)
        self.inputs = [ast.literal_eval(x) for x in df['text_sequences']]
        self.outputs = [ast.literal_eval(x) for x in df['code_sequences']]

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_tensor = torch.tensor(self.inputs[idx], dtype=torch.int64)
        output_tensor = torch.tensor(self.outputs[idx], dtype=torch.int64)
        return input_tensor, output_tensor

def Add_Pad(batch):
    inputs, outputs = zip(*batch)
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    outputs = pad_sequence(outputs, batch_first=True, padding_value=0)
    return inputs, outputs

# Load dataset and dataloader
dataset = DataLoad("tokenized_sequences.csv")
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=Add_Pad)

# Iterate with progress bar
data_iter = iter(dataloader)
for batch in tqdm(dataloader, desc="Loading Batches"):
    features, labels = batch  # Get a batch of data
    break  # Remove this if you want to iterate over all batches

print("Batch loaded successfully!")

Loading Batches:   0%|          | 0/3846 [00:00<?, ?it/s]

Batch loaded successfully!





In [17]:
print(features)
print(labels)

tensor([[ 73,   1,   1,  ...,   1,   1,   1],
        [ 77, 686,  16,  ...,   1,   1,   1],
        [ 44, 106,  92,  ...,   1,   1,   1],
        ...,
        [  4,   1,   1,  ...,   1,   1,   1],
        [ 41,  78,   1,  ...,   1,   1,   1],
        [  4,   1,   1,  ...,   1,   1,   1]])
tensor([[   2,  328,   73,  ...,    1,    1,    1],
        [   2,   77,   50,  ...,    1,    1,    1],
        [   2,  106, 2367,  ...,    1,    1,    1],
        ...,
        [   2,  328,    3,  ...,    1,    1,    1],
        [   2, 2823,  208,  ...,    1,    1,    1],
        [   2,  328,    3,  ...,    1,    1,    1]])


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

# Transformer Hyperparameters
class Config:
    vocab_size = 12006  # Adjust based on vocabulary.json
    max_length = 100  # Adjust based on sequence length
    embed_dim = 256
    num_heads = 8
    num_layers =2
    feedforward_dim = 512
    dropout = 0.1
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = Config()

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=100):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)  # Shape: (1, max_len, embed_dim)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

# Transformer Model
class PseudoCodeTransformer(nn.Module):
    def __init__(self, config):
        super(PseudoCodeTransformer, self).__init__()
        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)
        self.positional_encoding = PositionalEncoding(config.embed_dim, config.max_length)

        self.transformer = nn.Transformer(
            d_model=config.embed_dim,
            nhead=config.num_heads,
            num_encoder_layers=config.num_layers,
            num_decoder_layers=config.num_layers,
            dim_feedforward=config.feedforward_dim,
            dropout=config.dropout
        )

        self.fc_out = nn.Linear(config.embed_dim, config.vocab_size)
        self.dropout = nn.Dropout(config.dropout)

    def generate_square_subsequent_mask(self, sz):
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(config.device)

    def forward(self, src, tgt):
        src_emb = self.embedding(src) * math.sqrt(config.embed_dim)
        tgt_emb = self.embedding(tgt) * math.sqrt(config.embed_dim)

        src_emb = self.positional_encoding(src_emb)
        tgt_emb = self.positional_encoding(tgt_emb)

        src_mask = self.generate_square_subsequent_mask(src.size(1))
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1))

        out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),
                               src_mask=src_mask, tgt_mask=tgt_mask)

        out = self.fc_out(out.permute(1, 0, 2))  # Convert back to batch-first
        return out

# Initialize Model
model = PseudoCodeTransformer(config).to(config.device)
print("Model initialized successfully!")



Model initialized successfully!


In [19]:
def translate(model, pseudocode_tokens, vocab, device, max_length=50):
        model.eval()

        # Convert pseudocode tokens to numerical indices
        input_ids = [vocab.get(token, vocab["<unk>"]) for token in pseudocode_tokens]
        input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension

        # Start token for generation
        output_ids = [vocab["<start>"]]

        for _ in range(max_length):
            output_tensor = torch.tensor(output_ids, dtype=torch.long).unsqueeze(0).to(device)

            # Get model predictions
            with torch.no_grad():
                predictions = model(input_tensor, output_tensor)

            # Select the most probable token
            next_token_id = predictions.argmax(dim=-1)[:, -1].item()
            output_ids.append(next_token_id)

            # Stop if end token is generated
            if next_token_id == vocab["<end>"]:
                break

        # Convert token indices back to words
        id_to_token = {idx: token for token, idx in vocab.items()}
        translated_code = [id_to_token.get(idx, "<unk>") for idx in output_ids[1:]]  # Exclude <start> token

        return " ".join(translated_code)

In [20]:
import json

# Load vocabulary
with open("vocabulary.json", "r") as f:
    vocab = json.load(f)

# Ensure vocab is a dictionary
print(f"✅ Vocabulary loaded with {len(vocab)} tokens")

✅ Vocabulary loaded with 12006 tokens


**PSEUDOCODE TO C++**

In [21]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm
import os

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔹 Using device: {device}")

# Move model to device
model.to(device)

# Loss Function & Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=1)  # Ignore padding token
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)

# Create directory to save models
os.makedirs("checkpoints", exist_ok=True)

# Training Loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        src, tgt = batch
        src, tgt = src.to(device), tgt.to(device)  # Move batch to GPU

        tgt_input = tgt[:, :-1]  # Remove <end> token
        tgt_output = tgt[:, 1:]  # Shifted version

        optimizer.zero_grad()
        output = model(src, tgt_input)

        loss = criterion(output.view(-1, config.vocab_size), tgt_output.contiguous().view(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Save Model Checkpoint
    torch.save(model.state_dict(), f"checkpoints/p2c{epoch+1}.pth")
    print(f"✅ Model saved: checkpoints/p2c{epoch+1}.pth")

    # Print Example Prediction
    model.eval()
    example_pseudocode = ["create", "integer", "x"]
    translated_code = translate(model, example_pseudocode, vocab, device)
    print(f"🔹 Example Prediction (Pseudocode → C++): {translated_code}\n")

🔹 Using device: cuda


Epoch 1/1: 100%|██████████| 3846/3846 [06:17<00:00, 10.20it/s, loss=0.248]


Epoch [1/1], Loss: 0.9357
✅ Model saved: checkpoints/p2c1.pth
🔹 Example Prediction (Pseudocode → C++): int x , int x , int x , int x , int x , int x , int x int x , int x , int x , int x , int x , int x , int x , int x int x , int x , int



In [30]:
model.eval()

PseudoCodeTransformer(
  (embedding): Embedding(12006, 256)
  (positional_encoding): PositionalEncoding()
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleLi

In [31]:
# Load the trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔹 Using device: {device}")

# Move model to device
model.to(device)
model = PseudoCodeTransformer(config).to(device)
model.load_state_dict(torch.load("/content/checkpoints/p2c1.pth", map_location=device))
model.eval()

# Run translation on example pseudocode
example_pseudocode = ["for", "i", "=", "0", "to", "size", "of", "ans", "exclusive", ",", "print", "ans", "[", "i", "]", "print", "newline"]
translated_code = translate(model, example_pseudocode, vocab, device)

print(f"🔹 Example Prediction (Pseudocode → C++): {translated_code}\n")

🔹 Using device: cuda
🔹 Example Prediction (Pseudocode → C++): for ( int i = 0 ; i < ans.size ( ( ) ; i++ ) { cout < < ans ans ans ( ans ( ans ( ans ( ans ) ) ; } <end>



  model.load_state_dict(torch.load("/content/checkpoints/p2c1.pth", map_location=device))


**C++ CODE TO PSEUDOCODE**

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import pandas as pd
import ast
import json
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import os

# Transformer Hyperparameters
class Config:
    vocab_size = 12006  # Adjust based on vocabulary.json
    max_length = 100
    embed_dim = 256
    num_heads = 8
    num_layers = 2
    feedforward_dim = 512
    dropout = 0.1
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = Config()

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=100):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

# Transformer Model
class CPPtoPseudoTransformer(nn.Module):
    def __init__(self, config):
        super(CPPtoPseudoTransformer, self).__init__()
        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)
        self.positional_encoding = PositionalEncoding(config.embed_dim, config.max_length)

        self.transformer = nn.Transformer(
            d_model=config.embed_dim,
            nhead=config.num_heads,
            num_encoder_layers=config.num_layers,
            num_decoder_layers=config.num_layers,
            dim_feedforward=config.feedforward_dim,
            dropout=config.dropout
        )

        self.fc_out = nn.Linear(config.embed_dim, config.vocab_size)
        self.dropout = nn.Dropout(config.dropout)

    def generate_square_subsequent_mask(self, sz):
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(config.device)

    def forward(self, src, tgt):
        src_emb = self.embedding(src) * math.sqrt(config.embed_dim)
        tgt_emb = self.embedding(tgt) * math.sqrt(config.embed_dim)

        src_emb = self.positional_encoding(src_emb)
        tgt_emb = self.positional_encoding(tgt_emb)

        src_mask = self.generate_square_subsequent_mask(src.size(1))
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1))

        out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),
                               src_mask=src_mask, tgt_mask=tgt_mask)

        out = self.fc_out(out.permute(1, 0, 2))
        return out

# Initialize Model
model = CPPtoPseudoTransformer(config).to(config.device)
print("🚀 C++ → Pseudocode Model initialized!")

🚀 C++ → Pseudocode Model initialized!


In [33]:
# Load Vocabulary
with open("vocabulary.json", "r") as f:
    vocab = json.load(f)

print(f"✅ Vocabulary loaded with {len(vocab)} tokens")

✅ Vocabulary loaded with 12006 tokens


In [34]:
def translate2(model, cpp_tokens, vocab, device, max_length=50):
    model.eval()

    # Convert C++ tokens to numerical indices
    input_ids = [vocab.get(token, vocab["<unk>"]) for token in cpp_tokens]
    input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension

    output_ids = [vocab["<start>"]]

    for _ in range(max_length):
        output_tensor = torch.tensor(output_ids, dtype=torch.long).unsqueeze(0).to(device)

        # Get model predictions
        with torch.no_grad():
            predictions = model(input_tensor, output_tensor)

        # Select the most probable token
        next_token_id = predictions.argmax(dim=-1)[:, -1].item()

        if next_token_id == vocab["<pad>"]:  # Ignore <pad> tokens
            continue

        output_ids.append(next_token_id)

        if next_token_id == vocab["<end>"]:  # Stop if <end> is generated
            break

    # Convert token indices back to words
    id_to_token = {idx: token for token, idx in vocab.items()}
    translated_pseudocode = [id_to_token.get(idx, "<unk>") for idx in output_ids[1:]]  # Exclude <start>

    return " ".join(translated_pseudocode)

In [35]:
# Dataset Class
class CPPToPseudoDataset(Dataset):
    def __init__(self, file_path):
        df = pd.read_csv(file_path)
        self.inputs = [ast.literal_eval(x) for x in df['code_sequences']]
        self.outputs = [ast.literal_eval(x) for x in df['text_sequences']]

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_tensor = torch.tensor(self.inputs[idx], dtype=torch.int64)
        output_tensor = torch.tensor([vocab["<start>"]] + self.outputs[idx] + [vocab["<end>"]], dtype=torch.int64)
        return input_tensor, output_tensor

# Padding Function
def Add_Pad(batch):
    inputs, outputs = zip(*batch)
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    outputs = pad_sequence(outputs, batch_first=True, padding_value=0)
    return inputs, outputs

# Load Dataset
dataset = CPPToPseudoDataset("tokenized_sequences.csv")
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=Add_Pad)

print(f"✅ Loaded {len(dataset)} examples for training")

# Training Configuration
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)

# Create directory to save models
os.makedirs("checkpoints", exist_ok=True)

✅ Loaded 246086 examples for training


In [36]:
# Training Loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        src, tgt = batch
        src, tgt = src.to(config.device), tgt.to(config.device)

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        optimizer.zero_grad()
        output = model(src, tgt_input)

        loss = criterion(output.view(-1, config.vocab_size), tgt_output.contiguous().view(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Save Model Checkpoint
    torch.save(model.state_dict(), f"checkpoints/c2p{epoch+1}.pth")
    print(f"✅ Model saved: checkpoints/c2p{epoch+1}.pth")

    # Print Example Prediction
    model.eval()
    example_cpp = ["int", "main", "(", ")", "{", "return", "0", ";", "}"]
    translated_pseudocode = translate2(model, example_cpp, vocab, config.device)
    print(f"🔹 Example Prediction (C++ → Pseudocode): {translated_pseudocode}\n")

Epoch 1/1: 100%|██████████| 3846/3846 [06:27<00:00,  9.93it/s, loss=0.358]


Epoch [1/1], Loss: 1.4150
✅ Model saved: checkpoints/c2p1.pth
🔹 Example Prediction (C++ → Pseudocode): integer function isprime with integer argument integer argument size of size of size of size of size of size of a integer as integer as integer casted size of size of size of size of size of a integer as integer casted size of size of size of size of



In [37]:
# Load the trained model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔹 Using device: {device}")

# Move model to device
model.to(device)

model = PseudoCodeTransformer(config).to(device)
model.load_state_dict(torch.load("/content/checkpoints/c2p1.pth", map_location=device))
model.eval()

example_cpp = ["int", "a","=", "10",";"]
translated_pseudocode = translate2(model, example_cpp, vocab, config.device)
print(f"🔹 Example Prediction (C++ → Pseudocode): {translated_pseudocode}\n")

🔹 Using device: cuda


  model.load_state_dict(torch.load("/content/checkpoints/c2p1.pth", map_location=device))


🔹 Example Prediction (C++ → Pseudocode): a = integer , a = 10 = 10 = 10 = 10 integer with a = 10 = 10 = 10 = 10 = 10 = 10 = 10 = 10 integer with a = 10 = 10 = 10 = 10 = 10 = 10 = 10 integer

