In [None]:
# Import necessary libraries
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import os

# Connect to TPU
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

# Load dataset
df = pd.read_csv("spoc_cleaned_final.csv")

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB_SIZE = 10000
MAX_SEQ_LENGTH = 150

# Initialize tokenizers
tokenizer_pseudo = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='', oov_token="<OOV>")
tokenizer_cpp = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='', oov_token="<OOV>")

# Fit tokenizers
tokenizer_pseudo.fit_on_texts(df["pseudocode"].astype(str))
tokenizer_cpp.fit_on_texts(df["cpp_code_cleaned"].astype(str))

# Convert to sequences
pseudo_seq = tokenizer_pseudo.texts_to_sequences(df["pseudocode"].astype(str))
cpp_seq = tokenizer_cpp.texts_to_sequences(df["cpp_code_cleaned"].astype(str))

# Pad sequences
pseudo_seq = pad_sequences(pseudo_seq, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post")
cpp_seq = pad_sequences(cpp_seq, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post")

# Save Tokenizers
with open("tokenizer_pseudo.pkl", "wb") as f:
    pickle.dump(tokenizer_pseudo, f)

with open("tokenizer_cpp.pkl", "wb") as f:
    pickle.dump(tokenizer_cpp, f)

# Transformer Model
class Transformer(tf.keras.Model):
    def __init__(self, vocab_size, seq_length, embed_dim=256, num_heads=8, ff_dim=512):
        super(Transformer, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_dim, mask_zero=True)
        self.pos_encoding = tf.keras.layers.Embedding(seq_length, embed_dim)
        self.encoder = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(ff_dim, activation='relu'),
            tf.keras.layers.Dense(embed_dim)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.output_layer = tf.keras.layers.Dense(vocab_size)  # No activation (from_logits=True in loss)

    def call(self, inputs):
        x = self.embedding(inputs) + self.pos_encoding(tf.range(inputs.shape[1]))
        attn_output = self.encoder(x, x)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        out2 = self.layernorm2(out1 + ffn_output)
        return self.output_layer(out2)

# Strategy for TPU training
with strategy.scope():
    transformer_model = Transformer(MAX_VOCAB_SIZE, MAX_SEQ_LENGTH)

    # Using Adam with a lower learning rate for stability
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

    # Using CategoricalCrossentropy with from_logits=True
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    transformer_model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

# Shift labels for training
pseudo_seq_shifted = np.roll(cpp_seq, shift=-1, axis=1)

# Training
transformer_model.fit(pseudo_seq, pseudo_seq_shifted, batch_size=64, epochs=20)

# ✅ Save Model (Corrected)
transformer_model.save("transformer_pseudo_cpp.keras")  # Corrected save format


Epoch 1/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 56ms/step - accuracy: nan - loss: nan
Epoch 2/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: nan - loss: nan
Epoch 3/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: nan - loss: nan
Epoch 4/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: nan - loss: nan
Epoch 5/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: nan - loss: nan
Epoch 6/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: nan - loss: nan
Epoch 7/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: nan - loss: nan
Epoch 8/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: nan - loss: nan
Epoch 9/20
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22

In [34]:
import pandas as pd

df = pd.read_csv("spoc_cleaned_final.csv")
print(df.isnull().sum())  # Check for missing values


pseudocode          0
cpp_code_cleaned    0
cpp_errors          0
dtype: int64


In [35]:
example_cpp = "int main() { return 0; }"
tokenized_cpp = tokenizer_cpp.texts_to_sequences([example_cpp])
print("Tokenized C++:", tokenized_cpp)


Tokenized C++: [[40, 18, 32, 10, 11, 1994]]


In [36]:
example_cpp = "int main() { return 0; }"
tokenized_cpp = tokenizer_cpp.texts_to_sequences([example_cpp])
print("Tokenized C++:", tokenized_cpp)


Tokenized C++: [[40, 18, 32, 10, 11, 1994]]


In [37]:
print("Pseudocode Vocabulary Size:", len(tokenizer_pseudo_to_cpp.word_index))
print("C++ Vocabulary Size:", len(tokenizer_cpp.word_index))


Pseudocode Vocabulary Size: 26325
C++ Vocabulary Size: 57604


In [38]:
!rm -rf transformer_pseudo_cpp.keras tokenizer_cpp.pkl tokenizer_pseudo_to_cpp.pkl


In [39]:
generate_cpp("INPUT X", tokenizer_pseudo_to_cpp, tokenizer_cpp, model)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step


'1][i] {\ntemp.clear();\nfor "?" b.size() "june";\nm[6] {\na[x] a[i]) nr vec(n str[i];\nfor endl;\nelse x2);\ncout "|o.o.o.o.#.#.#.#.#.#.#.|.|" arr[1]) carry x;\na j][y has arr[i]);\nif (a[2 ((k limit; 7;\nbreak;\nif "?" break; a[3] a[3] "?" "?" check(string (p1 abs(d w1 {\na[x] (a[ii] "?" 0;\ndouble abs(y);\nif str[i];\nfor n[j]) a[3] a[3] a[i]) int_max; a[i]) 100;\nt ((m (p) s.size();\nstring "?" a[3] sieve() n);\nif sumz carry w1 (all.size() nr 1e9) s;\nmap<string, a[0][1] s.size();\nstring }\nmaxx a[3] 0;\ndouble a[i]) sieve() int_max; max1; x;\nysum j][y x[3]) int_min;\nfor "?" (s) (a[0][2] limit; isdigit(s[i]) 1e9) a[3] a[3] -r 3;\narr[5] b[100];\ncin "?" a[0][1] "?" {\na[x] nr (s) "?" {\nbegin "?" n;\nint 200000) x.size(); x;\nysum (one (s) _s "o-|ooo-o" max(s[i], prime[num++] a[0][1] j][y s.size();\nstring w1 {\na[x] sieve() first, s;\nmap<string, (s1[i rr ch;\nfor b])]++;\nfor (a[ii] "?" die_roll[6] {\na[x] p;\ncin x;\na a[3] isdigit(s[i]) w1 carry ((a1 a[3] xx; (input[i] 1e9)

In [40]:
pseudo_code_sample = "for i = 1 to 10 print i"
cpp_code_sample = "for(int i = 1; i <= 10; i++) cout << i << endl;"

pseudo_tokens = tokenizer_pseudo_to_cpp.texts_to_sequences([pseudo_code_sample])
cpp_tokens = tokenizer_cpp.texts_to_sequences([cpp_code_sample])

print("Pseudocode Tokens:", pseudo_tokens)
print("C++ Tokens:", cpp_tokens)


Pseudocode Tokens: [[14, 10, 2, 15, 3, 89, 9, 10]]
C++ Tokens: [[4, 1, 28, 4, 25, 443, 13, 64, 2, 4, 2, 195]]


In [41]:
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.show()


NameError: name 'history' is not defined

In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, MultiHeadAttention, LayerNormalization, Dense
from tensorflow.keras.models import Model

class Transformer(Model):
    def __init__(self, vocab_size, seq_length, embed_dim=256, num_heads=8, ff_dim=512, **kwargs):
        super(Transformer, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.seq_length = seq_length
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim

        self.embedding = Embedding(vocab_size, embed_dim, mask_zero=True)
        self.pos_encoding = Embedding(seq_length, embed_dim)
        self.encoder = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(embed_dim)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.output_layer = Dense(vocab_size, activation="softmax")

    def call(self, inputs):
        x = self.embedding(inputs) + self.pos_encoding(tf.range(inputs.shape[1]))
        attn_output = self.encoder(x, x, x)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        out2 = self.layernorm2(out1 + ffn_output)
        return self.output_layer(out2)

    def get_config(self):
        config = super(Transformer, self).get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "seq_length": self.seq_length,
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


In [15]:
from tensorflow.keras.models import load_model

# Load with explicit custom_objects
model = load_model("transformer_pseudo_cpp.keras", custom_objects={"Transformer": Transformer})


In [17]:
import pickle
import tensorflow as tf

# Load the pseudocode-to-C++ tokenizer
with open("tokenizer_pseudo_to_cpp.pkl", "rb") as file:
    tokenizer_pseudo_to_cpp = pickle.load(file)

# Load the C++ tokenizer (for decoding output)
with open("tokenizer_cpp.pkl", "rb") as file:
    tokenizer_cpp = pickle.load(file)



In [18]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def generate_cpp(pseudocode, tokenizer_input, tokenizer_output, model, max_length=150):
    """ Generates C++ code from input pseudocode using the trained Transformer model. """

    # Tokenize and pad the input pseudocode
    input_seq = tokenizer_input.texts_to_sequences([pseudocode])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding="post")

    # Predict the output sequence
    prediction = model.predict(input_seq)

    # Convert tokenized output back to text
    predicted_tokens = np.argmax(prediction, axis=-1)[0]
    generated_code = tokenizer_output.sequences_to_texts([predicted_tokens])[0]

    return generated_code


In [32]:
pseudocode_example = "BEGIN \n INPUT X \n IF X > 10 THEN PRINT 'LARGE' ELSE PRINT 'SMALL' \n END"

generated_cpp = generate_cpp(pseudocode_example, tokenizer_pseudo_to_cpp, tokenizer_cpp, model)
print("Generated C++ Code:\n", generated_cpp)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step
Generated C++ Code:
 current q;
int arr[b] a[105];
int s.size(), m;
m--;
for 0)
;
cout &in, y;
b bfs(int cont2 l;
ans int(m b])]++;
for && calc(int rr has arr[i]);
if (a[2 ((k limit; 7;
break;
if "?" break; a[3] a[3] "?" "?" check(string (p1 m; w1 {
a[x] (a[ii] "?" 0;
double abs(y);
if str[i];
for n[j]) a[3] a[3] a[i]) int_max; a[i]) 100;
t ((m (p) s.size();
string "?" a[3] sieve() n);
if sumz carry w1 (all.size() nr 1e9) s;
map<string, a[0][1] s.size();
string }
maxx a[3] 0;
double a[i]) sieve() int_max; max1; x;
ysum j][y x[3]) int_min;
for "?" (s) (a[0][2] limit; isdigit(s[i]) 1e9) a[3] a[3] -r 3;
arr[5] b[100];
cin "?" a[0][1] "?" {
a[x] nr (s) "?" {
begin "?" n;
int 200000) x.size(); x;
ysum (one (s) _s "o-|ooo-o" max(s[i], prime[num++] a[0][1] j][y s.size();
string w1 {
a[x] sieve() first, s;
map<string, (s1[i rr ch;
for b])]++;
for (a[ii] "?" die_roll[6] {
a[x] p;
cin x;
a a[3] isdigit(s[i]) w1 carry ((a1 a

In [33]:
print("Pseudocode Tokenized Example:", tokenizer_pseudo_to_cpp.texts_to_sequences(["INPUT X"]))
print("C++ Tokenized Example:", tokenizer_cpp.texts_to_sequences(["int x;"]))


Pseudocode Tokenized Example: [[80, 26]]
C++ Tokenized Example: [[40, 269]]
