In [17]:
!pip install evaluate codebleu datasets transformers peft accelerate torch scikit-learn gradio



In [18]:
import pandas as pd
import re
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import evaluate
from sklearn.model_selection import train_test_split
from codebleu import calc_codebleu

In [19]:
tsv_path = '/content/spoc-train.tsv'
df = pd.read_csv(tsv_path, sep='\t', dtype=str).fillna('')

def reconstruct(g):
    pseudo = '\n'.join(g['text'].astype(str).tolist()).strip()
    code = '\n'.join(g['code'].astype(str).tolist()).strip()
    return pd.Series({'pseudo': pseudo, 'code': code})

pairs = df.groupby(['probid','subid'], group_keys=False).apply(reconstruct).reset_index()
print(f"Total pairs: {len(pairs)}")
print(pairs.head(2))

Total pairs: 14548
  probid     subid                                             pseudo  \
0  1000A  41887560  create a map from strings to integers mp\n\ncr...   
1  1000A  41980279  INF = const int with INF = 0x3f3f3f3f\n\ni, j,...   

                                                code  
0  map<string, int> mp;\nint main() {\nint n, sum...  
1  const int INF = 0x3f3f3f3f;\nint main() {\nint...  


  pairs = df.groupby(['probid','subid'], group_keys=False).apply(reconstruct).reset_index()


In [20]:
def cpp_to_python_simple(cpp_code):
    try:
        cpp_code = re.sub(r'#include\s*<[^>]+>', '', cpp_code)
        cpp_code = re.sub(r'using\s+namespace\s+std\s*;', '', cpp_code)
        cpp_code = re.sub(r'int\s+main\s*\(\s*\)\s*\{', 'def main():', cpp_code)
        cpp_code = re.sub(r'return\s+0\s*;', 'return 0', cpp_code)
        cpp_code = re.sub(r'}\s*$', '', cpp_code)
        cpp_code = re.sub(r'cin\s*>>\s*([^;]+);', r'\1 = input()', cpp_code)
        cpp_code = re.sub(r'cout\s*<<\s*([^;]+);', r'print(\1)', cpp_code)
        cpp_code = re.sub(r'endl', r'', cpp_code)
        cpp_code = re.sub(r'//', r'#', cpp_code)
        cpp_code = re.sub(r';$', '', cpp_code, flags=re.MULTILINE)
        return cpp_code.strip()
    except:
        return "print('Hello World')"

print("APPLYING C++ TO PYTHON CONVERSION")
pairs['python_code'] = pairs['code'].apply(cpp_to_python_simple)

for i in range(2):
    print(f"Sample {i}:")
    print(f"Pseudocode:\n{pairs.iloc[i]['pseudo'][:200]}...")
    print(f"Python:\n{pairs.iloc[i]['python_code'][:200]}...")

train_df, val_df = train_test_split(pairs, test_size=0.1, random_state=42)
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

APPLYING C++ TO PYTHON CONVERSION
Sample 0:
Pseudocode:
create a map from strings to integers mp

create new integers n and sum with sum = 0
create new string variable s
read from the input to n
for i from 1 to n inclusive, read standard input to s and inc...
Python:
map<string, int> mp
def main():
int n, sum = 0
string s
n = input()
for (int i = 1; i <= n; i++) s, mp[s]++ = input()
for (int i = 1; i <= n; i++) {
s = input()
if (mp[s])
mp[s]--
else
sum++
}
print(s...
Sample 1:
Pseudocode:
INF = const int with INF = 0x3f3f3f3f

i, j, k = int
n, m = int
s, ss = string array of size 105 each
read n
read n values into s
read n values into ss
for i = 0 to n
for j = 0 to n
if s[i] is ss[j] a...
Python:
const int INF = 0x3f3f3f3f
def main():
int i, j, k
int n, m
string s[105], ss[105]
n = input()
for (i = 0; i < n; i++) s[i] = input()
for (i = 0; i < n; i++) ss[i] = input()
for (i = 0; i < n; i++) {
...
Training samples: 13093
Validation samples: 1455


In [21]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

special_tokens = ['<|pseudo|>', '<|python|>', '<|end|>']
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

print(f"Special tokens: {special_tokens}")

Special tokens: ['<|pseudo|>', '<|python|>', '<|end|>']


In [22]:
def format_sample_fixed(example):
    text = f"<|pseudo|>{example['pseudo']}<|python|>{example['python_code']}<|end|>"
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=256,
        padding='max_length',
        return_tensors=None
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

train_dataset = Dataset.from_pandas(train_df[['pseudo','python_code']]).map(
    format_sample_fixed,
    remove_columns=['pseudo','python_code']
)
val_dataset = Dataset.from_pandas(val_df[['pseudo','python_code']]).map(
    format_sample_fixed,
    remove_columns=['pseudo','python_code']
)

print(f"Train dataset: {len(train_dataset)}")
print(f"Val dataset: {len(val_dataset)}")

Map:   0%|          | 0/13093 [00:00<?, ? examples/s]

Map:   0%|          | 0/1455 [00:00<?, ? examples/s]

Train dataset: 13093
Val dataset: 1455


In [23]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn", "c_proj", "c_fc"],
)

model = get_peft_model(model, lora_config)
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

Trainable parameters: 1,179,648




In [27]:
def simple_collator(features):
    batch = {
        'input_ids': torch.stack([torch.tensor(f['input_ids']) for f in features]),
        'attention_mask': torch.stack([torch.tensor(f['attention_mask']) for f in features]),
        'labels': torch.stack([torch.tensor(f['labels']) for f in features])
    }
    return batch

training_args = TrainingArguments(
    output_dir="./pseudo-to-python-model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-4,
    logging_steps=20,
    save_steps=800,
    eval_steps=800,
    eval_strategy="steps",
    save_strategy="steps",
    report_to=[],
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Device: {device}")

Device: cuda


In [28]:
class LoggingTrainer(Trainer):
    def on_log(self, args, state, control, logs=None, **kwargs):
        super().on_log(args, state, control, logs, **kwargs)
        if logs and 'loss' in logs:
            print(f"Step {state.global_step}: Loss = {logs['loss']:.4f}")
        if logs and 'eval_loss' in logs:
            print(f"Step {state.global_step}: Eval Loss = {logs['eval_loss']:.4f}")

trainer = LoggingTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=simple_collator,
    tokenizer=tokenizer,
)

print("STARTING TRAINING")
train_result = trainer.train()

print("TRAINING COMPLETED")
trainer.save_model("./pseudo-to-python-final")
tokenizer.save_pretrained("./pseudo-to-python-final")

final_metrics = train_result.metrics
print(f"Final training loss: {final_metrics.get('train_loss', 'N/A')}")

eval_metrics = trainer.evaluate()
print("FINAL EVALUATION:")
for key, value in eval_metrics.items():
    print(f"{key}: {value}")

  trainer = LoggingTrainer(


STARTING TRAINING


Step,Training Loss,Validation Loss
800,1.0455,0.944026
1600,1.0462,0.871813
2400,0.9205,0.831012
3200,0.8036,0.808654
4000,0.8685,0.786103
4800,0.8838,0.772865
5600,0.8871,0.761028
6400,0.7634,0.750383
7200,0.8177,0.740293
8000,0.7225,0.734315




TRAINING COMPLETED




Final training loss: 0.8712590884055462


FINAL EVALUATION:
eval_loss: 0.724769115447998
eval_runtime: 19.0023
eval_samples_per_second: 76.57
eval_steps_per_second: 19.156
epoch: 3.0


In [29]:
@torch.no_grad()
def generate_python_from_pseudo(pseudocode):
    input_text = f"<|pseudo|>{pseudocode}<|python|>"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        num_beams=3,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    if "<|python|>" in generated_text:
        parts = generated_text.split("<|python|>", 1)
        if len(parts) > 1:
            python_code = parts[1]
            if "<|end|>" in python_code:
                python_code = python_code.split("<|end|>")[0]
            return python_code.strip()

    return generated_text.replace(input_text, "").strip()

print("TESTING GENERATION:")
test_examples = [
    "print numbers from 1 to 10",
    "calculate sum of two numbers",
    "find maximum number in list"
]

for i, example in enumerate(test_examples):
    generated = generate_python_from_pseudo(example)
    print(f"Example {i+1}:")
    print(f"Input: {example}")
    print(f"Output: {generated}")
    print()

TESTING GENERATION:
Example 1:
Input: print numbers from 1 to 10
Output: def main():
print(number(1) << )
return 0

<|endoftext|>

Example 2:
Input: calculate sum of two numbers
Output: def main():
int sum(int a, int b) {
return a > b ? a : b
}
long long int calc(double b, double c)
print(calc * 2 << "\n")

<|endoftext|>

Example 3:
Input: find maximum number in list
Output: def main():
list<int> find(maxn)
print(find(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) << )
return 0
<|endoftext|>



In [30]:
import gradio as gr
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from peft import PeftModel, PeftConfig

class PseudocodeToPython:
    def __init__(self, model_path="./pseudo-to-python-final"):
        try:
            config = PeftConfig.from_pretrained(model_path)
            self.tokenizer = GPT2Tokenizer.from_pretrained(config.base_model_name_or_path)
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.add_special_tokens({'additional_special_tokens': ['<|pseudo|>', '<|python|>', '<|end|>']})

            base_model = GPT2LMHeadModel.from_pretrained(config.base_model_name_or_path)
            base_model.resize_token_embeddings(len(self.tokenizer))

            self.model = PeftModel.from_pretrained(base_model, model_path)
            self.model.eval()

            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.model = self.model.to(self.device)

            print("Model loaded successfully")

        except Exception as e:
            print(f"Error: {e}")
            self.setup_fallback()

    def setup_fallback(self):
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')
        self.model.eval()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

    def generate(self, pseudocode):
        input_text = f"<|pseudo|>{pseudocode}<|python|>"
        inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=256,
                num_beams=3,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                early_stopping=True
            )

        full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=False)

        if "<|python|>" in full_output:
            python_code = full_output.split("<|python|>")[1].strip()
            if "<|end|>" in python_code:
                python_code = python_code.split("<|end|>")[0].strip()
            return python_code

        return full_output.replace(input_text, "").strip()

def create_interface():
    generator = PseudocodeToPython()

    def generate_code(pseudocode):
        if not pseudocode.strip():
            return "Please enter pseudocode"
        try:
            return generator.generate(pseudocode)
        except Exception as e:
            return f"Error: {str(e)}"

    with gr.Blocks(title="Pseudocode to Python") as interface:
        gr.Markdown("# Pseudocode to Python Generator")

        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Pseudocode Input",
                    placeholder="Enter your pseudocode here...",
                    lines=4
                )
                generate_btn = gr.Button("Generate Python Code", variant="primary")

            with gr.Column():
                output_text = gr.Textbox(
                    label="Generated Python Code",
                    placeholder="Python code will appear here...",
                    lines=4
                )

        examples = [
            "print numbers from 1 to 10",
            "calculate sum of two numbers",
            "find maximum number in list"
        ]

        gr.Examples(examples=examples, inputs=input_text)

        generate_btn.click(fn=generate_code, inputs=input_text, outputs=output_text)

    return interface

interface = create_interface()
interface.launch(share=True)

Model loaded successfully
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bfad437f690e99bba8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import zipfile
import os
from pathlib import Path

# === CONFIGURATION ===
folder_path = "/content/npl task 2"
zip_name = "gpt2_spoc_lora_FULL2.zip"

print("=" * 50)
print("GPT-2 LoRA Model Zipper")
print("=" * 50)

# === CHECK IF FOLDER EXISTS ===
if not os.path.exists(folder_path):
    print(f"\n‚ùå ERROR: Folder nahi mila!")
    print(f"Path: {folder_path}")
    print("\nüîç Current directory files:")
    print(os.listdir('.'))
else:
    # === LIST ALL FILES ===
    files = os.listdir(folder_path)
    total_size = 0

    print(f"\n‚úÖ Folder mil gaya! {len(files)} files hai:\n")

    for f in files:
        file_path = os.path.join(folder_path, f)
        size_mb = os.path.getsize(file_path) / (1024 * 1024)  # Convert to MB
        total_size += size_mb
        print(f"  üìÑ {f:<40} ({size_mb:.2f} MB)")

    print(f"\nüìä Total Size: {total_size:.2f} MB")

    # === CREATE ZIP FILE ===
    print(f"\n‚è≥ Zip bana raha hoon...")

    try:
        with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for filename in files:
                file_path = os.path.join(folder_path, filename)
                zipf.write(file_path, arcname=filename)
                print(f"  ‚úì Added: {filename}")

        # === VERIFY ZIP ===
        zip_size = os.path.getsize(zip_name) / (1024 * 1024)
        print(f"\n‚úÖ ZIP READY!")
        print(f"üì¶ File: {zip_name}")
        print(f"üíæ Size: {zip_size:.1f} MB")

        # === DOWNLOAD INSTRUCTIONS ===
        print("\n" + "=" * 50)
        print("üéâ DOWNLOAD KARNE KE LIYE:")
        print("=" * 50)

        # For Google Colab
        try:
            from google.colab import files
            print("\nüîµ Google Colab detected!")
            print("Niche button se download hoga...")
            files.download(zip_name)
        except ImportError:
            # For Jupyter/Local
            print("\nüìÅ File ready hai:")
            print(f"   Location: {os.path.abspath(zip_name)}")
            print("\nüí° Download karne ke liye:")
            print("   1. File browser mein dekho (left sidebar)")
            print("   2. Right-click ‚Üí Download")
            print("\n   Ya ye command chalaao:")
            print(f'   from IPython.display import FileLink')
            print(f'   FileLink("{zip_name}")')

    except Exception as e:
        print(f"\n‚ùå Error: {str(e)}")

print("\n" + "=" * 50)

GPT-2 LoRA Model Zipper

‚úÖ Folder mil gaya! 9 files hai:

  üìÑ special_tokens_map.json                  (0.00 MB)
  üìÑ vocab.json                               (0.95 MB)
  üìÑ added_tokens.json                        (0.00 MB)
  üìÑ adapter_config.json                      (0.00 MB)
  üìÑ README.md                                (0.00 MB)
  üìÑ adapter_model.safetensors                (297.59 MB)
  üìÑ training_args.bin                        (0.01 MB)
  üìÑ merges.txt                               (0.44 MB)
  üìÑ tokenizer_config.json                    (0.00 MB)

üìä Total Size: 298.99 MB

‚è≥ Zip bana raha hoon...
  ‚úì Added: special_tokens_map.json
  ‚úì Added: vocab.json
  ‚úì Added: added_tokens.json
  ‚úì Added: adapter_config.json
  ‚úì Added: README.md
  ‚úì Added: adapter_model.safetensors
  ‚úì Added: training_args.bin
  ‚úì Added: merges.txt
  ‚úì Added: tokenizer_config.json

‚úÖ ZIP READY!
üì¶ File: gpt2_spoc_lora_FULL2.zip
üíæ Size: 277.0 MB

üéâ DOWNLO

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


