In [None]:
import os
import json
import random
from PIL import Image, ImageDraw

def create_breakout_folder_structure():
    folders = [
        'breakout_dataset/sprites/paddle',
        'breakout_dataset/sprites/ball',
        'breakout_dataset/sprites/bricks',
        'breakout_dataset/sprites/background',
        'breakout_dataset/mechanics',
        'breakout_dataset/levels'
    ]

    for folder in folders:
        os.makedirs(folder, exist_ok=True)
    print("Folder structure for Breakout created!")

def generate_breakout_sprites():
    # Paddle sprite
    paddle = Image.new('RGBA', (60, 12), (0, 0, 0, 0))
    draw = ImageDraw.Draw(paddle)
    draw.rectangle((0, 0, 60, 12), fill=(0, 0, 255))  # Blue paddle
    paddle.save('breakout_dataset/sprites/paddle/paddle.png')

    # Ball sprite
    ball = Image.new('RGBA', (12, 12), (0, 0, 0, 0))
    draw = ImageDraw.Draw(ball)
    draw.ellipse((0, 0, 12, 12), fill=(255, 0, 0))  # Red ball
    ball.save('breakout_dataset/sprites/ball/ball.png')

    # Bricks (3 colors for levels)
    brick_colors = {'easy': (0, 255, 0), 'medium': (255, 165, 0), 'hard': (255, 0, 0)}
    for level, color in brick_colors.items():
        brick = Image.new('RGBA', (40, 20), (0, 0, 0, 0))
        draw = ImageDraw.Draw(brick)
        draw.rectangle((0, 0, 40, 20), fill=color)
        brick.save(f'breakout_dataset/sprites/bricks/brick_{level}.png')

    # Background
    bg = Image.new('RGB', (320, 480), (30, 30, 30))  # Dark gray
    draw = ImageDraw.Draw(bg)
    draw.text((10, 10), "Breakout", fill=(255, 255, 255))
    bg.save('breakout_dataset/sprites/background/bg.png')

def create_breakout_mechanics():
    mechanics = {
        "physics": {
            "paddle_speed": 5,
            "ball_speed": 4,
            "bounce_loss": 0
        },
        "collision": {
            "ball_radius": 6,
            "paddle_width": 60,
            "brick_width": 40,
            "brick_height": 20
        },
        "screen": {
            "width": 320,
            "height": 480
        }
    }
    with open('breakout_dataset/mechanics/game_rules.json', 'w') as f:
        json.dump(mechanics, f, indent=2)

def generate_breakout_levels():
    levels = {"easy": [], "medium": [], "hard": []}

    for difficulty in levels:
        rows = {"easy": 3, "medium": 5, "hard": 7}[difficulty]
        for row in range(rows):
            for col in range(6):  # 6 columns of bricks
                brick = {
                    "x": col * 50 + 10,
                    "y": row * 25 + 40,
                    "type": difficulty
                }
                levels[difficulty].append(brick)

    with open('breakout_dataset/levels/brick_layouts.json', 'w') as f:
        json.dump(levels, f, indent=2)


def create_breakout_config():
    config = {
        "game_name": "breakout",
        "resolution": {"width": 320, "height": 480},
        "assets": {
            "paddle": "sprites/paddle/paddle.png",
            "ball": "sprites/ball/ball.png",
            "bricks": {
                "easy": "sprites/bricks/brick_easy.png",
                "medium": "sprites/bricks/brick_medium.png",
                "hard": "sprites/bricks/brick_hard.png"
            },
            "background": "sprites/background/bg.png"
        },
        "default_difficulty": "medium",
        "version": "1.0"
    }

    with open('breakout_dataset/config.json', 'w') as f:
        json.dump(config, f, indent=2)

if __name__ == "__main__":
    print("Creating Breakout synthetic dataset...")
    create_breakout_folder_structure()
    generate_breakout_sprites()
    create_breakout_mechanics()
    generate_breakout_levels()
    create_breakout_config()
    print("Breakout dataset created successfully!")
    os.system('tree breakout_dataset')


Creating Breakout synthetic dataset...
Folder structure for Breakout created!
Breakout dataset created successfully!


In [None]:
import shutil


folder_name = "breakout_dataset"
shutil.make_archive(folder_name, 'zip', folder_name)


'/content/breakout_dataset.zip'

In [None]:
from google.colab import files


In [None]:
import json
import os


configs = []
for root, _, files in os.walk("breakout_dataset"):
    for file in files:
        if file.endswith(".json"):
            with open(os.path.join(root, file), "r") as f:
                configs.append(json.load(f))

with open("breakout_train.txt", "w") as f:
    for config in configs:
        f.write(json.dumps(config) + "\n")

In [None]:
#!pip install -q transformers datasets torch accelerate sentencepiece bitsandbytes jsbeautifier

In [None]:
import json

training_data =[
    {
        "python": "screen = pygame.display.set_mode((800, 600))",
        "javascript": "const canvas = document.getElementById('gameCanvas');\nconst ctx = canvas.getContext('2d');\ncanvas.width = 800;\ncanvas.height = 600;"
    },
    {
        "python": "pygame.draw.rect(screen, RED, (x, y, 20, 20))",
        "javascript": "ctx.fillStyle = 'red';\nctx.fillRect(x, y, 20, 20);"
    },
    {
        "python": "for event in pygame.event.get():\n    if event.type == pygame.QUIT:",
        "javascript": "canvas.addEventListener('keydown', (e) => {\n    if (e.key === 'Escape') {"
    }
]

with open("pygame_to_js.json", "w") as f:
    json.dump(training_data, f)

In [None]:
!pip install -U transformers bitsandbytes



In [None]:
hf_token = "Enter your hugging face token"


In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
from transformers import AutoTokenizer

df = pd.read_json("pygame_to_js.json")

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-1b")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Concise format for 1B model
    prompts = [
        f"PYGAME: {examples['python'][i]}\nJS: {examples['javascript'][i]}"
        for i in range(len(examples['python']))
    ]
    return tokenizer(prompts, truncation=True, max_length=512, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
#pip install -U bitsandbytes

In [None]:
from transformers import BitsAndBytesConfig, TrainingArguments, AutoModelForCausalLM
import torch # Import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "bigcode/starcoderbase-1b", # Use the smaller model
    quantization_config=bnb_config,
    device_map="auto",
    offload_buffers=True # Offload buffers to mitigate memory issues
)

model.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:

import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType


df = pd.read_json("pygame_to_js.json")
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)


tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-1b")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):

    prompts = [
        f"Convert the following Pygame code to JavaScript:\nPYGAME: {examples['python'][i]}\nJAVASCRIPT: {examples['javascript'][i]}{tokenizer.eos_token}"
        for i in range(len(examples['python']))
    ]

    tokenized = tokenizer(
        prompts,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )


    tokenized["labels"] = tokenized["input_ids"].clone()

    )
    for i in range(len(tokenized["input_ids"])):

        javascript_tokens = tokenizer.encode("JAVASCRIPT:")
        try:
            js_start = tokenized["input_ids"][i].tolist().index(javascript_tokens[-1]) + 1
        except ValueError:

            js_start = 0

        tokenized["labels"][i][:js_start] = -100

    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True)


print("Tokenized dataset structure:", tokenized_dataset)
print("Sample labels:", tokenized_dataset["train"]["labels"][0])

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "bigcode/starcoderbase-1b",
    quantization_config=bnb_config,
    device_map="auto",

)

model.config.use_cache = False

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["c_proj", "c_attn", "q_attn"]  # Specific to StarCoder architecture
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        outputs = model(**inputs)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="pygame_converter",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    learning_rate=3e-4,
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none",
    lr_scheduler_type="cosine",
    warmup_steps=100,
    weight_decay=0.05
)


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("pygame_converter_1b")

# Save tokenizer as well
tokenizer.save_pretrained("pygame_converter_1b")

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenized dataset structure: DatasetDict({
    train: Dataset({
        features: ['python', 'javascript', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
    test: Dataset({
        features: ['python', 'javascript', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})
Sample labels: [-100, -100, -100, -100, -100, -100, -100, -100, -100, 203, 5177, 25425, 44, 4809, 280, 23978, 32, 2751, 32, 470, 81, 2816, 1162, 42, 34, 34, 30, 225, 40, 34, 34, 490, 203, 60, 4570, 1099, 38225, 44, 660, 9086, 280, 1825, 32, 5616, 463, 3626, 8511, 895, 203, 913, 3692, 280, 9086, 32, 14732, 463, 36, 86, 895, 203, 8031, 32, 1644, 280, 225, 42, 34, 34, 45, 203, 8031, 32, 2273, 280, 225, 40, 34, 34, 45, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 3,588,096 || all params: 1,140,795,392 || trainable%: 0.3145


Step,Training Loss


('pygame_converter_1b/tokenizer_config.json',
 'pygame_converter_1b/special_tokens_map.json',
 'pygame_converter_1b/vocab.json',
 'pygame_converter_1b/merges.txt',
 'pygame_converter_1b/added_tokens.json',
 'pygame_converter_1b/tokenizer.json')

In [None]:
import shutil


shutil.make_archive('pygame_converter_1b', 'zip', 'pygame_converter_1b')

from google.colab import files
files.download('pygame_converter_1b.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
"""from google.colab import files
files.download("pygame_converter_1b")"""

'from google.colab import files\nfiles.download("pygame_converter_1b")'