In [3]:
import os, re, sys, torch
from typing import Optional, Tuple
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---------------- USER KNOBS ----------------
# Every knob is read from the environment variable of the same name; the second
# argument to os.environ.get() is the default used when the variable is unset.
# Model identifier passed to from_pretrained() for both the tokenizer and the model.
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-0.5B-Instruct")
# Passed to model.generate() as max_new_tokens. Must parse as an int.
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "512"))
# Passed to model.generate() as temperature. Must parse as a float.
TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.2"))
# Passed to model.generate() as top_p. Must parse as a float.
TOP_P = float(os.environ.get("TOP_P", "0.95"))
# -------------------------------------------

def has_accelerate() -> bool:
    """Report whether the optional ``accelerate`` package can be imported.

    Any exception raised while importing (not only ``ImportError``) is
    treated as "not available", so a broken installation counts as absent.
    """
    try:
        import accelerate  # noqa: F401
    except Exception:
        available = False
    else:
        available = True
    return available

def try_bitsandbytes_cfg():
    """Return a 4-bit NF4 ``BitsAndBytesConfig`` when quantized loading is usable.

    Returns:
        A ``BitsAndBytesConfig`` (4-bit, NF4, double quantization, bfloat16
        compute dtype) when CUDA is available, the ``bitsandbytes`` package is
        installed and the config can be constructed; otherwise ``None``.

    This is a best-effort probe: any failure while probing or building the
    config yields ``None`` so the caller can fall back to an unquantized load.
    """
    if not torch.cuda.is_available():
        return None

    # Local import: only this probe needs it.
    import importlib.util

    try:
        # BitsAndBytesConfig lives in transformers, so importing it proves
        # nothing about the bitsandbytes backend itself. Check explicitly,
        # otherwise the failure surfaces later, inside from_pretrained().
        if importlib.util.find_spec("bitsandbytes") is None:
            return None

        from transformers import BitsAndBytesConfig
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    except Exception:
        # Deliberate best-effort: quantization is optional.
        return None

def build_prompt(tokenizer, question: str) -> str:
    """Wrap *question* in the coding-assistant instruction and render a prompt.

    The instruction is sent as a single ``user`` message through
    ``tokenizer.apply_chat_template`` (untokenized, with the generation prompt
    appended). If that call fails for any reason, a plain
    "### Instruction / ### Response" layout is returned instead.
    """
    instruction = (
        "You are a helpful coding assistant. "
        "Write correct, secure, and concise code for the following task.\n\n"
        f"Task:\n{question}\n\n"
        "Return ONLY code inside a triple-backtick block."
    )
    messages = [{"role": "user", "content": instruction}]

    try:
        rendered = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception:
        # Tokenizer has no (working) chat template: fall back to a generic layout.
        rendered = f"### Instruction:\n{instruction}\n\n### Response:\n"
    return rendered

# Fenced code block: optional language tag, a newline, then the body up to the
# closing fence. End-of-text is also accepted as a terminator so that a block
# cut off mid-generation (fence never closed) is still recognised; group 3 is
# the empty string in that case.
CODE_BLOCK_RE = re.compile(r"```([a-zA-Z0-9_+-]*)\s*\n(.*?)(```|\Z)", re.DOTALL)

def extract_code(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Extract the first fenced code block from *text*.

    Args:
        text: Model output that may contain a triple-backtick block.

    Returns:
        ``(lang, code)`` where ``lang`` is the fence's language tag (``None``
        when the fence has no tag) and ``code`` is the block body with
        surrounding newlines and spaces removed. ``(None, None)`` when no
        block is found. An unterminated block is returned up to the end of
        *text*, unless its body is empty, which counts as "not found".
    """
    m = CODE_BLOCK_RE.search(text)
    if not m:
        return None, None
    lang = (m.group(1) or "").strip() or None
    code = m.group(2).strip("\n\r ")
    fence_closed = bool(m.group(3))
    if not fence_closed and not code:
        # A lone trailing fence with nothing after it carries no code.
        return None, None
    return lang, code

def suggest_ext(lang: Optional[str]) -> str:
    """Map a code-fence language tag to a file extension (without the dot).

    The lookup is case-insensitive. Unknown, empty or ``None`` tags map to
    ``"txt"``.
    """
    # Extension -> every fence tag that should produce it.
    aliases_by_ext = {
        "py": ("py", "python"),
        "js": ("js", "javascript"),
        "ts": ("ts", "typescript"),
        "java": ("java",),
        "cpp": ("cpp", "c++"),
        "c": ("c",),
        "cs": ("cs", "csharp"),
        "go": ("go",),
        "rs": ("rs", "rust"),
        "php": ("php",),
        "rb": ("rb", "ruby"),
        "sh": ("sh", "bash"),
        "ps1": ("ps1", "powershell"),
        "kt": ("kt", "kotlin"),
        "swift": ("swift",),
        "sql": ("sql",),
        "html": ("html",),
        "css": ("css",),
    }
    tag = lang.lower() if lang else ""
    for ext, aliases in aliases_by_ext.items():
        if tag in aliases:
            return ext
    return "txt"

def load_model_and_tokenizer():
    """Load the tokenizer and causal LM named by ``BASE_MODEL``.

    Behaviour:
      * The tokenizer's pad token falls back to its EOS token when unset.
      * Weights load as bfloat16 when CUDA is available, float32 otherwise.
      * With ``accelerate`` installed, ``device_map="auto"`` is used and 4-bit
        quantization is enabled when ``try_bitsandbytes_cfg()`` provides a
        config.
      * Without ``accelerate``, the model is loaded unquantized and moved to
        a single device (CUDA if available, else CPU).

    Returns:
        ``(tokenizer, model)`` with the model in eval mode.
    """
    accelerate_available = has_accelerate()

    # bitsandbytes loading in transformers depends on accelerate, and a 4-bit
    # model cannot be relocated with .to() afterwards. So quantize only when
    # accelerate (and therefore device_map) is in play.
    qcfg = try_bitsandbytes_cfg() if accelerate_available else None

    # NOTE(review): trust_remote_code=True lets the model repo run its own
    # Python at load time. BASE_MODEL comes from the environment, so only point
    # it at repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    kwargs = {
        "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        "trust_remote_code": True,
    }
    if qcfg is not None:
        kwargs["quantization_config"] = qcfg

    # Only use device_map="auto" if accelerate is available
    if accelerate_available:
        kwargs["device_map"] = "auto"

    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **kwargs)

    # If we didn't use device_map, move to a single device manually
    if "device_map" not in kwargs:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

    model.eval()
    return tokenizer, model

def generate_code(question: str, save_to_file: bool = True) -> str:
    """Generate code for *question*, print it, and optionally save it.

    The model and tokenizer are loaded on every call.

    Args:
        question: Natural-language description of the coding task.
        save_to_file: When true, write the result to ``generated_code.<ext>``
            in the current directory (overwriting any existing file). The
            extension is derived from the code fence's language tag.

    Returns:
        The extracted code, or the whole stripped completion when the model
        produced no fenced block.
    """
    tokenizer, model = load_model_and_tokenizer()
    prompt = build_prompt(tokenizer, question)
    inputs = tokenizer([prompt], return_tensors="pt").to(next(model.parameters()).device)
    prompt_len = inputs["input_ids"].shape[1]

    gen_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if TEMPERATURE > 0:
        gen_kwargs.update(do_sample=True, temperature=TEMPERATURE, top_p=TOP_P)
    else:
        # Sampling needs a strictly positive temperature; treat <= 0 as a
        # request for deterministic (greedy) decoding.
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt + completion. Drop the prompt by token count.
    # A string-prefix comparison is unreliable because decoding with
    # skip_special_tokens=True removes the chat-template markers, so the
    # decoded text no longer starts with the prompt.
    completion = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

    lang, code = extract_code(completion)
    if code is None:
        code, lang = completion.strip(), None

    print("\n===== GENERATED CODE =====\n")
    print(code)
    print("\n==========================\n")

    if save_to_file:
        ext = suggest_ext(lang)
        fname = f"generated_code.{ext}"
        with open(fname, "w", encoding="utf-8") as f:
            f.write(code)
        print(f"[Saved] {fname}")

    return code

def main():
    """Command-line entry point.

    The task is taken from the command-line arguments (joined with spaces)
    when any are given, otherwise it is read interactively. An empty task
    prints a message and exits with status 1; otherwise the code is generated
    and saved to a file.
    """
    cli_words = sys.argv[1:]
    if cli_words:
        question = " ".join(cli_words).strip()
    else:
        question = input("Enter your coding question/task:\n> ").strip()

    if question:
        generate_code(question, save_to_file=True)
        return

    print("No question provided.")
    sys.exit(1)

if __name__ == "__main__":
    main()


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  97%|#########6| 954M/988M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]


===== GENERATED CODE =====

{
  "name": "jupyter",
  "display_name": "Jupyter Notebook",
  "version": "4.8.1",
  "language": "python",
  "envs": [
    {
      "name": "base",
      "prefix": "/usr/local/bin/jupyter"
    }
  ],
  "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "py_version": "3.8",
    "path": "/usr/local/bin/jupyter"
  },
  "language_info": {
    "name": "python",
    "version": "3.8",
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "nbformat": 4,
    "nbformat_minor": 1
  }
}


[Saved] generated_code.txt
