In [None]:
!pip -q install transformers peft accelerate bitsandbytes safetensors
!pip -q install datasets

# Hugging Face login
from huggingface_hub import login
login()


In [None]:
import torch, json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel

BASE = "deepseek-ai/deepseek-coder-6.7b-instruct"  # base causal LM checkpoint
ADAPTER = "jie-jw-wu/clarify-coder"                # PEFT adapter applied on top of BASE

# Prefer 4-bit quantized weights when bitsandbytes can be imported; otherwise
# fall back to plain fp16 weights.
try:
    import bitsandbytes as bnb  # noqa -- imported only to probe availability
    from transformers import BitsAndBytesConfig

    # Passing `load_in_4bit=True` directly to from_pretrained is deprecated;
    # the supported spelling is an explicit quantization config object.
    load_kwargs = dict(
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        torch_dtype=torch.float16,
        device_map="auto",
    )
except Exception as exc:
    # Kept broad on purpose (best-effort fallback, as before), but no longer
    # silent: say why quantization was skipped.
    print(f"4-bit loading unavailable ({exc!r}); falling back to fp16 weights")
    load_kwargs = dict(torch_dtype=torch.float16, device_map="auto")

tok = AutoTokenizer.from_pretrained(BASE, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(BASE, **load_kwargs)
# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(model, ADAPTER)

# Text-generation pipeline used by the question-generation helper below.
gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tok,
    device_map="auto",
    torch_dtype=torch.float16,
)


In [52]:
import re, csv

def extract_title(pr_text: str) -> str:
    """Return a short title for a PR prompt.

    Searches for a ``PR Title:`` marker and returns the text captured after it
    (up to the next line break), stripped of surrounding whitespace. When the
    marker is absent, falls back to the first line of ``pr_text`` truncated to
    120 characters. An empty ``pr_text`` yields ``""`` instead of raising.
    """
    m = re.search(r'PR Title:\s*(.+)', pr_text)
    if m:
        return m.group(1).strip()
    lines = pr_text.splitlines()
    # "".splitlines() is [], so guard before indexing the first line.
    return lines[0][:120] if lines else ""

def ask_questions(pr_text: str, k: int = 2, max_new_tokens: int = 200):
    """Ask the model for up to ``k`` clarifying questions about a PR description.

    Args:
        pr_text: The PR description that is embedded verbatim in the prompt.
        k: Maximum number of questions to return. Values <= 0 return ``[]``
            without calling the model.
        max_new_tokens: Generation budget forwarded to the pipeline.

    Returns:
        A list of at most ``k`` question strings with list markers
        (bullets / ``1.`` / ``2)``) removed. Only lines that end with ``?`` and
        are longer than 4 characters are kept.

    Relies on the module-level ``gen`` text-generation pipeline.
    """
    if k <= 0:
        # A negative k would otherwise slice from the wrong end of the list.
        return []

    prompt = f"""You are analyzing an ambiguous PR description.
Think step-by-step to find missing requirements.
Ask only clarifying questions a reviewer would need.
Return {k} numbered questions. Do not answer them.

PR description:
{pr_text}
"""
    # return_full_text=False makes the pipeline return only the continuation,
    # which avoids slicing by len(prompt) and the assumption that the decoded
    # output reproduces the prompt character-for-character.
    completion = gen(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.05,
        return_full_text=False,
    )[0]["generated_text"].strip()

    # Strip one leading list marker: optional bullets, then an optional number
    # followed by '.', ')' or ':' (but not a decimal such as "1.5"), then any
    # trailing bullets (e.g. markdown "**1.**"). Unlike a greedy character
    # class, this leaves digits that belong to the question itself
    # ("1. 2FA ..." -> "2FA ...").
    marker = r"^(?:[-*]\s*)*(?:\(?\d+[.):](?!\d)\s*)?(?:[-*]\s*)*"

    qs = []
    for ln in completion.splitlines():
        ln = re.sub(marker, "", ln.strip())
        if ln.endswith("?") and len(ln) > 4:
            qs.append(ln)
            if len(qs) == k:
                break
    return qs

In [54]:
# Sample PR prompts fed to the question generator. Every entry carries an
# integer "id" and a "prompt" laid out as "PR Title: ...\n\nDescription:\n...".
prs = [
    {
        "id": 0,
        "prompt": 'PR Title: Improve list scoring logic\n\nDescription:\nWe need to compute the "score" of a list for analytics. Make the scoring correct and efficient.',
    },
    {
        "id": 1,
        "prompt": 'PR Title: Better user status\n\nDescription:\nShow a better "active" status for users. It should reflect engagement more accurately.',
    },
    {
        "id": 2,
        "prompt": "PR Title: Optimize search\n\nDescription:\nMake search faster with caching or something similar. Keep correctness.",
    },
    {
        "id": 3,
        "prompt": "PR Title: Improve error reporting\n\nDescription:\nWe want clearer errors in the API responses. Make them more helpful.",
    },
    {
        "id": 4,
        "prompt": "PR Title: Add pagination to transactions API\n\nDescription:\nAdd pagination to /api/transactions.",
    },
    {
        "id": 5,
        "prompt": "PR Title: Rate limiter for login\n\nDescription:\nAdd a rate limiter to login to prevent abuse.",
    },
    {
        "id": 6,
        "prompt": "PR Title: Export report to CSV\n\nDescription:\nLet users export the sales report to CSV.",
    },
    {
        "id": 7,
        "prompt": "PR Title: Image upload validation\n\nDescription:\nAdd validation for uploaded images.",
    },
    {
        "id": 8,
        "prompt": 'PR Title: Update password requirements\n\nDescription:\nMake passwords simpler: only letters. Examples: - "P@ssw0rd!" (expected: valid)',
    },
    {
        "id": 9,
        "prompt": 'PR Title: Normalize timezones\n\nDescription:\nStore all times in local timezone. Comments elsewhere: - "All backend times must be UTC."',
    },
    {
        "id": 10,
        "prompt": 'PR Title: Remove deprecated endpoint\n\nDescription:\nRemove GET /v1/items. Changelog in PR body: - "Add GET /v1/items docs."',
    },
    {
        "id": 11,
        "prompt": "PR Title: Session timeout policy\n\nDescription:\nIntroduce automatic logout after inactivity. Default timeout is not specified. Apply this to all user roles.",
    },
]

In [None]:
import re, csv

def extract_title(pr_text: str) -> str:
    """Return a short title for a PR prompt.

    Searches for a ``PR Title:`` marker and returns the text captured after it
    (up to the next line break), stripped of surrounding whitespace. When the
    marker is absent, falls back to the first line of ``pr_text`` truncated to
    120 characters. An empty ``pr_text`` yields ``""`` instead of raising.

    NOTE(review): this repeats the ``extract_title`` defined in the earlier
    helper cell and shadows it; keep the two in sync or drop one of them.
    """
    m = re.search(r'PR Title:\s*(.+)', pr_text)
    if m:
        return m.group(1).strip()
    lines = pr_text.splitlines()
    # "".splitlines() is [], so guard before indexing the first line.
    return lines[0][:120] if lines else ""

# Generate clarifying questions for every entry in `prs` and store them in a
# tab-separated file with one row per PR.
out_path = "questions.tsv"

with open(out_path, "w", newline="", encoding="utf-8") as tsv_file:
    writer = csv.writer(tsv_file, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["id", "pr_title", "clarify_questions"])  # header row

    for pr in prs:
        pid = str(pr["id"])
        # Accept either key; an entry with neither yields an empty prompt.
        pr_text = pr.get("pr_text") or pr.get("prompt") or ""
        title = extract_title(pr_text)
        print(f"working on id={pid} | title={title}")

        qs = ask_questions(pr_text, k=2)
        print("qs", qs)

        # All questions share one cell, one "Q<n>: ..." entry per line.
        cell_text = "\n".join(f"Q{n}: {question}" for n, question in enumerate(qs, start=1))
        writer.writerow([pid, title, cell_text])

print("Wrote:", out_path)

In [None]:
import pandas as pd
# Load the generated TSV back in to eyeball the first rows.
df = pd.read_csv("questions.tsv", delimiter="\t")
df.head(5)
