In [1]:
import json
from tqdm import tqdm
from collections import Counter

from unsloth import FastLanguageModel
import torch

# === Model configuration ===
# These three values are forwarded unchanged to FastLanguageModel.from_pretrained below.
max_seq_length = 5500
load_in_4bit = True
dtype = None  # NOTE(review): presumably lets the library choose the dtype — confirm

# Load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Llama-3.1-8B-unsloth-bnb-4bit",
    max_seq_length = max_seq_length,
    load_in_4bit   = load_in_4bit,
    dtype          = dtype,
)

# Attach LoRA modules (rank 16) to the attention and MLP projection layers.
# NOTE(review): these settings presumably have to match the ones used when the
# adapter loaded below was trained — confirm against the training script.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Load the fine-tuned adapter from disk under the adapter name "default".
adapter_path = "/shared/s1/lab11/BigqueryML/SFT_fine_tuned_llama_3.1_8B"
model.load_adapter(adapter_path, adapter_name="default")

# Prefer the GPU when one is available; switch to eval mode and move the model.
# `device` is also used later to place the tokenized prompt.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval().to(device)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-20 03:52:31 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-20 03:52:31 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.5.post1.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.5.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [2]:
# === I/O configuration ===
# JSONL test set read line by line by the inference loop.
input_path  = "/shared/s1/lab11/BigqueryML/test_set_small.jsonl"
# JSONL file the inference loop writes one result record per input line to.
output_path = "/shared/s1/lab11/BigqueryML/test_set_small_inference_self_consistency(16).jsonl"

# === Prompt 생성 ===
def make_prompt(instruction, input_table):
    """Build the extraction prompt for a single example.

    The prompt starts with a fixed task description and output-format
    specification, followed by the question and the data dictionary, each
    under its own ``###`` heading.

    Args:
        instruction: Natural-language question; formatted into the prompt.
        input_table: Data dictionary; formatted into the prompt.

    Returns:
        The full prompt string (it begins and ends with a newline).
    """
    # Fixed part of the prompt: contains no placeholders.
    task_spec = """
We aim to extract structured machine learning configuration arguments and conditions from natural language question and a given data dictionary.
These arguments are essential to automatically generate BigQuery ML SQL code.
The output must strictly follow the specified format and use the keys as described below.

### Output Format:
The output should be a JSON object containing the following keys:

1. **time_series** (boolean): Indicates whether the model is intended for time series forecasting.
   - Example: "time_series": "False"
   - Use "True" if Input involves time columns such as "Date", otherwise "False".

2. **target_column** (string): The column name that represents the target variable to predict.
   - Use the format: "<col>column_name</col>"
   - Example: "target_column": "<col>clarity</col>"

3. **inference_condition** (list of strings): A list of conditions used for inference or prediction. Each condition should specify a column, an operator, and a value.
   - Use the format: "<col>column_name</col><op>operator</op><val>value</val>"
   - Multiple conditions can be provided as a list.
   - Example: "inference_condition": ["<col>carat</col><op>>=</op><val>1.0</val>", "<col>color</col><op>=</op><val>J</val>"]

4. **update_condition** (list of strings, optional): A list of conditions for updating the data or model. Similar to `inference_condition`, it specifies column, operator, and value.
   - Example: "update_condition": ["<col>color</col><op>=</op><val>G</val>"]
   - - If there is no change in the conditions as per the instruction, this key should not be generated.

5. **task** (string): The type of machine learning task to perform.
   - Common values: "classification", "regression", "clustering", "anomaly_detection"
   - Example: "task": "classification"

"""
    # Per-example part: the question followed by the data dictionary.
    example_part = (
        f"### Natural Language Question:\n{instruction}\n\n"
        f"### Data Dictionary:\n{input_table}\n"
    )
    return task_spec + example_part

In [3]:
# === Self-Consistency Inference 함수 ===
from collections import Counter

import json
import re
from pathlib import Path
from typing import Optional

# Matches a "### Output" heading line (optional colon, case-insensitive) and
# captures everything after it up to the next line that starts with "###",
# or to the end of the string.
PAT = re.compile(
    r"^\s*###\s*Output:?\s*[\r\n]+(.*?)(?=^\s*###|\Z)",
    re.IGNORECASE | re.DOTALL | re.MULTILINE,
)


def extract_output_block(pred_str: str) -> Optional[str]:
    """Return the body of the first '### Output:' block in ``pred_str``.

    The captured text is stripped of surrounding whitespace.
    Returns None when no such block is present.
    """
    found = PAT.search(pred_str)
    if found is None:
        return None
    return found.group(1).strip()

def infer_self_consistency(
    prompt: str,
    num_samples: int = 2,
    temperature: float = 0.7,
    top_p: float = 0.8,
    max_new_tokens: int = 2700
):
    """Sample several completions for ``prompt`` and pick the most frequent one.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Full prompt text to condition on.
        num_samples: Number of sequences to sample (``num_return_sequences``).
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        max_new_tokens: Generation length limit passed to ``generate``.

    Returns:
        A tuple ``(best_output, total_full_tokens, samples)``:
        the most common sample text, the summed length of all returned
        sequences, and the list of per-sample texts.
    """
    # Tokenize the prompt and place the tensors on the model's device.
    encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)

    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        num_return_sequences=num_samples,
        return_dict_in_generate=True,
        output_scores=False,
    )

    # Sampling only; no gradients are needed.
    with torch.no_grad():
        generation = model.generate(**encoded_prompt, **sampling_kwargs)
    sequences = generation.sequences

    # Decode each sequence; keep just the '### Output:' block when there is
    # one, otherwise fall back to the whole decoded text.
    samples = []
    for token_ids in sequences:
        text = tokenizer.decode(token_ids, skip_special_tokens=True)
        block = extract_output_block(text)
        if block is None:
            block = text.strip()
        samples.append(block)

    # Majority vote over the exact sample strings.
    best_output = Counter(samples).most_common(1)[0][0]

    # Sum of the last-dimension length of every returned sequence.
    # NOTE(review): the rows of `sequences` presumably share one padded length,
    # so this may count padding positions as well — confirm.
    total_full_tokens = sum(token_ids.shape[-1] for token_ids in sequences)

    return best_output, total_full_tokens, samples


In [4]:
# === JSONL processing ===
# Both files are opened explicitly as UTF-8: records are dumped with
# ensure_ascii=False, and the re-ranking step reopens the output as UTF-8.
with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:
    for line in tqdm(infile):
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue

        ex = json.loads(line)
        instruction = ex["instruction"]
        input_table = ex["input"]
        ground_truth = ex.get("output")

        prompt = make_prompt(instruction, input_table)
        try:
            best, total_tokens, all_samples = infer_self_consistency(
                prompt,
                num_samples=16,      # adjust the number of samples as needed
                temperature=0.8,
                top_p=0.9,
            )
        except Exception as e:
            # Best effort: record the failure for this example and keep going.
            best, total_tokens, all_samples = f"ERROR: {e}", 0, []

        result = {
            "instruction": instruction,
            "input": input_table,
            "ground_truth": ground_truth,
            "selfconsistency_best": best,
            "total_tokens": total_tokens,
            "all_samples": all_samples
        }

        outfile.write(json.dumps(result, ensure_ascii=False) + "\n")
        # Each example is slow to generate; flush so finished records survive
        # an interrupted run.
        outfile.flush()

200it [2:50:16, 51.08s/it]


In [None]:
# EVAL

import re
from difflib import SequenceMatcher


class RewardCalculator:
    """Score a predicted configuration dict against a ground-truth dict.

    Every key is compared with a key-specific matcher (see ``_match``) and the
    per-key similarities are combined using fixed weights.

    ``normalize`` selects one of two modes:

    * ``normalize=True`` ("filter" mode): a failed comparison is worth ``0.0``
      and the final score is normalized into ``[0, 1]``.
    * ``normalize=False`` ("grpo" mode): a failed comparison is worth ``-1.0``
      and the final score is the raw weighted sum.
    """

    def __init__(
        self,
        weights = None,
        use_fuzzy_matching = True,
        normalize = True,  # True selects "filter" mode, False selects "grpo" mode
        verbose = False
    ):
        """
        Args:
            weights: Optional mapping of key -> weight; falls back to the
                built-in weights when falsy.
            use_fuzzy_matching: Stored on the instance; not read by any method
                of this class.
            normalize: Mode switch, see the class docstring.
            verbose: When True, ``weighted_accuracy`` prints its diagnostics.
        """
        self.use_fuzzy_matching = use_fuzzy_matching
        self.verbose = verbose
        self.normalize = normalize
        # Value returned by every failed comparison.
        self.fail_val = 0.0 if normalize else -1.0

        # Fixed weights
        self.weights = weights or {
            "time_series": 0.1,
            "target_column": 0.5,
            "inference_condition": 0.3,
            "update_condition": 0.4,
            "task": 0.7
        }

        # Required keys
        self.required_keys = ["time_series", "target_column", "inference_condition", "task"]


    def _match(self, a, b, key=None):
        """Return the similarity between prediction ``a`` and ground truth ``b``.

        * ``inference_condition`` / ``update_condition``: conditions are paired
          greedily (each ground-truth condition used at most once) and the
          pair scores are averaged over the longer of the two lists.
        * ``target_column``: the prediction must contain both ``<col>`` tags;
          the tag-stripped, lower-cased names must then be equal.
        * any other key: Jaccard similarity of lower-cased whitespace tokens.

        A failed comparison yields ``self.fail_val``.
        """

        def jaccard(set1, set2):
            """Jaccard similarity; zero overlap or an empty union is a failure."""
            intersection = len(set1 & set2)
            union = len(set1 | set2)
            if union != 0:
                # Jaccard similarity
                result = intersection / union
                if result == 0.0:
                    return self.fail_val
                return result
            else:
                return self.fail_val

        def extract_condition_parts(condition_str):
            """Split a condition string into ``(column, operator, value)``."""
            condition_str = condition_str.strip()

            col = re.findall(r"<col>(.*?)</col>", condition_str)
            op = re.findall(r"<op>(.*?)</op>", condition_str)
            val = re.findall(r"<val>(.*?)</val>", condition_str)

            col_val = col[0].strip() if col else ""
            op_val = op[0].strip() if op else ""
            val_val = val[0].strip() if val else ""

            # Heuristic fallback when any tagged part is missing
            if not (col_val and op_val and val_val):
                # Remove tags to simplify raw parsing
                clean_str = re.sub(r"</?[^>]+>", "", condition_str)

                # Try simple expression pattern: col op val
                match = re.match(r"([a-zA-Z0-9_.]+)\s*([=!<>]+)\s*(.+)", clean_str)
                if match:
                    if not col_val:
                        col_val = match.group(1).strip()
                    if not op_val:
                        op_val = match.group(2).strip()
                    if not val_val:
                        val_val = match.group(3).strip()

            return col_val, op_val, val_val

        def tag_completeness_score(cond_str, tags):
            """Fraction of ``tags`` that occur as substrings of ``cond_str``."""
            present = sum(tag in cond_str for tag in tags)
            return present / len(tags)

        def score_pair(a_cond, b_cond):
            """Score one predicted condition against one ground-truth condition."""
            a_cond = a_cond.strip().lower()
            b_cond = b_cond.strip().lower()
            a_col, a_op, a_val = extract_condition_parts(a_cond)
            b_col, b_op, b_val = extract_condition_parts(b_cond)

            col_score = jaccard({a_col}, {b_col})
            op_score = jaccard({a_op}, {b_op})
            val_score = int(SequenceMatcher(None, a_val, b_val).ratio() >= 0.9)

            # jaccard() reports a mismatch as fail_val, which is 0.0 in filter
            # mode but -1.0 in grpo mode. Testing `<= 0` catches both; testing
            # `== 0` let grpo-mode mismatches through with a positive score.
            if col_score <= 0 or op_score <= 0 or val_score == 0:
                return self.fail_val

            avg_score = (col_score + op_score + val_score) / 3
            # Only the prediction is required to carry all six tags.
            tag_score = tag_completeness_score(a_cond, ["<col>", "</col>", "<op>", "</op>", "<val>", "</val>"])
            if tag_score != 1.0:
                return self.fail_val

            return avg_score

        if key in {"inference_condition", "update_condition"}:
            a_list = a if isinstance(a, list) else [a]
            b_list = b if isinstance(b, list) else [b]

            if not a_list and not b_list:
                return 1.0
            if not a_list or not b_list:
                return self.fail_val

            used_b_indices = set()
            matched_scores = []

            for a_cond in a_list:
                best_score = self.fail_val
                best_j = None
                for j, b_cond in enumerate(b_list):
                    if j in used_b_indices:
                        continue
                    score = score_pair(a_cond, b_cond)
                    if score > best_score:
                        best_score = score
                        best_j = j
                if best_j is not None:
                    used_b_indices.add(best_j)
                matched_scores.append(best_score)  # stays at fail_val if unmatched

            # Final score is average over max(len(predicted), len(ground_truth))
            final_score = sum(matched_scores) / max(len(a_list), len(b_list))
            return final_score

        # Default Jaccard (for all non-condition fields)
        a_str = " ".join(map(str, a)) if isinstance(a, list) else str(a)
        b_str = " ".join(map(str, b)) if isinstance(b, list) else str(b)

        if not a_str.strip() and not b_str.strip():
            return 1.0
        if not a_str.strip() or not b_str.strip():
            return self.fail_val

        if key == "target_column":

            def strip_tags(text):
                return re.sub(r"</?[^>]+>", "", text).strip().lower()

            a_clean = strip_tags(a_str)
            b_clean = strip_tags(b_str)

            # The prediction must wrap the column name in <col>...</col>.
            tag_score = tag_completeness_score(a_str, ["<col>", "</col>"])
            if tag_score != 1.0:
                return self.fail_val

            sim_score = jaccard({a_clean}, {b_clean})
            return sim_score

        a_tokens = set(a_str.lower().split())
        b_tokens = set(b_str.lower().split())

        return jaccard(a_tokens, b_tokens)


    def weighted_accuracy(self, predicted, ground_truth):
        """Compute the weighted score plus per-key diagnostics.

        Args:
            predicted: Predicted configuration dict (not modified).
            ground_truth: Reference configuration dict.

        Returns:
            ``(final_score, matches, diagnostics)`` where ``final_score`` is
            rounded to 6 decimals, ``matches`` maps key -> similarity and
            ``diagnostics`` maps key -> weight * similarity.
        """
        matches = {}
        diagnostics = {}

        weights = self.weights.copy()

        # Required keys
        for key in self.required_keys:
            sim_score = self._match(predicted.get(key, []), ground_truth.get(key, []), key=key)
            matches[key] = sim_score
            diagnostics[key] = weights.get(key, 0) * sim_score

        # Optional key: update_condition is scored only when the ground truth
        # has a non-empty one.
        has_update_condition = "update_condition" in ground_truth and (ground_truth.get("update_condition") not in (None, []))
        if has_update_condition:
            # Treat a missing/None prediction as an empty list without
            # writing back into the caller's dict.
            predicted_update = predicted.get("update_condition")
            if predicted_update is None:
                predicted_update = []
            sim_score = self._match(
                predicted_update,
                ground_truth.get("update_condition", []),
                key="update_condition"
            )
            matches["update_condition"] = sim_score
            # Default to 0 like the required keys so custom weights that omit
            # this key do not raise.
            diagnostics["update_condition"] = weights.get("update_condition", 0) * sim_score

        # Dynamically decide which keys to include in normalization
        active_keys = self.required_keys.copy()
        if has_update_condition:
            active_keys.append("update_condition")

        if self.normalize:
            # filter mode: normalize the score
            max_possible_score = sum(weights.get(k, 0) for k in active_keys)
            weighted_score = sum(diagnostics.values())
            if max_possible_score:
                final_score = max(0.0, min(1.0, weighted_score / max_possible_score))
            else:
                # All active weights are zero: nothing can be earned.
                final_score = 0.0
        else:
            # grpo mode: No normalization
            final_score = sum(diagnostics.values())

        if self.verbose:
            print("[Reward Diagnostics]")
            print("Matches:", matches)
            print("Diagnostics (per-key contribution):", diagnostics)
            print("Weighted Score:", final_score)

        return round(final_score, 6), matches, diagnostics

    def get_min_possible_reward(self, ground_truth, convert_to_max=False):
        """Return ``fail_val`` times the summed weights of the ground-truth keys.

        Keys without a configured weight are ignored. With
        ``convert_to_max=True`` the sign of the result is flipped.
        """
        w = sum([self.weights.get(k) for k in ground_truth.keys() if self.weights.get(k) is not None])
        r = round(self.fail_val * w, 6)
        return -r if convert_to_max else r

    def self_check(self, intermediate_output):
        """Check that every required key is present.

        Returns a dict with ``status`` "VALID" (plus the checked output) or
        "INVALID" (plus a ``reason`` naming the first missing key).
        """
        for key in self.required_keys:
            if key not in intermediate_output:
                return {"status": "INVALID", "reason": f"Missing required key: {key}"}
        return {"status": "VALID", "intermediate_output": intermediate_output}

In [None]:
# evaluator/evaluator.py

class Evaluator:
    """Thin wrapper around ``RewardCalculator`` with two output shapes.

    * ``filter`` mode: returns ``keep`` (score >= threshold) and ``score``.
    * ``grpo`` mode: returns ``reward``.
    """

    def __init__(self, mode: str = "filter", threshold: float = 0.9, verbose: bool = False):
        """
        Args:
            mode: 'filter' or 'grpo'
            threshold: filter cutoff (used only in filter mode)
            verbose: forwarded to the underlying RewardCalculator
        """
        assert mode in ("filter", "grpo")
        self.mode = mode
        self.threshold = threshold
        # filter mode uses the normalized calculator, grpo mode the raw one.
        self.calculator = RewardCalculator(verbose=verbose, normalize=True if mode == "filter" else False)

    def evaluate(self, predicted: dict, ground_truth: dict) -> dict:
        """Score ``predicted`` against ``ground_truth``.

        A prediction that is not a dict, or that lacks a required key, is
        reported as INVALID: filter mode returns ``keep=False`` with score 0.0;
        grpo mode returns the minimum possible reward for ``ground_truth``.

        Returns:
            A dict with ``keep``/``score`` (filter) or ``reward`` (grpo), plus
            a ``diagnostics`` entry.
        """
        # Parsed model output is not guaranteed to be a mapping; report it as
        # invalid instead of failing inside the calculator.
        if isinstance(predicted, dict):
            validity = self.calculator.self_check(predicted)
        else:
            validity = {
                "status": "INVALID",
                "reason": f"Prediction is not a dict: {type(predicted).__name__}",
            }

        if validity["status"] != "VALID":
            result = {"score": 0.0, "diagnostics": {"status": validity["status"], "reason": validity["reason"]}}
            if self.mode == "filter":
                return {"keep": False, **result}
            elif self.mode == "grpo":
                return {"reward": self.calculator.get_min_possible_reward(ground_truth), **result}

        score, matches, diagnostics = self.calculator.weighted_accuracy(predicted, ground_truth)

        if self.mode == "filter":
            return {
                "keep": score >= self.threshold,
                "score": score,
                "diagnostics": {
                    "matches": matches,
                    "contributions": diagnostics,
                    "total": score
                }
            }
        elif self.mode == "grpo":
            return {
                "reward": score,
                "diagnostics": {
                    "matches": matches,
                    "contributions": diagnostics,
                    "total": score
                }
            }

In [None]:
import json
import ast
from pathlib import Path
from tqdm import tqdm

# ─── 1) File paths ─────────────────────────────────────────────────
# Input: the JSONL file of self-consistency inference results.
INPUT_PATH  = Path("/shared/s1/lab11/BigqueryML/test_set_small_inference_self_consistency(16).jsonl")
# Output: same directory, input stem plus a "_with_best_of_n.jsonl" suffix.
OUTPUT_PATH = INPUT_PATH.with_name(INPUT_PATH.stem + "_with_best_of_n.jsonl")

# ─── 2) Build the evaluator (pick filter or grpo mode) ─────────────
evaluator = Evaluator(mode="filter", verbose=False)   # use "grpo" if only the score is needed

# ─── 3) 유틸: 문자열(dict 형태) → 파이썬 dict 변환 ───────────────
def str_to_dict(sample_str: str) -> dict:
    """Parse a dict-shaped string into a Python dict.

    Python-literal syntax (mixed single/double quotes, ``True``/``None``) is
    tried first via ``ast.literal_eval``; JSON syntax (``true``/``null``) is
    accepted as a fallback via ``json.loads``. Neither parser executes code.

    Args:
        sample_str: Text expected to hold a single dict literal / JSON object.

    Returns:
        The parsed dict.

    Raises:
        ValueError: If the text cannot be parsed, or parses to something
            other than a dict.
    """
    try:
        parsed = ast.literal_eval(sample_str)
    except Exception as literal_error:
        try:
            # JSON-only literals such as true/false/null are rejected by
            # literal_eval but are valid model output.
            parsed = json.loads(sample_str)
        except Exception:
            raise ValueError(f"샘플 문자열 파싱 실패: {literal_error}\n{sample_str}") from literal_error

    # The evaluator calls dict methods on the result, so reject lists,
    # strings, numbers, ... here with a clear error.
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Parsed sample is not a dict ({type(parsed).__name__}):\n{sample_str}"
        )
    return parsed

# ─── 4) Main loop ──────────────────────────────────────────────────
# The result key depends only on the evaluator mode, so resolve it once:
# grpo mode reports "reward", filter mode reports "score".
score_key = "reward" if evaluator.mode == "grpo" else "score"

with INPUT_PATH.open(encoding="utf-8") as fin, \
     OUTPUT_PATH.open("w", encoding="utf-8") as fout:

    for line in tqdm(fin, desc="rerank"):
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        record = json.loads(line)

        # The stored ground truth may be missing/None, or a dict serialized
        # as a string; the evaluator needs a dict.
        ground_truth = record.get("ground_truth") or {}
        if isinstance(ground_truth, str):
            ground_truth = str_to_dict(ground_truth)
        samples_str = record.get("all_samples") or []

        # Score every sample
        scored = []
        for s in samples_str:
            try:
                pred_dict = str_to_dict(s)
            except ValueError:
                pred_dict = {}
            if not isinstance(pred_dict, dict):
                pred_dict = {}
            # An unparseable sample is scored as an empty prediction, which
            # the evaluator reports as INVALID, rather than aborting the run.
            result = evaluator.evaluate(pred_dict, ground_truth)
            scored.append((result.get(score_key, 0.0), s))

        # Pick the highest-scoring sample (first one wins on ties).
        # Records whose inference failed have no samples at all.
        if scored:
            best_score, best_sample_str = max(scored, key=lambda pair: pair[0])
        else:
            best_score, best_sample_str = None, None

        # Add the new fields
        record["best_of_n"]      = best_sample_str
        record["best_of_n_score"] = best_score   # for debugging (remove if unwanted)

        fout.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"✅ 저장 완료 → {OUTPUT_PATH}")
