<a href="https://colab.research.google.com/github/prathamesh170206/DarkPhantom.github.io/blob/main/new_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from google.colab import files
uploaded = files.upload()

Saving test_prompts_orders.json to test_prompts_orders.json


In [10]:
import os
# Disable external logging/telemetry that prompts for API keys
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

import json
import random
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from collections import Counter

# -------------------------------
# Config
# -------------------------------
data_path = "/content/sample_data/hq_orders_augmented_2000.json"
model_save_path = "/content/saved_models"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

POKEMON = ["Bulbasaur", "Charizard", "Pikachu", "Mewtwo"]

# Manual label mapping to preserve order
label2idx = {label: idx for idx, label in enumerate(POKEMON)}
idx2label = {idx: label for idx, label in enumerate(POKEMON)}

print("POKEMON list order:", POKEMON)
print("Manual label2idx mapping:", label2idx)

# Seed
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

# Load data
with open(data_path) as f:
    hq_orders = json.load(f)

random.shuffle(hq_orders)
test_orders = hq_orders[:100]
val_orders = hq_orders[100:200]
train_orders = hq_orders[200:]

# MultiLabelBinarizer for protect labels
mlb_protect = MultiLabelBinarizer(classes=POKEMON)
mlb_protect.fit([POKEMON])

# Dataset class with NO .to(device) in __getitem__
class HQDataset(Dataset):
    def __init__(self, data, tokenizer, task="target", max_length=128):
        self.prompts = [d["prompt"] for d in data]
        self.task = task
        self.tokenizer = tokenizer
        self.max_length = max_length

        if task == "target":
            self.labels = [label2idx[d["target"]] for d in data]
        elif task == "protect":
            self.labels = []
            for d in data:
                protected = d.get("protected", [])
                if isinstance(protected, str):
                    protected = [protected]
                elif protected is None:
                    protected = []
                vec = mlb_protect.transform([protected])[0]
                self.labels.append(vec)
        else:
            raise ValueError("Task must be 'target' or 'protect'")

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        prompt = self.prompts[idx]
        encoding = self.tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {key: val.squeeze() for key, val in encoding.items()}
        item["labels"] = torch.tensor(
            self.labels[idx],
            dtype=torch.long if self.task == "target" else torch.float
        )
        return item

# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

train_dataset_target = HQDataset(train_orders, tokenizer, task="target")
val_dataset_target = HQDataset(val_orders, tokenizer, task="target")
test_dataset_target = HQDataset(test_orders, tokenizer, task="target")

train_dataset_protect = HQDataset(train_orders, tokenizer, task="protect")
val_dataset_protect = HQDataset(val_orders, tokenizer, task="protect")
test_dataset_protect = HQDataset(test_orders, tokenizer, task="protect")

# Helper: load fine-tuned model if available, else init base
def load_or_init_target():
    ckpt_dir = model_save_path + "_target"
    if os.path.isdir(ckpt_dir) and os.path.isfile(os.path.join(ckpt_dir, "config.json")):
        print(f"Loading fine-tuned target model from {ckpt_dir}")
        return RobertaForSequenceClassification.from_pretrained(ckpt_dir).to(device)
    else:
        print("No fine-tuned target checkpoint found. Initializing from roberta-base.")
        return RobertaForSequenceClassification.from_pretrained(
            "roberta-base", num_labels=len(POKEMON)
        ).to(device)

def load_or_init_protect():
    ckpt_dir = model_save_path + "_protect"
    if os.path.isdir(ckpt_dir) and os.path.isfile(os.path.join(ckpt_dir, "config.json")):
        print(f"Loading fine-tuned protect model from {ckpt_dir}")
        return RobertaForSequenceClassification.from_pretrained(ckpt_dir).to(device)
    else:
        print("No fine-tuned protect checkpoint found. Initializing from roberta-base.")
        return RobertaForSequenceClassification.from_pretrained(
            "roberta-base", num_labels=len(POKEMON), problem_type="multi_label_classification"
        ).to(device)

# Models
model_target = load_or_init_target()
model_protect = load_or_init_protect()

# Class weights calculation
from collections import Counter
counts = Counter([d["target"] for d in train_orders])
total = sum(counts.values())
class_weights = [total / counts[cls] for cls in POKEMON]
class_weights_tensor = torch.tensor(class_weights).to(device)
print(f"Class weights: {class_weights}")

# Custom Trainer with weighted loss
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        if model.config.problem_type == "multi_label_classification":
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)
        else:
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Metrics
def compute_metrics_target(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}

def compute_metrics_protect(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).numpy()
    acc = np.mean(np.all(preds == labels, axis=1))
    return {"exact_match_accuracy": acc}

# Training args with report_to="none" to silence external integrations
training_args_target = TrainingArguments(
    output_dir=model_save_path + "_target",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir=model_save_path + "_logs_target",
    learning_rate=1e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=True,
    gradient_accumulation_steps=2,
    report_to="none"
)

training_args_protect = TrainingArguments(
    output_dir=model_save_path + "_protect",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir=model_save_path + "_logs_protect",
    learning_rate=1e-5,
    load_best_model_at_end=True,
    metric_for_best_model="exact_match_accuracy",
    greater_is_better=True,
    fp16=True,
    gradient_accumulation_steps=2,
    report_to="none"
)

trainer_target = WeightedTrainer(
    model=model_target,
    args=training_args_target,
    train_dataset=train_dataset_target,
    eval_dataset=val_dataset_target,
    compute_metrics=compute_metrics_target
)

trainer_protect = Trainer(
    model=model_protect,
    args=training_args_protect,
    train_dataset=train_dataset_protect,
    eval_dataset=val_dataset_protect,
    compute_metrics=compute_metrics_protect
)

# Train or skip; if skipped, we still have loaded fine-tuned models if present
#trainer_target.train()
trainer_target.save_model(model_save_path + "_target")
#trainer_protect.train()
trainer_protect.save_model(model_save_path + "_protect")

print("Setup complete. Models are ready (loaded from checkpoints if available).")


# Interactive predict loop
print("\n=== HQ Order Prediction ===\nType your HQ order prompt and press Enter. Type 'exit' to quit.\n")

while True:
    user_input = input("Enter HQ order prompt: ")
    if user_input.lower() == "exit":
        print("Exiting...")
        break

    encoding = tokenizer(
        user_input,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )
    encoding = {k: v.to(device) for k, v in encoding.items()}

    model_target.eval()
    with torch.no_grad():
        output_target = model_target(**encoding)
        logits = output_target.logits.cpu().numpy()
        target_idx = int(np.argmax(logits, axis=1)[0])
        target_pokemon = idx2label[target_idx]

    model_protect.eval()
    with torch.no_grad():
        output_protect = model_protect(**encoding)
        probs = torch.sigmoid(output_protect.logits)
        protected_idx = (probs > 0.5).cpu().numpy()
        protected_pokemon = mlb_protect.inverse_transform(protected_idx)[0]

    print(f"\nPredicted Target: {target_pokemon}")



Using device: cuda
POKEMON list order: ['Bulbasaur', 'Charizard', 'Pikachu', 'Mewtwo']
Manual label2idx mapping: {'Bulbasaur': 0, 'Charizard': 1, 'Pikachu': 2, 'Mewtwo': 3}
Loading fine-tuned target model from /content/saved_models_target
Loading fine-tuned protect model from /content/saved_models_protect
Class weights: [4.209445585215605, 3.4569983136593594, 4.141414141414141, 4.315789473684211]
Setup complete. Models are ready (loaded from checkpoints if available).
UTF-8 load failed: 'utf-8' codec can't decode byte 0x92 in position 13990: invalid start byte
Loaded 200 prompts from /content/test_prompts_orders.json
Wrote 200 targets to /content/saved_models

=== HQ Order Prediction ===
Type your HQ order prompt and press Enter. Type 'exit' to quit.

Enter HQ order prompt: HQ REPORT: Situation analysis regarding unusual activity of scaled fire titan in this operational zone. Draw minimal bloodline; photographic evidence is priority. Use thermal masking as a decoy if pursuit is necessa


KeyboardInterrupt



In [None]:
import shutil
zip_base = "/content/saved_models_target"
zip_path = "/content/saved_models_target.zip"
shutil.make_archive(zip_base, "zip", zip_base)  # creates /content/saved_models_target.zip

from google.colab import files
files.download(zip_path)


In [19]:
import os
# Disable external logging/telemetry
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

import json
import random
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from collections import Counter

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

# -------------------------------
# Config
# -------------------------------
data_path = "/content/sample_data/hq_orders_augmented_2000.json"
model_save_path = "/content/saved_models_2"  # base path (will append _target)
target_ckpt_dir = model_save_path + "_target"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

POKEMON = ["Bulbasaur", "Charizard", "Pikachu", "Mewtwo"]
label2idx = {label: idx for idx, label in enumerate(POKEMON)}
idx2label = {idx: label for idx, label in enumerate(POKEMON)}
print("POKEMON list order:", POKEMON)
print("Manual label2idx mapping:", label2idx)

# -------------------------------
# Reproducibility
# -------------------------------
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# -------------------------------
# Load data
# -------------------------------
with open(data_path, "r", encoding="utf-8") as f:
    hq_orders = json.load(f)

texts = [d["prompt"] for d in hq_orders]
targets = [d["target"] for d in hq_orders]

# Stratified 80/10/10 split
X_train, X_temp, y_train, y_temp = train_test_split(
    texts, targets, test_size=0.2, random_state=42, stratify=targets
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

train_orders = [{"prompt": p, "target": t} for p, t in zip(X_train, y_train)]
val_orders   = [{"prompt": p, "target": t} for p, t in zip(X_val, y_val)]
test_orders  = [{"prompt": p, "target": t} for p, t in zip(X_test, y_test)]

# -------------------------------
# Dataset
# -------------------------------
MAX_LEN = 192  # increase if VRAM allows

class HQDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=MAX_LEN):
        self.prompts = [d["prompt"] for d in data]
        self.labels = [label2idx[d["target"]] for d in data]
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.prompts)
    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.prompts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# -------------------------------
# Tokenizer & Model (DeBERTa v3 base)
# -------------------------------
base_model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

train_dataset = HQDataset(train_orders, tokenizer, max_length=MAX_LEN)
val_dataset   = HQDataset(val_orders, tokenizer, max_length=MAX_LEN)
test_dataset  = HQDataset(test_orders, tokenizer, max_length=MAX_LEN)

def load_or_init_model():
    if os.path.isdir(target_ckpt_dir) and os.path.isfile(os.path.join(target_ckpt_dir, "config.json")):
        print(f"Loading fine-tuned model from {target_ckpt_dir}")
        return AutoModelForSequenceClassification.from_pretrained(target_ckpt_dir).to(device)
    else:
        print("No checkpoint found. Initializing from base.")
        return AutoModelForSequenceClassification.from_pretrained(
            base_model_name, num_labels=len(POKEMON)
        ).to(device)

model = load_or_init_model()

# -------------------------------
# Class weights from train only (capped)
# -------------------------------
counts = Counter(y_train)
total = sum(counts.values())
max_w = 5.0
class_weights = [min(total / counts[c], max_w) for c in POKEMON]
class_weights_tensor = torch.tensor(class_weights).to(device)
print("Class counts:", dict(counts))
print("Class weights (capped):", class_weights)

# -------------------------------
# Metrics
# -------------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}

# -------------------------------
# Trainer with safe loss (no fp16, no checkpointing)
# -------------------------------
class WeightedTrainer(Trainer):
    def __init__(self, *args, label_smoothing=0.05, **kwargs):
        super().__init__(*args, **kwargs)
        self.label_smoothing = float(label_smoothing)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Extract labels and ensure correct type/device
        labels = inputs.pop("labels")
        labels = labels.to(model.device).long().contiguous()

        # Forward pass on remaining inputs
        outputs = model(**inputs)
        logits = outputs.logits  # [B, C]

        # Fresh class weights each step on correct device
        cw = class_weights_tensor.to(logits.device)

        if self.label_smoothing > 0.0:
            num_classes = logits.size(-1)
            smooth = self.label_smoothing
            with torch.no_grad():
                true_dist = torch.full_like(logits, smooth / (num_classes - 1))
                true_dist.scatter_(1, labels.unsqueeze(1), 1.0 - smooth)
            log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
            loss_per_class = -(true_dist * log_probs)                 # no grad via true_dist
            loss = (loss_per_class * cw.unsqueeze(0)).sum(-1).mean()  # weighted mean
        else:
            ce = torch.nn.CrossEntropyLoss(weight=cw)
            loss = ce(logits.view(-1, logits.size(-1)), labels.view(-1))

        if return_outputs:
            return loss, outputs
        return loss

# -------------------------------
# TrainingArguments with fixed LR ("vectorizer"-style)
# -------------------------------
training_args = TrainingArguments(
    output_dir=target_ckpt_dir,
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=2e-5,            # fixed LR, no scheduler
    lr_scheduler_type="constant",  # fixed LR
    warmup_ratio=0.0,              # no warmup
    weight_decay=0.0,              # no weight decay
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    fp16=False,
    bf16=False,
    gradient_checkpointing=False,
    report_to="none",
    seed=42
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    label_smoothing=0.05
)

# -------------------------------
# Train, save, evaluate
# -------------------------------
#trainer.train()
trainer.save_model(target_ckpt_dir)

val_metrics = trainer.evaluate(eval_dataset=val_dataset)
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Validation metrics:", val_metrics)
print("Test metrics:", test_metrics)

# -------------------------------
# Reload for inference (explicit)
# -------------------------------
print("\nReloading best checkpoint for inference...")
inference_model = AutoModelForSequenceClassification.from_pretrained(target_ckpt_dir).to(device)
inference_model.eval()

import os
import json
import torch
from torch.utils.data import Dataset, DataLoader

# Windows paths
input_json = "/content/test_prompts_orders.json"
output_txt = "/content/saved_models_2"

def load_json_robust(path):
    # Try UTF-8 first (with BOM strip)
    try:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        if text and text[0] == "\ufeff":
            text = text.lstrip("\ufeff")
        return json.loads(text)
    except Exception as e_utf8:
        print(f"UTF-8 load failed: {e_utf8}")

    # Try cp1252 (Windows-1252)
    try:
        with open(path, "r", encoding="cp1252", errors="strict") as f:
            text = f.read()
        if text and text[0] == "\ufeff":
            text = text.lstrip("\ufeff")
        return json.loads(text)
    except Exception as e_cp:
        print(f"cp1252 load failed: {e_cp}")

    # Try JSON Lines with UTF-8
    items = []
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                items.append(json.loads(line))
        if items:
            print("Loaded as UTF-8 JSONL")
            return items
    except Exception as e_jl_utf8:
        print(f"UTF-8 JSONL failed: {e_jl_utf8}")

    # Try JSON Lines with cp1252
    try:
        with open(path, "r", encoding="cp1252", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                items.append(json.loads(line))
        if items:
            print("Loaded as cp1252 JSONL")
            return items
    except Exception as e_jl_cp:
        print(f"cp1252 JSONL failed: {e_jl_cp}")

    raise RuntimeError(f"Could not parse JSON at {path} with UTF-8/cp1252 or JSONL.")

class PromptOnlyDataset(Dataset):
    def __init__(self, prompts, tokenizer, max_length=128):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.prompts)
    def __getitem__(self, idx):
        text = self.prompts[idx]
        enc = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {k: v.squeeze(0) for k, v in enc.items()}

def write_targets_for_orders_json(json_path, output_path, batch_size=64):
    data = load_json_robust(json_path)

    # Support dict-wrapped formats
    if isinstance(data, dict):
        for k in ("items", "data", "orders"):
            if k in data and isinstance(data[k], list):
                data = data[k]
                break

    prompts = [d["prompt"] for d in data if isinstance(d, dict) and "prompt" in d]
    print(f"Loaded {len(prompts)} prompts from {json_path}")

    ds = PromptOnlyDataset(prompts, tokenizer, max_length=128)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False)

    model_target.eval()
    pred_labels = []
    with torch.no_grad():
        for batch in dl:
            inputs = {k: v.to(device) for k, v in batch.items()}
            logits = model_target(**inputs).logits
            pred_idx = torch.argmax(logits, dim=1).cpu().tolist()
            pred_labels.extend([idx2label[i] for i in pred_idx])

    with open(output_path, "w", encoding="utf-8") as f:
        for lbl in pred_labels:
            f.write(f"{lbl}\n")

    print(f"Wrote {len(pred_labels)} targets to {output_path}")

# Run once to create the output file
write_targets_for_orders_json(input_json, output_txt, batch_size=64)
# -------------------------------
# Interactive predict loop (uses reloaded model)
# -------------------------------
print("\n=== HQ Order Prediction (type 'exit' to quit) ===\n")
while True:
    user_input = input("Enter HQ order prompt: ")
    if user_input.lower().strip() == "exit":
        print("Exiting...")
        break
    enc = tokenizer(
        user_input, truncation=True, padding="max_length",
        max_length=MAX_LEN, return_tensors="pt"
    )
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        logits = inference_model(**enc).logits
        target_idx = int(torch.argmax(logits, dim=1).item())
    print(f"Predicted Target: {idx2label[target_idx]}\n")


Using device: cuda
POKEMON list order: ['Bulbasaur', 'Charizard', 'Pikachu', 'Mewtwo']
Manual label2idx mapping: {'Bulbasaur': 0, 'Charizard': 1, 'Pikachu': 2, 'Mewtwo': 3}




Loading fine-tuned model from /content/saved_models_2_target
Class counts: {'Mewtwo': 420, 'Charizard': 532, 'Pikachu': 428, 'Bulbasaur': 420}
Class weights (capped): [4.285714285714286, 3.3834586466165413, 4.205607476635514, 4.285714285714286]


Validation metrics: {'eval_loss': 1.0344481468200684, 'eval_model_preparation_time': 0.0053, 'eval_accuracy': 1.0, 'eval_runtime': 3.554, 'eval_samples_per_second': 63.309, 'eval_steps_per_second': 2.251}
Test metrics: {'eval_loss': 1.0343306064605713, 'eval_model_preparation_time': 0.0053, 'eval_accuracy': 1.0, 'eval_runtime': 3.4677, 'eval_samples_per_second': 64.884, 'eval_steps_per_second': 2.307}

Reloading best checkpoint for inference...
UTF-8 load failed: 'utf-8' codec can't decode byte 0x92 in position 13990: invalid start byte
Loaded 200 prompts from /content/test_prompts_orders.json
Wrote 200 targets to /content/saved_models_2

=== HQ Order Prediction (type 'exit' to quit) ===



KeyboardInterrupt: Interrupted by user

In [None]:
import os
import json
import torch
from torch.utils.data import Dataset, DataLoader

# Windows paths
input_json = "/content/test_prompts_orders.json"
output_txt = "/content/saved_models"

def load_json_robust(path):
    # Try UTF-8 first (with BOM strip)
    try:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        if text and text[0] == "\ufeff":
            text = text.lstrip("\ufeff")
        return json.loads(text)
    except Exception as e_utf8:
        print(f"UTF-8 load failed: {e_utf8}")

    # Try cp1252 (Windows-1252)
    try:
        with open(path, "r", encoding="cp1252", errors="strict") as f:
            text = f.read()
        if text and text[0] == "\ufeff":
            text = text.lstrip("\ufeff")
        return json.loads(text)
    except Exception as e_cp:
        print(f"cp1252 load failed: {e_cp}")

    # Try JSON Lines with UTF-8
    items = []
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                items.append(json.loads(line))
        if items:
            print("Loaded as UTF-8 JSONL")
            return items
    except Exception as e_jl_utf8:
        print(f"UTF-8 JSONL failed: {e_jl_utf8}")

    # Try JSON Lines with cp1252
    try:
        with open(path, "r", encoding="cp1252", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                items.append(json.loads(line))
        if items:
            print("Loaded as cp1252 JSONL")
            return items
    except Exception as e_jl_cp:
        print(f"cp1252 JSONL failed: {e_jl_cp}")

    raise RuntimeError(f"Could not parse JSON at {path} with UTF-8/cp1252 or JSONL.")

class PromptOnlyDataset(Dataset):
    def __init__(self, prompts, tokenizer, max_length=128):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.prompts)
    def __getitem__(self, idx):
        text = self.prompts[idx]
        enc = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {k: v.squeeze(0) for k, v in enc.items()}

def write_targets_for_orders_json(json_path, output_path, batch_size=64):
    data = load_json_robust(json_path)

    # Support dict-wrapped formats
    if isinstance(data, dict):
        for k in ("items", "data", "orders"):
            if k in data and isinstance(data[k], list):
                data = data[k]
                break

    prompts = [d["prompt"] for d in data if isinstance(d, dict) and "prompt" in d]
    print(f"Loaded {len(prompts)} prompts from {json_path}")

    ds = PromptOnlyDataset(prompts, tokenizer, max_length=128)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False)

    model_target.eval()
    pred_labels = []
    with torch.no_grad():
        for batch in dl:
            inputs = {k: v.to(device) for k, v in batch.items()}
            logits = model_target(**inputs).logits
            pred_idx = torch.argmax(logits, dim=1).cpu().tolist()
            pred_labels.extend([idx2label[i] for i in pred_idx])

    with open(output_path, "w", encoding="utf-8") as f:
        for lbl in pred_labels:
            f.write(f"{lbl}\n")

    print(f"Wrote {len(pred_labels)} targets to {output_path}")

# Run once to create the output file
write_targets_for_orders_json(input_json, output_txt, batch_size=64)