# **Cell 1 – Install + imports**

In [1]:
!pip install -q sentence-transformers transformers datasets accelerate evaluate

import os
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sentence_transformers import CrossEncoder, InputExample
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"   # stop wandb interactive prompts
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# **Cell 2 – Load raw data + basic cleaning (re-use your logic)**

In [5]:
# === Upload data manually from local PC ===

from google.colab import files
import pandas as pd
import ast

print("📤 Please upload train.csv from your computer...")
uploaded = files.upload()  # This will open a file picker

# Retrieve the filename you uploaded
train_filename = list(uploaded.keys())[0]
print("📄 Uploaded file:", train_filename)



📤 Please upload train.csv from your computer...


Saving train.csv to train.csv
📄 Uploaded file: train.csv


# **Cell 4 – Train/val split for advanced models**

In [6]:
# === Cell: Load + clean + build pair_df + truncate + split (CORRECT ORDER) ===
import pandas as pd
import ast
import re
from sklearn.model_selection import train_test_split

df = pd.read_csv(train_filename)
print("Raw shape:", df.shape)

def clean_text_field(x):
    if isinstance(x, list):
        return " ".join([str(t) for t in x])
    if isinstance(x, str):
        s = x.strip()
        if s.startswith("[") and s.endswith("]"):
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, list):
                    return " ".join([str(t) for t in parsed])
            except Exception:
                return s
        return s
    return str(x)

def remove_surrogates(text):
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'[\ud800-\udfff]', '', text)

# Clean columns
for col in ["prompt", "response_a", "response_b"]:
    df[col] = df[col].fillna("").apply(clean_text_field).apply(remove_surrogates)

# Keep only non-tie labeled rows
def compute_winner(row):
    if row.get("winner_model_a", 0) == 1: return 1
    if row.get("winner_model_b", 0) == 1: return 0
    if row.get("winner_tie", 0) == 1:     return -1
    return -1

df["winner"] = df.apply(compute_winner, axis=1)
df = df[df["winner"] != -1].copy()
print("After removing ties:", df.shape)

# Build pair_df
rows = []
for _, r in df.iterrows():
    rows.append({"prompt": r["prompt"], "response": r["response_a"], "label": 1 if r["winner_model_a"] == 1 else 0})
    rows.append({"prompt": r["prompt"], "response": r["response_b"], "label": 1 if r["winner_model_b"] == 1 else 0})

pair_df = pd.DataFrame(rows)
pair_df["prompt"] = pair_df["prompt"].astype(str).apply(remove_surrogates)
pair_df["response"] = pair_df["response"].astype(str).apply(remove_surrogates)

# ✅ Truncate BEFORE split
MAX_CHARS_PROMPT = 600
MAX_CHARS_RESPONSE = 1200
pair_df["prompt"] = pair_df["prompt"].str.slice(0, MAX_CHARS_PROMPT)
pair_df["response"] = pair_df["response"].str.slice(0, MAX_CHARS_RESPONSE)

print("Pairwise dataset:", pair_df.shape)
print("Label distribution:\n", pair_df["label"].value_counts(normalize=True))

# ✅ Split AFTER truncation
train_df, val_df = train_test_split(
    pair_df,
    test_size=0.2,
    random_state=42,
    stratify=pair_df["label"]
)

print("Train:", train_df.shape, "| Val:", val_df.shape)


Raw shape: (57477, 9)


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m


After removing ties: (39716, 10)
Pairwise dataset: (79432, 3)
Label distribution:
 label
1    0.5
0    0.5
Name: proportion, dtype: float64
Train: (63545, 3) | Val: (15887, 3)


In [7]:
import os
os.makedirs("data/processed", exist_ok=True)
pair_df.to_csv("data/processed/pairwise_train.csv", index=False)
print("✅ Saved:", "data/processed/pairwise_train.csv")


✅ Saved: data/processed/pairwise_train.csv


# **Part 1 – Cross-Encoder (SentenceTransformers)**
# **Cell 5 – Prepare training samples**

In [10]:
train_samples = [
    InputExample(
        texts=[row.prompt, row.response],
        label=float(row.label)
    )
    for row in train_df.itertuples()
]

val_samples = [
    InputExample(
        texts=[row.prompt, row.response],
        label=float(row.label)
    )
    for row in val_df.itertuples()
]

len(train_samples), len(val_samples)


(63545, 15887)

# **Cell 6 – Define & train Cross-Encoder**

In [11]:
# **Cell 6 – Define & train Cross-Encoder (compatible with your version)**

import math
from torch.utils.data import DataLoader

cross_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
cross_encoder = CrossEncoder(cross_model_name, num_labels=1)

BATCH_SIZE = 16
EPOCHS = 2

# Create DataLoader (collate_fn handles smart batching)
train_dataloader = DataLoader(
    train_samples,
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=cross_encoder.smart_batching_collate
)

# Warmup: ~10% of total steps
total_steps = math.ceil(len(train_samples) / BATCH_SIZE) * EPOCHS
warmup_steps = int(0.1 * total_steps)

cross_encoder.fit(
    train_dataloader=train_dataloader,
    evaluator=None,              # we evaluate manually in Cell 7
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    show_progress_bar=True
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss
500,1.3348
1000,0.7023
1500,0.6916
2000,0.6839
2500,0.6812
3000,0.6813
3500,0.681
4000,0.6774
4500,0.6709
5000,0.673


# **Cell 7 – Evaluate Cross-Encoder**

In [12]:
# Get scores on validation set
val_texts = [(r.prompt, r.response) for r in val_df.itertuples()]
val_labels = val_df["label"].values

val_scores = cross_encoder.predict(val_texts)
val_preds = (val_scores >= 0.5).astype(int)

acc = accuracy_score(val_labels, val_preds)
f1 = f1_score(val_labels, val_preds)
try:
    roc = roc_auc_score(val_labels, val_scores)
except Exception:
    roc = np.nan

print(f"Cross-Encoder - Acc: {acc:.4f} | F1: {f1:.4f} | ROC-AUC: {roc:.4f}")


Cross-Encoder - Acc: 0.5307 | F1: 0.2409 | ROC-AUC: 0.6025


In [13]:
CROSS_MODEL_DIR = os.path.join(os.getcwd(), "processors", "models", "cross_encoder")
os.makedirs(CROSS_MODEL_DIR, exist_ok=True)
cross_encoder.save(CROSS_MODEL_DIR)
print("✅ Cross-encoder saved to", CROSS_MODEL_DIR)


✅ Cross-encoder saved to /content/processors/models/cross_encoder


# **Part 2 – Transformer Fine-Tuning (BERT / DeBERTa)**
# **Cell 8 – Build a HuggingFace Dataset**

In [14]:
def build_text(row):
    return (
        "PROMPT: " + row["prompt"] +
        " RESPONSE: " + row["response"]
    )

train_texts = train_df.apply(build_text, axis=1).tolist()
val_texts = val_df.apply(build_text, axis=1).tolist()
train_labels = train_df["label"].tolist()
val_labels = val_df["label"].tolist()

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

train_dataset, val_dataset


(Dataset({
     features: ['text', 'label'],
     num_rows: 63545
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 15887
 }))

# **Cell 9 – Tokenizer + model**

In [15]:
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Cell 10 – Tokenization function + map**

In [16]:
def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

train_tokenized = train_dataset.map(tokenize_fn, batched=True)
val_tokenized = val_dataset.map(tokenize_fn, batched=True)

# HuggingFace Trainer expects these columns:
train_tokenized = train_tokenized.remove_columns(["text"])
val_tokenized = val_tokenized.remove_columns(["text"])

train_tokenized.set_format("torch")
val_tokenized.set_format("torch")


Map:   0%|          | 0/63545 [00:00<?, ? examples/s]

Map:   0%|          | 0/15887 [00:00<?, ? examples/s]

# **Cell 11 – Metrics function**

In [17]:
import evaluate
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = metric_acc.compute(predictions=preds, references=labels)["accuracy"]
    f1 = metric_f1.compute(predictions=preds, references=labels)["f1"]
    return {"accuracy": acc, "f1": f1}


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

# **Cell 12 – TrainingArguments + Trainer**

In [18]:
output_dir = "./transformer_llm_pref"

training_args = TrainingArguments(
    output_dir=output_dir,

    # ⚠️ OLD transformers compatibility
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,

    load_best_model_at_end=True,
    metric_for_best_model="f1",

    logging_steps=50,
    report_to="none",   # ⛔ disable wandb prompts
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6733,0.675202,0.562535,0.608759
2,0.6764,0.676878,0.567004,0.601933


TrainOutput(global_step=15888, training_loss=0.6766552243227925, metrics={'train_runtime': 3127.2833, 'train_samples_per_second': 40.639, 'train_steps_per_second': 5.08, 'total_flos': 8417640847595520.0, 'train_loss': 0.6766552243227925, 'epoch': 2.0})

# **Cell 13 – Evaluation & save**

In [19]:
metrics = trainer.evaluate()
print(metrics)

TRANSFORMER_DIR = os.path.join(os.getcwd(), "processors", "models", "transformer_classifier")
trainer.save_model(TRANSFORMER_DIR)
tokenizer.save_pretrained(TRANSFORMER_DIR)
print("✅ Transformer classifier saved to", TRANSFORMER_DIR)


{'eval_loss': 0.6752023100852966, 'eval_accuracy': 0.5625354063070435, 'eval_f1': 0.6087592884485477, 'eval_runtime': 105.982, 'eval_samples_per_second': 149.903, 'eval_steps_per_second': 9.37, 'epoch': 2.0}
✅ Transformer classifier saved to /content/processors/models/transformer_classifier


In [20]:
!zip -r llm_mlops_ready.zip data/processed processors/models notebooks -x "*ipynb_checkpoints*"
from google.colab import files
files.download("llm_mlops_ready.zip")

  adding: data/processed/ (stored 0%)
  adding: data/processed/pairwise_train.csv (deflated 67%)
  adding: processors/models/ (stored 0%)
  adding: processors/models/transformer_classifier/ (stored 0%)
  adding: processors/models/transformer_classifier/training_args.bin (deflated 53%)
  adding: processors/models/transformer_classifier/tokenizer.json (deflated 71%)
  adding: processors/models/transformer_classifier/vocab.txt (deflated 53%)
  adding: processors/models/transformer_classifier/special_tokens_map.json (deflated 42%)
  adding: processors/models/transformer_classifier/config.json (deflated 45%)
  adding: processors/models/transformer_classifier/model.safetensors (deflated 8%)
  adding: processors/models/transformer_classifier/tokenizer_config.json (deflated 75%)
  adding: processors/models/cross_encoder/ (stored 0%)
  adding: processors/models/cross_encoder/README.md (deflated 76%)
  adding: processors/models/cross_encoder/tokenizer.json (deflated 71%)
  adding: processors/mod

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>