In [1]:
# Set this to True when you want to train, False when you just want to submit
TRAIN_MODEL = False

In [2]:
import json
import re
import pandas as pd
import numpy as np
import torch
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from scipy.sparse import csr_matrix, hstack
from scipy.special import softmax

2026-01-27 21:19:19.569755: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769548759.742183      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769548759.791492      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769548760.199705      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769548760.199741      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769548760.199743      24 computation_placer.cc:177] computation placer alr

**JSON & Unicode:** json.loads() is the "magic bullet" here. It removes the [" "] brackets and automatically converts \u0411 into the correct Russian/Arabic/Chinese and other language characters in one step.

**Code Preservation:** By using isprintable() and only collapsing excessive newlines, we ensure that C, Rust, and Python code blocks keep their indentation and logic.

**HTML Safety:** No "strip HTML" step here. Because, if the prompt is about HTML, we want to keep those tags. If thre are html tags that aren't part of the content, we can add a specific rule for that.

**Target Simplification:** Converting the three winner_ columns into a single target (0, 1, 2) makes it much easier to use with DeBERTa or XGBoost later.

In [3]:
# 1. Load the data
df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')

def clean_text(text):
    if pd.isna(text):
        return ""
    
    # Step A: Handle the [" "] wrapping (JSON format)
    try:
        # json.loads automatically handles \uXXXX unicode decoding
        data = json.loads(text)
        if isinstance(data, list):
            text = " ".join(data) # Join conversation turns with a space
        else:
            text = str(data)
    except Exception:
        # Fallback if it's not valid JSON
        pass

    # Step B: Remove non-printable control characters (keep newlines and tabs)
    text = "".join(ch for ch in text if ch.isprintable() or ch in "\n\t")

    # Step C: Standardize Whitespace
    # Convert tabs to 4 spaces (common in code)
    text = text.replace("\t", "    ")
    # Collapse 3+ newlines into 2 to keep structure but remove "empty space" noise
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    return text.strip()

# 2. Apply cleaning to the main columns
print("Cleaning text columns... this may take a minute.")
df['prompt'] = df['prompt'].apply(clean_text)
df['response_a'] = df['response_a'].apply(clean_text)
df['response_b'] = df['response_b'].apply(clean_text)

# 3. Create the "Key Feature": Length Difference
# We use character length as a proxy for verbosity
df['len_a'] = df['response_a'].str.len()
df['len_b'] = df['response_b'].str.len()
df['len_diff'] = df['len_a'] - df['len_b']

# 4. Create a single Target column for easier modeling
# 0: Model A wins, 1: Model B wins, 2: Tie
def determine_winner(row):
    if row['winner_model_a'] == 1: return 0
    if row['winner_model_b'] == 1: return 1
    return 2

df['target'] = df.apply(determine_winner, axis=1)

# 5. Quick EDA on the "Verbosity Bias"
print("\n--- Verbosity Bias Analysis ---")
a_wins_longer = df[(df['target'] == 0) & (df['len_a'] > df['len_b'])].shape[0]
a_wins_total = df[df['target'] == 0].shape[0]
print(f"When Model A wins, it was longer {100 * a_wins_longer / a_wins_total:.2f}% of the time.")

b_wins_longer = df[(df['target'] == 1) & (df['len_b'] > df['len_a'])].shape[0]
b_wins_total = df[df['target'] == 1].shape[0]
print(f"When Model B wins, it was longer {100 * b_wins_longer / b_wins_total:.2f}% of the time.")

# 6. Preview the cleaned data
print("\n--- Cleaned Data Preview ---")
print(df[['prompt', 'response_a', 'len_diff', 'target']].head())

Cleaning text columns... this may take a minute.

--- Verbosity Bias Analysis ---
When Model A wins, it was longer 61.34% of the time.
When Model B wins, it was longer 61.59% of the time.

--- Cleaned Data Preview ---
                                              prompt  \
0  Is it morally right to try to have a certain p...   
1  What is the difference between marriage licens...   
2  explain function calling. how would you call a...   
3  How can I create a test set for a very rare ca...   
4  What is the best way to travel from Tel-Aviv t...   

                                          response_a  len_diff  target  
0  The question of whether it is morally right to...      3259       0  
1  A marriage license is a legal document that al...      -496       1  
2  Function calling is the process of invoking or...      -906       2  
3  Creating a test set for a very rare category c...      1615       0  
4  The best way to travel from Tel Aviv to Jerusa...       530       1  


The above results are very revealing! A 61%+ win rate for the longer response is a massive signal in machine learning. It confirms that "Verbosity Bias" is a dominant factor in this dataset.

Here is how to interpret the results and what we should do next to find the "Positional Bias" and start modeling.

**1. Interpreting your "Verbosity Bias"**

In a perfectly unbiased world, the longer response should win about 50% of the time. The fact that it wins 61% of the time means:

Humans (the judges) are significantly swayed by detail and length.

**Your Strategy:** Any model we build must include response length as a feature. Even a simple model that always picks the longer response would likely beat a random guess.

**2. How to check for "Positional Bias"**
Positional bias is the tendency to pick "Response A" just because it is the first one the human reads. To check for this, we look at the overall win rates for A vs. B.

In [4]:
# Calculate overall win rates to check positional bias
total_rows = len(df)
a_wins = df[df['target'] == 0].shape[0]
b_wins = df[df['target'] == 1].shape[0]
ties = df[df['target'] == 2].shape[0]

print(f"Model A Win Rate: {100 * a_wins / total_rows:.2f}%")
print(f"Model B Win Rate: {100 * b_wins / total_rows:.2f}%")
print(f"Tie Rate: {100 * ties / total_rows:.2f}%")

Model A Win Rate: 34.91%
Model B Win Rate: 34.19%
Tie Rate: 30.90%


Since the Win Rates are roughly **equal** (e.g., ~34% vs !34%), then positional bias is low or non-existent in this specific dataset.

Now that our data is clean and we understand the biases, it’s time to build our Baseline Model.

We will use Logistic Regression (a very simple model) using only 3 features: len_a, len_b, and len_diff. This will tell us: "How much of this competition can be solved just by looking at the length of the text?"

In [5]:
# Step 1: Prepare the Features

# Select simple features
X = df[['len_a', 'len_b', 'len_diff']]
y = df['target']

# Split into Training (80%) and Validation (20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Step 2: Train the Baseline

# Initialize and train
model = LogisticRegression(multi_class='multinomial')
model.fit(X_train, y_train)

# Predict probabilities (Required for Log Loss)
probs = model.predict_proba(X_val)

# Calculate Log Loss
score = log_loss(y_val, probs)
print(f"Baseline Log Loss (Length Only): {score:.4f}")



Baseline Log Loss (Length Only): 1.0695


**Why to train the Baseline?**

**The "Floor":** This Log Loss score is our "Floor." Any fancy LLM (like DeBERTa) we build later must get a lower score than this. If it doesn't, our LLM isn't actually "reading" the text; it's just guessing.

**Feature Importance:** If this simple model gets a surprisingly good score (e.g., a Log Loss of 1.0 or lower), we know that the competition is heavily decided by formatting and length rather than deep logic.

A score of 1.0695 is officially better than random guessing. This confirms that even without reading a single word, our model has learned to use "length" to make better-than-random predictions.

Here is the explanation of that "magic number" and how the math works.

**What does "1.098" mean? (The Guessing Baseline)**

In this competition, we have 3 possible outcomes: Model A wins, Model B wins, or it's a Tie.

If you had no model at all and just guessed a 33.3% probability for each class for every single row, our Log Loss would be exactly 1.0986.

**Conclusion:** Our model is "smarter" than a random guesser. It has shifted its probabilities slightly toward the correct answers based on the length of the responses.

Moving from "length" to TF-IDF is a major jump. This is the first time our model will actually "read" the vocabulary used by the chatbots.

**What is TF-IDF doing here?**

TF-IDF (Term Frequency-Inverse Document Frequency) identifies which words are unique or important to a specific response. For example:

If Response A uses words like "accurate," "detailed," and "verified," while Response B uses "sorry," "cannot," and "error," TF-IDF will give those words high scores.
The model will then learn that "sorry" is a strong signal for losing, while "detailed" is a signal for winning.

**The Strategy: The "Vector Difference" Trick**

Since we are comparing two responses, we don't just want to know what words are in A. We want to know what words are in A that are NOT in B.

We turn Response A into a vector of numbers.
We turn Response B into a vector of numbers.
We subtract them: X = Vector_A - Vector_B.
This "Difference Vector" tells the model exactly what the content gap is between the two bots.

**TF-IDF Implementation Code Below:**

In [7]:
# 1. Prepare the text
# We combine prompt and responses to let the Vectorizer see all possible words
all_text = pd.concat([df['prompt'], df['response_a'], df['response_b']])

# 2. Initialize TF-IDF
# max_features=5000 keeps the model fast and prevents memory errors
# ngram_range=(1,2) lets the model see phrases like "I am sorry" or "very helpful"
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
tfidf.fit(all_text)

# 3. Transform the columns
X_a = tfidf.transform(df['response_a'])
X_b = tfidf.transform(df['response_b'])

# 4. Create the "Difference Vector" (A minus B)
# This captures what makes A different from B
X_diff = X_a - X_b

# 5. Combine with our previous "Length" features
# hstack combines the text math with our previous length numbers
X_final = hstack([X_diff, csr_matrix(df[['len_a', 'len_b', 'len_diff']].values)])

# 6. Train and Evaluate
X_train, X_val, y_train, y_val = train_test_split(X_final, df['target'], test_size=0.2, random_state=42)

model_tfidf = LogisticRegression(max_iter=1000) 
model_tfidf.fit(X_train, y_train)

# Predict and score
probs_tfidf = model_tfidf.predict_proba(X_val)
new_score = log_loss(y_val, probs_tfidf)

print(f"New TF-IDF Log Loss: {new_score:.4f}")
print(f"Improvement over Length Baseline: {score - new_score:.4f}")

New TF-IDF Log Loss: 1.0889
Improvement over Length Baseline: -0.0194


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Score actually got worse (higher Log Loss) when I added the 5,000 TF-IDF features.


**Why did the score get worse?**

**Noise vs. Signal:** By adding 5,000 word-based columns, I introduced a lot of "noise." Logistic Regression tried to find patterns in those words, but because it's a linear model, it likely got "confused" by rare words that appeared in the training set but didn't actually help predict a winner.
**The "Length" Signal is Overwhelmed:** The length features (which we know are strong) represent only 3 out of 5,003 columns. The model might be giving too much weight to the 5,000 weak word features and "forgetting" the importance of the 3 strong length features.

**The Solution: Tier 2 - Gradient Boosting (XGBoost)**

It is time to bring in the heavy hitter. XGBoost is much better than Logistic Regression for this because:


**Feature Selection:** XGBoost automatically ignores features that don't help. It will likely ignore 4,900 of those TF-IDF words and focus on the ones that actually matter.
**Non-Linearity:** It can understand that "Length matters, but only if the response also contains certain keywords."

We will also add "Human-Centric Features" to help the model understand quality.

In [8]:
# --- STEP 1: Feature Engineering ---
df['word_count_a'] = df['response_a'].apply(lambda x: len(x.split()))
df['word_count_b'] = df['response_b'].apply(lambda x: len(x.split()))
df['unique_words_a'] = df['response_a'].apply(lambda x: len(set(x.split())))
df['unique_words_b'] = df['response_b'].apply(lambda x: len(set(x.split())))

df['has_list_a'] = df['response_a'].str.contains(r'\*|\d\.').astype(int)
df['has_list_b'] = df['response_b'].str.contains(r'\*|\d\.').astype(int)

meta_features = [
    'len_a', 'len_b', 'len_diff', 
    'word_count_a', 'word_count_b', 
    'unique_words_a', 'unique_words_b',
    'has_list_a', 'has_list_b'
]

# --- STEP 2: Prepare Feature Matrix ---
# Glue TF-IDF (X_diff) and Meta Features together
X_final = hstack([X_diff, csr_matrix(df[meta_features].values)])

# --- STEP 3: RE-CREATE ALIGNED SPLIT ---
# 1. Add a manual ID to the dataframe to act as a breadcrumb
df['orig_id'] = np.arange(len(df))

# Rename column for Trainer compatibility
if 'target' in df.columns:
    df = df.rename(columns={'target': 'labels'})

# 2. Re-create the HF Dataset and split with the EXACT same seed (42)
# This ensures the rows chosen for 'test' match your DeBERTa training run
dataset = Dataset.from_pandas(df[['prompt', 'response_a', 'response_b', 'labels', 'orig_id']])
train_val = dataset.train_test_split(test_size=0.2, seed=42)

# 3. Extract the indices that the split chose
train_indices = train_val["train"]['orig_id']
val_indices = train_val["test"]['orig_id']

# 4. Slice the feature matrix and labels using these indices
X_train_aligned = X_final.tocsr()[train_indices]
X_val_aligned = X_final.tocsr()[val_indices]

y_train_aligned = df.loc[train_indices, 'labels']
y_val_aligned = df.loc[val_indices, 'labels']

print(f"Data Aligned: {X_train_aligned.shape[0]} training samples, {X_val_aligned.shape[0]} validation samples.")

# --- STEP 4: Initialize and Train XGBoost ---
model_xgb = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    objective='multi:softprob',
    num_class=3,
    tree_method='hist', 
    device='cuda', # Fast training on GPU
    random_state=42
)

print("Training Aligned XGBoost...")
model_xgb.fit(X_train_aligned, y_train_aligned)

# --- STEP 5: Predict and Score ---
probs_xgb = model_xgb.predict_proba(X_val_aligned)
xgb_score = log_loss(y_val_aligned, probs_xgb)

print(f"Aligned XGBoost Log Loss: {xgb_score:.4f}")

# --- STEP 6: Final Alignment Check ---
# Verification: Do the labels in our XGBoost validation set match the DeBERTa validation set?
deberta_labels = train_val["test"]['labels']
if np.array_equal(y_val_aligned.values, deberta_labels):
    print("✅ SUCCESS: XGBoost and DeBERTa are perfectly aligned. You can now blend.")
else:
    print("❌ ERROR: Alignment failed. Check if the seed or dataframe was changed.")

Data Aligned: 45981 training samples, 11496 validation samples.
Training Aligned XGBoost...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Aligned XGBoost Log Loss: 1.0380
✅ SUCCESS: XGBoost and DeBERTa are perfectly aligned. You can now blend.


**The Next Level: Tier 3 - DeBERTa-v3 (The Gold Standard)**
To get the score below 1.00 (and eventually toward 0.90 or lower), we need a model that actually understands language. This is where DeBERTa-v3 comes in.

**What makes DeBERTa-v3 different?**
Unlike XGBoost, which looks at a table of numbers, DeBERTa "reads" the text using Self-Attention. It looks at every word in the context of every other word.

**The "Cross-Encoder" Strategy**
In this competition, the most powerful way to use DeBERTa is as a Cross-Encoder. Instead of looking at Response A and Response B separately, we feed them to the model together so it can compare them side-by-side.

**We format the input like this:**
[CLS] Prompt [SEP] Response A [SEP] Response B [SEP]

The model then uses its attention mechanism to "look back and forth" between A and B to decide which one answers the Prompt better.

**How to implement this (High-Level)**
Since DeBERTa is a Deep Learning model, the code is different from Scikit-Learn. We will use the transformers library.

**The Plan:**
**Tokenization:** Convert text into "tokens" (numbers that represent word pieces).
**Model Loading:** Load a pre-trained microsoft/deberta-v3-small or base.
**Fine-Tuning:** Train the model for 1 or 2 "epochs" (passes through the data).

*Note:* This will require switching Kaggle Notebook's "Accelerator" to GPU T4 x2 or P100, as training this on a CPU is impossible.

We’ve made a significant breakthrough. Moving from baseline score of 1.0588 to 1.0301 is a huge jump.

**We have successfully proven two things:**
1. **XGBoost is superior** to Logistic Regression here because it can handle the 5,000 TF-IDF features without getting "confused" by the noise.
2. **Formatting matters:** By adding the "has_list" and "word_count" features, you gave the model a way to judge "quality" that simple character counting couldn't capture.

Why are we still at 1.03? (The "Glass Ceiling")

Even though 1.03 is good, it's likely still far from the top of the leaderboard. Here is why:

**TF-IDF is "Blind" to Logic:** TF-IDF knows that the word "not" and "good" are present, but it doesn't really understand the difference between "This is good, not bad" and "This is bad, not good."
**No Semantic Understanding:** If one bot is polite and the other is rude, or if one bot gives a factually wrong answer, TF-IDF can't easily detect that.

In [9]:
# ==========================================
# 1. CONFIGURATION
# ==========================================
MODEL_NAME = "microsoft/deberta-v3-small"
CHECKPOINT_PATH = "/kaggle/input/k/prabhaharan/llm-classification-finetuning/results/checkpoint-4311"

# ==========================================
# 2. DATA PREPARATION (Common to both modes)
# ==========================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    inputs = [f"{p} [SEP] {a} [SEP] {b}" for p, a, b in zip(examples['prompt'], examples['response_a'], examples['response_b'])]
    return tokenizer(inputs, truncation=True, padding='max_length', max_length=1024)

# Tokenize and Split
tokenized_ds = dataset.map(preprocess_function, batched=True)
train_val = tokenized_ds.train_test_split(test_size=0.2, seed=42)

# ==========================================
# 3. MODEL & TRAINER SETUP
# ==========================================
if TRAIN_MODEL:
    print("--- MODE: TRAINING ---")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
    
    args = TrainingArguments(
        output_dir="./results",
        learning_rate=1e-5,      
        per_device_train_batch_size=4, 
        per_device_eval_batch_size=4,
        num_train_epochs=3,      
        weight_decay=0.01,
        eval_strategy="epoch",    
        save_strategy="epoch",
        load_best_model_at_end=True,
        fp16=True,               
        gradient_accumulation_steps=4, 
        report_to="none"
    )
else:
    print("--- MODE: INFERENCE (Loading Checkpoint) ---")
    model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_PATH)
    
    args = TrainingArguments(
        output_dir="./temp",
        per_device_eval_batch_size=32, # Faster prediction
        fp16=True,
        report_to="none"
    )

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_val["train"],
    eval_dataset=train_val["test"],
    tokenizer=tokenizer,
)

# ==========================================
# 4. EXECUTION
# ==========================================
if TRAIN_MODEL:
    print("Starting DeBERTa training...")
    trainer.train()
    trainer.save_model("./results/final_model")
    metrics = trainer.evaluate()
    print(f"DeBERTa Validation Log Loss: {metrics['eval_loss']:.4f}")
else:
    print("Skipping training. Generating predictions for blending...")

# Always generate predictions on the validation set for the blending step
deberta_output = trainer.predict(train_val["test"])
deberta_probs = softmax(deberta_output.predictions, axis=1)

print("DeBERTa predictions ready for blending!")

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/57477 [00:00<?, ? examples/s]

--- MODE: INFERENCE (Loading Checkpoint) ---


  trainer = Trainer(


Skipping training. Generating predictions for blending...




DeBERTa predictions ready for blending!


In [10]:
# --- STEP 1: Tokenize the Aligned Validation Set ---
# The model needs 'input_ids' to make predictions
print("Tokenizing validation set for DeBERTa...")
def preprocess_function(examples):
    inputs = [f"{p} [SEP] {a} [SEP] {b}" for p, a, b in zip(examples['prompt'], examples['response_a'], examples['response_b'])]
    return tokenizer(inputs, truncation=True, padding='max_length', max_length=1024)

# Tokenize only the test (validation) portion
tokenized_val = train_val["test"].map(preprocess_function, batched=True)

# --- STEP 2: Get DeBERTa Probabilities ---
print("Generating DeBERTa predictions...")
deberta_output = trainer.predict(tokenized_val)
deberta_logits = deberta_output.predictions
deberta_probs = softmax(deberta_logits, axis=1)

# --- STEP 3: Get XGBoost Probabilities ---
# Use the ALIGNED validation features we created in the previous step
xgb_probs = model_xgb.predict_proba(X_val_aligned)

# --- STEP 4: Find the Best Blend ---
# We will test different weights to see which one is best
print("\n--- Searching for Optimal Blend ---")
best_score = 999
best_w = 0

for w in np.linspace(0, 1, 11): # Checks 0.0, 0.1, 0.2 ... 1.0
    blend = (w * xgb_probs) + ((1-w) * deberta_probs)
    score = log_loss(y_val_aligned, blend)
    print(f"XGB Weight {w:.1f}: Log Loss {score:.4f}")
    
    if score < best_score:
        best_score = score
        best_w = w

print(f"\n✅ Best Weight: {best_w:.1f} XGB / {1-best_w:.1f} DeBERTa")
print(f"✅ Final Blended Log Loss: {best_score:.4f}")

Tokenizing validation set for DeBERTa...


Map:   0%|          | 0/11496 [00:00<?, ? examples/s]

Generating DeBERTa predictions...



--- Searching for Optimal Blend ---
XGB Weight 0.0: Log Loss 1.0693
XGB Weight 0.1: Log Loss 1.0589
XGB Weight 0.2: Log Loss 1.0505
XGB Weight 0.3: Log Loss 1.0438
XGB Weight 0.4: Log Loss 1.0386
XGB Weight 0.5: Log Loss 1.0349
XGB Weight 0.6: Log Loss 1.0326
XGB Weight 0.7: Log Loss 1.0317
XGB Weight 0.8: Log Loss 1.0322
XGB Weight 0.9: Log Loss 1.0342
XGB Weight 1.0: Log Loss 1.0380

✅ Best Weight: 0.7 XGB / 0.3 DeBERTa
✅ Final Blended Log Loss: 1.0317




In [11]:
# 1. Load the Test Data
test_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

# 2. Apply the SAME cleaning function used for training
# (Assuming the 'clean_text' function we wrote earlier is still in memory)
print("Cleaning test data...")
test_df['prompt'] = test_df['prompt'].apply(clean_text)
test_df['response_a'] = test_df['response_a'].apply(clean_text)
test_df['response_b'] = test_df['response_b'].apply(clean_text)

# 3. Generate XGBoost Features for Test Set
test_df['len_a'] = test_df['response_a'].str.len()
test_df['len_b'] = test_df['response_b'].str.len()
test_df['len_diff'] = test_df['len_a'] - test_df['len_b']
test_df['word_count_a'] = test_df['response_a'].apply(lambda x: len(x.split()))
test_df['word_count_b'] = test_df['response_b'].apply(lambda x: len(x.split()))
test_df['unique_words_a'] = test_df['response_a'].apply(lambda x: len(set(x.split())))
test_df['unique_words_b'] = test_df['response_b'].apply(lambda x: len(set(x.split())))
test_df['has_list_a'] = test_df['response_a'].str.contains(r'\*|\d\.').astype(int)
test_df['has_list_b'] = test_df['response_b'].str.contains(r'\*|\d\.').astype(int)

# 4. Transform Text using the ALREADY FITTED TF-IDF
# Crucial: Use .transform(), NOT .fit_transform()!
X_test_a = tfidf.transform(test_df['response_a'])
X_test_b = tfidf.transform(test_df['response_b'])
X_test_diff = X_test_a - X_test_b

# Combine with meta features
X_test_final = hstack([X_test_diff, csr_matrix(test_df[meta_features].values)])

# 5. Get XGBoost Predictions
xgb_test_probs = model_xgb.predict_proba(X_test_final)

# 6. Get DeBERTa Predictions
# Convert to HF Dataset and Tokenize
test_ds = Dataset.from_pandas(test_df[['prompt', 'response_a', 'response_b']])
tokenized_test = test_ds.map(preprocess_function, batched=True)

print("Generating DeBERTa test predictions...")
deberta_test_output = trainer.predict(tokenized_test)
deberta_test_probs = softmax(deberta_test_output.predictions, axis=1)

# 7. Apply the Optimal Blend (0.7 XGB / 0.3 DeBERTa)
final_probs = (0.7 * xgb_test_probs) + (0.3 * deberta_test_probs)

# 8. Create Submission File
# The columns must be named exactly as specified in the competition
submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': final_probs[:, 0],
    'winner_model_b': final_probs[:, 1],
    'winner_tie': final_probs[:, 2]
})

submission.to_csv('submission.csv', index=False)
print("Submission file saved successfully!")

Cleaning test data...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Generating DeBERTa test predictions...


Submission file saved successfully!
