In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv
/kaggle/input/llm-classifier-finetuning-continued/__notebook__.ipynb
/kaggle/input/llm-classifier-finetuning-continued/__output__.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/spm.model
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/config.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/trainer_state.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/training_args.bin
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/tokenizer.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/tokenizer_config.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/scaler.pt
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint

In [2]:
# Set this to True when you want to train, False when you just want to submit
TRAIN_MODEL = True

In [3]:
import json
import re
import pandas as pd
import numpy as np
import torch
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from scipy.sparse import csr_matrix, hstack, vstack
from scipy.special import softmax

2026-02-06 05:05:19.188808: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770354319.654412      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770354319.803398      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770354320.751000      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770354320.751039      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770354320.751042      24 computation_placer.cc:177] computation placer alr

## Improvements Made in This Version:
1. **Data Augmentation (Swap Trick)** - Double training data by swapping A/B with flipped labels
2. **Better XGBoost Features** - Punctuation, URLs, LaTeX/Math, Capitalization ratio
3. **Upgraded DeBERTa** - Using deberta-v3-base instead of small
4. **Complete Submission Pipeline** - Ready for Kaggle submission

**JSON & Unicode:** json.loads() is the "magic bullet" here. It removes the [" "] brackets and automatically converts \u0411 into the correct Russian/Arabic/Chinese and other language characters in one step.

**Code Preservation:** By using isprintable() and only collapsing excessive newlines, we ensure that C, Rust, and Python code blocks keep their indentation and logic.

**HTML Safety:** No "strip HTML" step here. Because, if the prompt is about HTML, we want to keep those tags. If there are html tags that aren't part of the content, we can add a specific rule for that.

**Target Simplification:** Converting the three winner_ columns into a single target (0, 1, 2) makes it much easier to use with DeBERTa or XGBoost later.

In [4]:
# 1. Load the data
df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')

def clean_text(text):
    if pd.isna(text):
        return ""
    
    # Step A: Handle the [" "] wrapping (JSON format)
    try:
        data = json.loads(text)
        if isinstance(data, list):
            text = " ".join(data)
        else:
            text = str(data)
    except Exception:
        pass

    # Step B: Remove non-printable control characters (keep newlines and tabs)
    text = "".join(ch for ch in text if ch.isprintable() or ch in "\n\t")

    # Step C: Standardize Whitespace
    text = text.replace("\t", "    ")
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    return text.strip()

# 2. Apply cleaning to the main columns
print("Cleaning text columns... this may take a minute.")
df['prompt'] = df['prompt'].apply(clean_text)
df['response_a'] = df['response_a'].apply(clean_text)
df['response_b'] = df['response_b'].apply(clean_text)

# 3. Create the "Key Feature": Length Difference
df['len_a'] = df['response_a'].str.len()
df['len_b'] = df['response_b'].str.len()
df['len_diff'] = df['len_a'] - df['len_b']

# 4. Create a single Target column for easier modeling
def determine_winner(row):
    if row['winner_model_a'] == 1: return 0
    if row['winner_model_b'] == 1: return 1
    return 2

df['target'] = df.apply(determine_winner, axis=1)

# 5. Quick EDA on the "Verbosity Bias"
print("\n--- Verbosity Bias Analysis ---")
a_wins_longer = df[(df['target'] == 0) & (df['len_a'] > df['len_b'])].shape[0]
a_wins_total = df[df['target'] == 0].shape[0]
print(f"When Model A wins, it was longer {100 * a_wins_longer / a_wins_total:.2f}% of the time.")

b_wins_longer = df[(df['target'] == 1) & (df['len_b'] > df['len_a'])].shape[0]
b_wins_total = df[df['target'] == 1].shape[0]
print(f"When Model B wins, it was longer {100 * b_wins_longer / b_wins_total:.2f}% of the time.")

Cleaning text columns... this may take a minute.

--- Verbosity Bias Analysis ---
When Model A wins, it was longer 61.34% of the time.
When Model B wins, it was longer 61.59% of the time.


The above results are very revealing! A 61%+ win rate for the longer response is a massive signal in machine learning. It confirms that "Verbosity Bias" is a dominant factor in this dataset.

Here is how to interpret the results and what we should do next to find the "Positional Bias" and start modeling.

**1. Interpreting your "Verbosity Bias"**

In a perfectly unbiased world, the longer response should win about 50% of the time. The fact that it wins 61% of the time means:

Humans (the judges) are significantly swayed by detail and length.

**Your Strategy:** Any model we build must include response length as a feature. Even a simple model that always picks the longer response would likely beat a random guess.

**2. How to check for "Positional Bias"**
Positional bias is the tendency to pick "Response A" just because it is the first one the human reads. To check for this, we look at the overall win rates for A vs. B.

In [5]:
# Calculate overall win rates to check positional bias
total_rows = len(df)
a_wins = df[df['target'] == 0].shape[0]
b_wins = df[df['target'] == 1].shape[0]
ties = df[df['target'] == 2].shape[0]

print(f"Model A Win Rate: {100 * a_wins / total_rows:.2f}%")
print(f"Model B Win Rate: {100 * b_wins / total_rows:.2f}%")
print(f"Tie Rate: {100 * ties / total_rows:.2f}%")

Model A Win Rate: 34.91%
Model B Win Rate: 34.19%
Tie Rate: 30.90%


Since the Win Rates are roughly **equal** (e.g., ~34% vs ~34%), then positional bias is low or non-existent in this specific dataset.

Now that our data is clean and we understand the biases, it's time to build our models.

## STEP 1: Data Augmentation - The "Swap Trick"
This doubles the training data and forces the model to be position-invariant.

**Why it helps:**
- Doubles your training size for free
- Forces the model to ignore position
- Makes predictions more stable and symmetric

In [6]:
# ==========================================
# DATA AUGMENTATION: THE SWAP TRICK
# ==========================================
print("\n--- Applying Data Augmentation (Swap Trick) ---")

# Create a copy of the dataframe with swapped responses
df_swapped = df.copy()
df_swapped['response_a'] = df['response_b']
df_swapped['response_b'] = df['response_a']

# Swap the labels: 0 (A wins) -> 1 (B wins), 1 (B wins) -> 0 (A wins), 2 (Tie) stays the same
def swap_label(label):
    if label == 0: return 1
    if label == 1: return 0
    return 2

df_swapped['target'] = df_swapped['target'].apply(swap_label)

# Swap the length features as well
df_swapped['len_a'] = df['len_b']
df_swapped['len_b'] = df['len_a']
df_swapped['len_diff'] = -df['len_diff']

# Combine original and swapped data
df_augmented = pd.concat([df, df_swapped], ignore_index=True)
print(f"Original data size: {len(df)}")
print(f"Augmented data size: {len(df_augmented)} (2x)")


--- Applying Data Augmentation (Swap Trick) ---
Original data size: 57477
Augmented data size: 114954 (2x)


## STEP 2: Better Features for XGBoost
Adding punctuation counts, URL detection, LaTeX/Math detection, capitalization ratio

In [7]:
# ==========================================
# ENHANCED FEATURE ENGINEERING
# ==========================================
print("\n--- Engineering Enhanced Features ---")

def extract_features(df_input):
    """Extract all features for XGBoost - both original and new 'human-like' features"""
    df_feat = df_input.copy()
    
    # --- Original Features ---
    df_feat['len_a'] = df_feat['response_a'].str.len()
    df_feat['len_b'] = df_feat['response_b'].str.len()
    df_feat['len_diff'] = df_feat['len_a'] - df_feat['len_b']
    
    df_feat['word_count_a'] = df_feat['response_a'].apply(lambda x: len(str(x).split()))
    df_feat['word_count_b'] = df_feat['response_b'].apply(lambda x: len(str(x).split()))
    df_feat['word_diff'] = df_feat['word_count_a'] - df_feat['word_count_b']
    
    df_feat['unique_words_a'] = df_feat['response_a'].apply(lambda x: len(set(str(x).split())))
    df_feat['unique_words_b'] = df_feat['response_b'].apply(lambda x: len(set(str(x).split())))
    
    df_feat['has_list_a'] = df_feat['response_a'].str.contains(r'\*|\d\.', regex=True).astype(int)
    df_feat['has_list_b'] = df_feat['response_b'].str.contains(r'\*|\d\.', regex=True).astype(int)
    
    # --- NEW: Punctuation Count Features ---
    # Count exclamation marks, question marks, colons, semicolons
    df_feat['exclaim_a'] = df_feat['response_a'].str.count('!')
    df_feat['exclaim_b'] = df_feat['response_b'].str.count('!')
    df_feat['exclaim_diff'] = df_feat['exclaim_a'] - df_feat['exclaim_b']
    
    df_feat['question_a'] = df_feat['response_a'].str.count(r'\?')
    df_feat['question_b'] = df_feat['response_b'].str.count(r'\?')
    
    df_feat['colon_a'] = df_feat['response_a'].str.count(':')
    df_feat['colon_b'] = df_feat['response_b'].str.count(':')
    
    # --- NEW: URL/Link Detection ---
    # Humans love sources - check for http/https links
    url_pattern = r'https?://[^\s]+'
    df_feat['has_url_a'] = df_feat['response_a'].str.contains(url_pattern, regex=True).astype(int)
    df_feat['has_url_b'] = df_feat['response_b'].str.contains(url_pattern, regex=True).astype(int)
    df_feat['url_count_a'] = df_feat['response_a'].str.count(url_pattern)
    df_feat['url_count_b'] = df_feat['response_b'].str.count(url_pattern)
    
    # --- NEW: LaTeX/Math Detection ---
    # Check for $ signs (inline math) and $$ (display math)
    df_feat['has_latex_a'] = df_feat['response_a'].str.contains(r'\$', regex=True).astype(int)
    df_feat['has_latex_b'] = df_feat['response_b'].str.contains(r'\$', regex=True).astype(int)
    df_feat['latex_count_a'] = df_feat['response_a'].str.count(r'\$')
    df_feat['latex_count_b'] = df_feat['response_b'].str.count(r'\$')
    
    # --- NEW: Capitalization Ratio ---
    # High ratio might indicate "shouting" or emphasis
    def cap_ratio(text):
        text = str(text)
        if len(text) == 0:
            return 0
        upper_count = sum(1 for c in text if c.isupper())
        return upper_count / len(text)
    
    df_feat['cap_ratio_a'] = df_feat['response_a'].apply(cap_ratio)
    df_feat['cap_ratio_b'] = df_feat['response_b'].apply(cap_ratio)
    df_feat['cap_ratio_diff'] = df_feat['cap_ratio_a'] - df_feat['cap_ratio_b']
    
    # --- NEW: Code Block Detection ---
    # Check for code blocks (```)
    df_feat['has_code_a'] = df_feat['response_a'].str.contains(r'```', regex=True).astype(int)
    df_feat['has_code_b'] = df_feat['response_b'].str.contains(r'```', regex=True).astype(int)
    df_feat['code_blocks_a'] = df_feat['response_a'].str.count(r'```')
    df_feat['code_blocks_b'] = df_feat['response_b'].str.count(r'```')
    
    # --- NEW: Sentence Count (Proxy for Structure) ---
    df_feat['sentence_count_a'] = df_feat['response_a'].str.count(r'[.!?]+')
    df_feat['sentence_count_b'] = df_feat['response_b'].str.count(r'[.!?]+')
    
    # --- NEW: Newline Count (Formatting/Readability) ---
    df_feat['newline_a'] = df_feat['response_a'].str.count('\n')
    df_feat['newline_b'] = df_feat['response_b'].str.count('\n')
    
    return df_feat

# Apply feature extraction to augmented data
df_augmented = extract_features(df_augmented)

# Define the meta features list (expanded)
meta_features = [
    'len_a', 'len_b', 'len_diff', 
    'word_count_a', 'word_count_b', 'word_diff',
    'unique_words_a', 'unique_words_b',
    'has_list_a', 'has_list_b',
    # New features
    'exclaim_a', 'exclaim_b', 'exclaim_diff',
    'question_a', 'question_b',
    'colon_a', 'colon_b',
    'has_url_a', 'has_url_b', 'url_count_a', 'url_count_b',
    'has_latex_a', 'has_latex_b', 'latex_count_a', 'latex_count_b',
    'cap_ratio_a', 'cap_ratio_b', 'cap_ratio_diff',
    'has_code_a', 'has_code_b', 'code_blocks_a', 'code_blocks_b',
    'sentence_count_a', 'sentence_count_b',
    'newline_a', 'newline_b'
]

print(f"Total meta features: {len(meta_features)}")


--- Engineering Enhanced Features ---
Total meta features: 36


**What is TF-IDF doing here?**

TF-IDF (Term Frequency-Inverse Document Frequency) identifies which words are unique or important to a specific response. For example:

If Response A uses words like "accurate," "detailed," and "verified," while Response B uses "sorry," "cannot," and "error," TF-IDF will give those words high scores.
The model will then learn that "sorry" is a strong signal for losing, while "detailed" is a signal for winning.

**The Strategy: The "Vector Difference" Trick**

Since we are comparing two responses, we don't just want to know what words are in A. We want to know what words are in A that are NOT in B.

We turn Response A into a vector of numbers.
We turn Response B into a vector of numbers.
We subtract them: X = Vector_A - Vector_B.
This "Difference Vector" tells the model exactly what the content gap is between the two bots.

In [8]:
# ==========================================
# TF-IDF VECTORIZATION
# ================================================================
print("\n--- Building TF-IDF Features ---")

# Combine all text for fitting TF-IDF (use original df to avoid data leakage)
all_text = pd.concat([df['prompt'], df['response_a'], df['response_b']])

# Initialize TF-IDF with improved parameters
tfidf = TfidfVectorizer(
    max_features=5000, 
    ngram_range=(1, 2), 
    stop_words='english',
    min_df=3,  # Ignore rare terms
    max_df=0.95  # Ignore too common terms
)
tfidf.fit(all_text)

# Transform the augmented data
X_a = tfidf.transform(df_augmented['response_a'])
X_b = tfidf.transform(df_augmented['response_b'])
X_diff = X_a - X_b

# Combine TF-IDF with meta features
X_final = hstack([X_diff, csr_matrix(df_augmented[meta_features].values)])

print(f"Final feature matrix shape: {X_final.shape}")


--- Building TF-IDF Features ---
Final feature matrix shape: (114954, 5036)


In [9]:
# ==========================================
# TRAIN/VAL SPLIT (Using Original Data for Validation)
# ==========================================
print("\n--- Creating Train/Val Split ---")

# Important: We use ONLY original data indices for validation
# to get a fair score estimate (no augmented data in validation)
original_size = len(df)
augmented_size = len(df_augmented)

# Add orig_id for alignment
df_augmented['orig_id'] = np.arange(len(df_augmented))

# Rename target to labels for Trainer compatibility
df_augmented = df_augmented.rename(columns={'target': 'labels'})

# Split: Use original data for validation (indices 0 to original_size-1)
# We'll use 20% of original data for validation
np.random.seed(42)
original_indices = np.arange(original_size)
np.random.shuffle(original_indices)

val_size = int(0.2 * original_size)
val_indices = original_indices[:val_size]
train_indices_original = original_indices[val_size:]

# For training, use both original training indices AND their augmented counterparts
# Augmented versions of original data are at indices original_size + original_index
train_indices_augmented = train_indices_original + original_size
train_indices = np.concatenate([train_indices_original, train_indices_augmented])

# Create splits
X_train = X_final.tocsr()[train_indices]
X_val = X_final.tocsr()[val_indices]

y_train = df_augmented.loc[train_indices, 'labels'].values
y_val = df_augmented.loc[val_indices, 'labels'].values

print(f"Training samples: {len(train_indices)} (augmented)")
print(f"Validation samples: {len(val_indices)} (original only)")


--- Creating Train/Val Split ---
Training samples: 91964 (augmented)
Validation samples: 11495 (original only)


**Why XGBoost instead of Logistic Regression?**

XGBoost is much better than Logistic Regression for this because:

**Feature Selection:** XGBoost automatically ignores features that don't help. It will likely ignore many of those TF-IDF words and focus on the ones that actually matter.

**Non-Linearity:** It can understand that "Length matters, but only if the response also contains certain keywords."

We've also added "Human-Centric Features" (punctuation, URLs, LaTeX, capitalization) to help the model understand quality signals that humans look for.

In [10]:
# ==========================================
# TRAIN XGBOOST
# ================================================================
print("\n--- Training XGBoost ---")

model_xgb = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    objective='multi:softprob',
    num_class=3,
    tree_method='hist', 
    device='cuda',
    random_state=42,
    # Additional regularization
    reg_alpha=0.1,
    reg_lambda=1.0,
    subsample=0.8,
    colsample_bytree=0.8
)

model_xgb.fit(X_train, y_train)

# Evaluate on validation
probs_xgb = model_xgb.predict_proba(X_val)
xgb_score = log_loss(y_val, probs_xgb)
print(f"XGBoost Log Loss: {xgb_score:.4f}")


--- Training XGBoost ---


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


XGBoost Log Loss: 1.0166


## STEP 3: Upgrade DeBERTa to deberta-v3-base

**The Next Level: Tier 3 - DeBERTa-v3 (The Gold Standard)**

To get the score below 1.00 (and eventually toward 0.90 or lower), we need a model that actually understands language. This is where DeBERTa-v3 comes in.

**What makes DeBERTa-v3 different?**
Unlike XGBoost, which looks at a table of numbers, DeBERTa "reads" the text using Self-Attention. It looks at every word in the context of every other word.

**The "Cross-Encoder" Strategy**
In this competition, the most powerful way to use DeBERTa is as a Cross-Encoder. Instead of looking at Response A and Response B separately, we feed them to the model together so it can compare them side-by-side.

**We format the input like this:**
[CLS] Prompt [SEP] Response A [SEP] Response B [SEP]

The model then uses its attention mechanism to "look back and forth" between A and B to decide which one answers the Prompt better.

**Upgrade from small to base:**
- `deberta-v3-small` is the "Lite" version
- `deberta-v3-base` is much larger and more powerful
- Cost: Reduce `per_device_train_batch_size` to 2 and increase `gradient_accumulation_steps` to 8

In [11]:
# ==========================================
# DEBERTA CONFIGURATION
# ==========================================
# UPGRADED: Using deberta-v3-base instead of small
MODEL_NAME = "microsoft/deberta-v3-base"  # UPGRADED from deberta-v3-small
CHECKPOINT_PATH = "/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/"

# For inference mode, you may need to update this path to your new base model checkpoint
# CHECKPOINT_PATH = "./results/checkpoint-XXXX"  # Update after training

In [12]:
# ==========================================
# PREPARE DEBERTA DATASET (WITH AUGMENTATION)
# ==========================================
print("\n--- Preparing DeBERTa Dataset ---")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    inputs = [
        f"{p} [SEP] {a} [SEP] {b}" 
        for p, a, b in zip(examples['prompt'], examples['response_a'], examples['response_b'])
    ]
    # OPTIMIZED: Reduced from 1024 to 512 - cuts training time significantly
    # 512 tokens captures most response content while being 4x faster
    return tokenizer(inputs, truncation=True, padding='max_length', max_length=512)

# Create datasets for train and validation
# Training uses augmented data
train_df = df_augmented.iloc[train_indices][['prompt', 'response_a', 'response_b', 'labels', 'orig_id']]
val_df = df_augmented.iloc[val_indices][['prompt', 'response_a', 'response_b', 'labels', 'orig_id']]

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# Tokenize
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

print(f"Tokenized train size: {len(tokenized_train)}")
print(f"Tokenized val size: {len(tokenized_val)}")


--- Preparing DeBERTa Dataset ---


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/91964 [00:00<?, ? examples/s]

Map:   0%|          | 0/11495 [00:00<?, ? examples/s]

Tokenized train size: 91964
Tokenized val size: 11495


In [13]:
# ==========================================
# TRAIN/LOAD DEBERTA
# ==========================================
if TRAIN_MODEL:
    print("\n--- MODE: RESUMING TRAINING DeBERTa-v3-base ---")
    model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_PATH, num_labels=3)

    
    # OPTIMIZED FOR: 2x T4 GPUs + doubled data + deberta-v3-base
    # Estimated training time: ~8-10 hours with max_steps limit
    args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,                    # Slightly higher for faster convergence
        per_device_train_batch_size=4,         # 4 per GPU x 2 GPUs = 8 total
        per_device_eval_batch_size=8,
        num_train_epochs=2,                    # Reduced from 3.5 - usually sufficient
        weight_decay=0.01,
        eval_strategy="steps",                 # More frequent evaluation
        eval_steps=500,                        # Evaluate every 500 steps
        save_strategy="steps",
        save_steps=500,                        # Checkpoint every 500 steps
        save_total_limit=3,                    # Keep last 3 checkpoints
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=True,
        gradient_accumulation_steps=4,         # Effective batch = 4*4*2 = 32
        report_to="none",
        warmup_ratio=0.06,                     # Less warmup for fewer epochs
        lr_scheduler_type="cosine",
        max_steps=6000,                        # SAFETY LIMIT - prevents timeout
        logging_steps=100,                     # Monitor progress
        dataloader_num_workers=4,              # Faster data loading
    )
else:
    print("\n--- MODE: INFERENCE (Loading Checkpoint) ---")
    # Note: Update CHECKPOINT_PATH to your trained base model checkpoint
    try:
        model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_PATH)
    except:
        print("Warning: Checkpoint not found, loading from base model")
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
    
    args = TrainingArguments(
        output_dir="./temp",
        per_device_eval_batch_size=8,
        fp16=True,
        report_to="none"
    )

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)


--- MODE: RESUMING TRAINING DeBERTa-v3-base ---


  trainer = Trainer(


In [14]:
# ==========================================
# EXECUTE TRAINING
# ==========================================
if TRAIN_MODEL:
    print("Fresh training since the learning rate is changed but the previous model weights from the checkpoint-5748 will still be used, but the optimizer/scheduler will reset — which is what we want with new hyperparameters")
    trainer.train()
    
    trainer.save_model("./results/final_model")
    metrics = trainer.evaluate()
    print(f"DeBERTa Validation Log Loss: {metrics['eval_loss']:.4f}")
else:
    print("Skipping training.")

# Get DeBERTa predictions on validation set
print("\nGenerating DeBERTa predictions...")
deberta_output = trainer.predict(tokenized_val)
deberta_probs = softmax(deberta_output.predictions, axis=1)

Fresh training since the learning rate is changed but the previous model weights from the checkpoint-5748 will still be used, but the optimizer/scheduler will reset — which is what we want with new hyperparameters


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss
500,1.0006,1.085784
1000,0.9858,1.151169
1500,0.9682,1.131056
2000,0.9274,1.163157
2500,0.9232,1.168891
3000,0.7937,1.28787
3500,0.7656,1.415448
4000,0.7401,1.379378
4500,0.7362,1.371006
5000,0.7226,1.442357


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

DeBERTa Validation Log Loss: 1.0858

Generating DeBERTa predictions...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [15]:
# ==========================================
# FIND OPTIMAL BLEND
# ==========================================
print("\n--- Searching for Optimal Blend ---")
best_score = 999
best_w = 0

for w in np.linspace(0, 1, 21):  # More granular search: 0.0, 0.05, 0.10 ... 1.0
    blend = (w * probs_xgb) + ((1-w) * deberta_probs)
    score = log_loss(y_val, blend)
    if w in [0.0, 0.3, 0.5, 0.7, 1.0]:  # Print selected weights
        print(f"XGB Weight {w:.2f}: Log Loss {score:.4f}")
    
    if score < best_score:
        best_score = score
        best_w = w

print(f"\n✅ Best Weight: {best_w:.2f} XGB / {1-best_w:.2f} DeBERTa")
print(f"✅ Final Blended Log Loss: {best_score:.4f}")

# Store the best weight for submission
BEST_XGB_WEIGHT = best_w


--- Searching for Optimal Blend ---
XGB Weight 0.00: Log Loss 1.0858
XGB Weight 0.50: Log Loss 1.0259
XGB Weight 1.00: Log Loss 1.0166

✅ Best Weight: 0.85 XGB / 0.15 DeBERTa
✅ Final Blended Log Loss: 1.0146




## STEP 4: Create Kaggle Submission

**The Final Boss — Making a Submission**

Before you improve the score, you should learn how to actually submit to Kaggle. The test.csv file has no labels. You must:

1. Load test.csv
2. Run the same cleaning and feature engineering
3. Get probabilities from XGBoost and DeBERTa
4. Blend them using your optimal weights
5. Save a submission.csv with columns: `id`, `winner_model_a`, `winner_model_b`, `winner_tie`

In [16]:
# ==========================================
# SUBMISSION PIPELINE
# ==========================================
print("\n" + "="*50)
print("GENERATING SUBMISSION")
print("="*50)

# 1. Load the Test Data
test_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
print(f"Test data size: {len(test_df)}")

# 2. Apply the SAME cleaning function
print("Cleaning test data...")
test_df['prompt'] = test_df['prompt'].apply(clean_text)
test_df['response_a'] = test_df['response_a'].apply(clean_text)
test_df['response_b'] = test_df['response_b'].apply(clean_text)

# 3. Generate ALL features for test set (using the same function)
print("Extracting features...")
test_df = extract_features(test_df)

# 4. Transform Text using the ALREADY FITTED TF-IDF
X_test_a = tfidf.transform(test_df['response_a'])
X_test_b = tfidf.transform(test_df['response_b'])
X_test_diff = X_test_a - X_test_b

# Combine with meta features
X_test_final = hstack([X_test_diff, csr_matrix(test_df[meta_features].values)])

# 5. Get XGBoost Predictions
print("Getting XGBoost predictions...")
xgb_test_probs = model_xgb.predict_proba(X_test_final)

# 6. Get DeBERTa Predictions
print("Getting DeBERTa predictions...")
test_ds = Dataset.from_pandas(test_df[['prompt', 'response_a', 'response_b']])
tokenized_test = test_ds.map(preprocess_function, batched=True)

deberta_test_output = trainer.predict(tokenized_test)
deberta_test_probs = softmax(deberta_test_output.predictions, axis=1)

# 7. Apply the Optimal Blend
print(f"Blending with weights: {BEST_XGB_WEIGHT:.2f} XGB / {1-BEST_XGB_WEIGHT:.2f} DeBERTa")
final_probs = (BEST_XGB_WEIGHT * xgb_test_probs) + ((1-BEST_XGB_WEIGHT) * deberta_test_probs)

# 8. Create Submission File
submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': final_probs[:, 0],
    'winner_model_b': final_probs[:, 1],
    'winner_tie': final_probs[:, 2]
})

submission.to_csv('submission.csv', index=False)
print("\n✅ Submission file saved: submission.csv")
print(submission.head())


GENERATING SUBMISSION
Test data size: 3
Cleaning test data...
Extracting features...
Getting XGBoost predictions...
Getting DeBERTa predictions...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Blending with weights: 0.85 XGB / 0.15 DeBERTa

✅ Submission file saved: submission.csv
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.281859        0.320766    0.397374
1   211333        0.372683        0.246346    0.380971
2  1233961        0.352883        0.346218    0.300898


## Summary of Changes Made:

### 1. Data Augmentation (Swap Trick)
- Created `df_swapped` by swapping response_a and response_b
- Flipped labels accordingly (A wins ↔ B wins, Tie stays same)
- Combined original + swapped = 2x training data

### 2. Better XGBoost Features
- **Punctuation counts**: exclamation marks, question marks, colons
- **URL detection**: `has_url_a/b`, `url_count_a/b`
- **LaTeX/Math detection**: `has_latex_a/b`, `latex_count_a/b`
- **Capitalization ratio**: `cap_ratio_a/b/diff`
- **Code block detection**: `has_code_a/b`, `code_blocks_a/b`
- **Structure features**: `sentence_count_a/b`, `newline_a/b`

### 3. Upgraded DeBERTa
- Changed from `deberta-v3-small` to `deberta-v3-base`
- Reduced `per_device_train_batch_size` from 4 to 2
- Increased `gradient_accumulation_steps` from 4 to 8
- Added warmup and cosine learning rate schedule

### 4. Submission Pipeline
- Complete pipeline for processing test.csv
- Uses optimal blend weights found during validation
- Outputs properly formatted submission.csv

In [17]:
# Print final summary
print("\n" + "="*50)
print("FINAL SUMMARY")
print("="*50)
print(f"Training data size: {len(train_indices)} (with augmentation)")
print(f"Validation data size: {len(val_indices)}")
print(f"Total features: {X_final.shape[1]}")
print(f"XGBoost Log Loss: {xgb_score:.4f}")
print(f"Optimal blend: {BEST_XGB_WEIGHT:.2f} XGB / {1-BEST_XGB_WEIGHT:.2f} DeBERTa")
print(f"Best blended Log Loss: {best_score:.4f}")


FINAL SUMMARY
Training data size: 91964 (with augmentation)
Validation data size: 11495
Total features: 5036
XGBoost Log Loss: 1.0166
Optimal blend: 0.85 XGB / 0.15 DeBERTa
Best blended Log Loss: 1.0146


In [18]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv
/kaggle/input/llm-classifier-finetuning-continued/__notebook__.ipynb
/kaggle/input/llm-classifier-finetuning-continued/__output__.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/spm.model
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/config.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/trainer_state.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/training_args.bin
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/tokenizer.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/tokenizer_config.json
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint-5748/scaler.pt
/kaggle/input/llm-classifier-finetuning-continued/results/checkpoint