# Udhgam 2.0 ML Challenge
## Team Name: Encoder-Decoder

### Overview & Approach
The goal of this competition was to predict a "prompt quality" score from obfuscated token sequences without access to the original vocabulary or pre-trained embeddings (like BERT).

To solve this, we developed a CRNN-Attention Hybrid Architecture that combines:

1) Learned Embeddings: Training fresh embeddings from scratch to capture token relationships.

2) 1D-CNN: To capture local n-gram patterns (phrasing/texture).

3) Bi-GRU: To capture long-range sequential dependencies and flow.

4) Attention Mechanism: To allow the model to dynamically focus on the most critical tokens in the prompt.

5) Meta-Features: Explicitly injecting statistical signals (length, complexity, lexical diversity) into the final layer.

Key Optimization Techniques:

* Pseudo-Labeling: We used the predictions of our previous best submission as soft labels for the test set, enlarging our training set with these pseudo-labeled examples.

* Test-Time Augmentation (TTA): We used Monte Carlo Dropout (inference with dropout enabled) and averaged 10 stochastic forward passes per fold to simulate an ensemble of 10 models, significantly reducing variance.

# Configuration

In [None]:
import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW, lr_scheduler
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import os
import gc

# ==========================================
# CONFIGURATION
# ==========================================
class Config:
    """Single home for every path and hyperparameter used in this notebook."""

    # --- Data locations (adjust to your environment) ---
    TRAIN_PATH = '/kaggle/input/ml-challenge-udhgam-2/train.jsonl'
    TEST_PATH = '/kaggle/input/ml-challenge-udhgam-2/test.jsonl'

    # Earlier best submission: its predictions are reused as pseudo-labels
    # and as the "anchor" in the final blend.
    ANCHOR_SUB_PATH = 'best_submission.csv'

    # --- Network dimensions ---
    MAX_LEN = 128        # sequences are padded / truncated to this many tokens
    EMBED_DIM = 256
    HIDDEN_DIM = 256
    DROPOUT = 0.3

    # --- Optimisation ---
    BATCH_SIZE = 64
    EPOCHS = 6
    LR = 5e-4
    N_FOLDS = 5

    # --- Reproducibility ---
    SEED = 2026

    # --- Inference: number of stochastic forward passes averaged per fold ---
    TTA_STEPS = 10

    # Prefer the GPU when one is visible, otherwise run on the CPU.
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def set_seed(seed):
    """Seed every random number generator the pipeline can draw from.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU plus all CUDA devices).
    """
    import random  # local import: not part of the notebook's import cell

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels between runs, which would
    # undermine the deterministic flag set above.
    torch.backends.cudnn.benchmark = False

# Seed the random number generators up front, before any shuffling or
# model initialisation takes place further down.
set_seed(Config.SEED)
# Report which device (GPU if available, otherwise CPU) the run will use.
print(f"Running on Device: {Config.DEVICE}")

# DATA LOADING & PRE-PROCESSING

In [None]:
# DATA LOADING & PRE-PROCESSING

def load_data(path):
    """Read a JSON Lines file into a DataFrame, one row per JSON record.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        pd.DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so parsing does not depend on the platform's locale.
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) carry no record; skip them
        # instead of letting json.loads fail on empty input.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

# 1. Load Raw Data
train_df = load_data(Config.TRAIN_PATH)
test_df = load_data(Config.TEST_PATH)

# Flag the real training rows up front so the 'is_pseudo' column exists
# whether or not an anchor submission is available.
train_df['is_pseudo'] = 0

# 2. Pseudo-Labeling
# We merge the predictions from our previous best model to use as soft targets
if os.path.exists(Config.ANCHOR_SUB_PATH):
    print(f"Loading Anchor Submission for Pseudo-Labeling: {Config.ANCHOR_SUB_PATH}")
    anchor_sub = pd.read_csv(Config.ANCHOR_SUB_PATH)
    # One prediction per example: duplicate ids would multiply test rows in the merge.
    anchor_labels = anchor_sub[['example_id', 'label']].drop_duplicates('example_id')
    test_df = test_df.merge(anchor_labels, on='example_id', how='left')
    test_df['is_pseudo'] = 1

    # A left merge leaves NaN labels for test rows missing from the anchor file;
    # training on those would turn the loss into NaN, so they are left out.
    pseudo_df = test_df[test_df['label'].notna()]

    # Combine real and pseudo-labeled rows into one training frame
    full_train_df = pd.concat([train_df, pseudo_df], axis=0).reset_index(drop=True)
else:
    print("WARNING: Anchor submission not found. Training on labeled data only.")
    full_train_df = train_df.reset_index(drop=True)

# 3. Meta-Feature Extraction
def get_meta_features(df):
    """Compute six statistical features for each token sequence.

    Per row of ``df['input_ids']`` the features are, in order:
    length, number of unique tokens, unique/length ratio, first token id,
    last token id, and the standard deviation of the token ids.

    An empty sequence yields an all-zero feature row instead of raising.

    Args:
        df: DataFrame with an ``input_ids`` column of token-id sequences.

    Returns:
        np.ndarray of dtype float with shape ``(len(df), 6)``.
    """
    feats = []
    for ids in df['input_ids']:
        length = len(ids)
        if length == 0:
            # No tokens: nothing to index or divide by, so emit neutral zeros.
            feats.append([0, 0, 0.0, 0, 0, 0.0])
            continue
        unique = len(set(ids))
        feats.append([length, unique, unique / length, ids[0], ids[-1], np.std(ids)])
    # reshape keeps the (n, 6) layout even when df has no rows.
    return np.array(feats, dtype=float).reshape(-1, 6)

print("Generating Meta Features...")
full_meta = get_meta_features(full_train_df)
test_meta_only = get_meta_features(test_df)

# Normalize Meta Features (Critical for Neural Network stability)
# The scaler is fitted on the training frame and reused unchanged for the test rows.
scaler = StandardScaler()
full_meta = scaler.fit_transform(full_meta)
test_meta_only = scaler.transform(test_meta_only)

# Determine Vocabulary Size
# Scan the test sequences as well as the training frame: without an anchor
# submission the training frame holds no test rows, and an unseen larger
# token id would index past the end of the embedding table.
# `default=0` keeps empty sequences / frames from raising.
max_token_id = max(
    max((max(ids, default=0) for ids in frame['input_ids']), default=0)
    for frame in (full_train_df, test_df)
)
VOCAB_SIZE = max_token_id + 2
print(f"Vocabulary Size: {VOCAB_SIZE}")

# 4. Dataset Class
class UnifiedDataset(Dataset):
    """Serves fixed-length token tensors with their meta-feature rows.

    Training datasets also expose the ``label`` column; test datasets return
    a constant ``0.0`` placeholder label so batches keep the same keys.
    """

    def __init__(self, df, meta, max_len, is_test=False):
        self.is_test = is_test
        self.max_len = max_len
        self.meta = meta
        self.input_ids = df['input_ids'].values
        # The label column is only read for training data.
        if not is_test:
            self.labels = df['label'].values

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        # Cut to at most max_len tokens, then right-pad with id 0 up to max_len.
        tokens = self.input_ids[idx][:self.max_len]
        tokens = tokens + [0] * (self.max_len - len(tokens))

        sample = {
            'input_ids': torch.tensor(tokens, dtype=torch.long),
            'meta': torch.tensor(self.meta[idx], dtype=torch.float),
        }
        if self.is_test:
            sample['label'] = torch.tensor(0.0)
        else:
            sample['label'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Model Architecture: CRNN-Attention
This is the core of our solution. We combined Convolutional layers with Recurrent layers to get the best of both worlds:

* Conv1D: Extracts local features (like n-grams) efficiently.

* Bi-GRU: Captures global context and sequence flow.

* Attention: Computes a weighted sum of the hidden states, allowing the model to emphasise important tokens dynamically.

In [None]:
class CRNNAttention(nn.Module):
    """Embedding -> Conv1d -> bidirectional GRU -> attention pooling -> regressor.

    The pooled sequence representation is concatenated with a projection of
    the meta-features and mapped to a single score in [0, 1].

    Args:
        vocab_size: Number of rows in the embedding table (id 0 is padding).
        embed_dim: Width of the learned token embeddings.
        hidden_dim: Channels of the conv layer and hidden size of the GRU.
        meta_dim: Number of meta-features per example.
        dropout: Dropout probability in the regression head.
        gru_dropout: Dropout probability between the two GRU layers.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, meta_dim,
                 dropout=0.3, gru_dropout=0.2):
        super().__init__()
        # 1. Learnable Embeddings (row 0 is reserved for padding)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # 2. Convolutional Layer (Local Pattern Extraction)
        # kernel_size=3 with padding=1 keeps the sequence length unchanged.
        self.conv1 = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(hidden_dim)

        # 3. Recurrent Layer (Sequence Modeling)
        self.gru = nn.GRU(hidden_dim, hidden_dim, num_layers=2,
                          batch_first=True, bidirectional=True, dropout=gru_dropout)

        # 4. Attention Mechanism: one score per time step, softmax over the
        # length axis (dim=1). No padding mask is applied here.
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim * 2, 64),
            nn.Tanh(),
            nn.Linear(64, 1),
            nn.Softmax(dim=1)
        )

        # 5. Meta-Feature Fusion Head
        self.meta_fc = nn.Linear(meta_dim, 32)

        # 6. Final Regressor
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2 + 32, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, meta):
        """Score a batch of token sequences.

        Args:
            input_ids: Long tensor of token ids, [Batch, Len].
            meta: Float tensor of meta-features, [Batch, meta_dim].

        Returns:
            Float tensor of shape [Batch] with values in [0, 1].
        """
        # Embed: [Batch, Len, Dim]
        x = self.embedding(input_ids)

        # Conv expects channels first: [Batch, Dim, Len]
        x = x.permute(0, 2, 1)
        x = torch.relu(self.bn1(self.conv1(x)))
        x = x.permute(0, 2, 1)  # Back to [Batch, Len, Dim]

        # GRU: Output [Batch, Len, Hidden*2]
        h, _ = self.gru(x)

        # Attention: weights [Batch, Len, 1] that sum to 1 over Len
        attn_weights = self.attention(h)
        # Context: weighted sum of hidden states -> [Batch, Hidden*2]
        context = torch.sum(h * attn_weights, dim=1)

        # Process Meta-Features
        m = torch.relu(self.meta_fc(meta))

        # Concatenate Context + Meta
        combined = torch.cat((context, m), dim=1)

        # Output Score (Sigmoid ensures 0-1 range).
        # squeeze(-1) drops only the feature axis; a bare squeeze() would also
        # drop the batch axis for a single-example batch and return a 0-d tensor.
        return torch.sigmoid(self.fc(combined)).squeeze(-1)

# TRAINING & INFERENCE UTILS

In [None]:
# TRAINING & INFERENCE UTILS

# Smooth L1 (Huber-style) loss; with beta=0.01 it is quadratic only for
# absolute errors below 0.01 and linear beyond that.
criterion = nn.SmoothL1Loss(beta=0.01)

def train_fn(model, loader, optimizer, scheduler, criterion, device):
    """Run one training epoch and report the average loss.

    The scheduler is stepped after every batch, i.e. it must be configured
    per optimizer step rather than per epoch.

    Args:
        model: Module called as ``model(input_ids, meta)``.
        loader: Iterable of batches with 'input_ids', 'meta' and 'label' tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Loss called as ``criterion(output, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sample-weighted mean of the per-batch losses (assumes the
        criterion returns a mean-reduced scalar), or 0.0 for an empty loader.
    """
    model.train()
    weighted_loss = 0.0
    seen = 0
    for batch in loader:
        ids = batch['input_ids'].to(device)
        meta = batch['meta'].to(device)
        # Flatten predictions and targets to the same 1-D shape so the loss
        # never broadcasts (e.g. a 0-d or [B, 1] output against [B] labels).
        labels = batch['label'].to(device).reshape(-1)

        optimizer.zero_grad()
        output = model(ids, meta).reshape(-1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Accumulate as a detached tensor to avoid a host sync on every step.
        count = labels.numel()
        weighted_loss = weighted_loss + loss.detach() * count
        seen += count

    if seen == 0:
        return 0.0
    return float(weighted_loss) / seen

# --- MAIN EXECUTION LOOP ---
kf = KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.SEED)
final_preds = np.zeros(len(test_df))

# The test set is the same for every fold, so its dataset/loader is built once.
real_test_ds = UnifiedDataset(test_df, test_meta_only, Config.MAX_LEN, is_test=True)
test_loader = DataLoader(real_test_ds, batch_size=Config.BATCH_SIZE, shuffle=False)

print(f"Starting Training with {Config.N_FOLDS}-Fold CV...")

# Only the training split of each fold is used; the held-out indices are ignored.
for fold, (train_idx, _val_idx) in enumerate(kf.split(full_train_df)):
    print(f"Fold {fold+1}/{Config.N_FOLDS}")

    # Create Datasets
    train_ds = UnifiedDataset(full_train_df.iloc[train_idx], full_meta[train_idx], Config.MAX_LEN)
    train_loader = DataLoader(train_ds, batch_size=Config.BATCH_SIZE, shuffle=True)

    # Initialize Model (meta width is taken from the feature matrix itself)
    model = CRNNAttention(VOCAB_SIZE, Config.EMBED_DIM, Config.HIDDEN_DIM, full_meta.shape[1]).to(Config.DEVICE)
    optimizer = AdamW(model.parameters(), lr=Config.LR, weight_decay=1e-4)
    scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=Config.LR, steps_per_epoch=len(train_loader), epochs=Config.EPOCHS)

    # Train
    for epoch in range(Config.EPOCHS):
        train_fn(model, train_loader, optimizer, scheduler, criterion, Config.DEVICE)

    # --- TTA INFERENCE ---
    # Monte Carlo Dropout: only the dropout-bearing modules stay stochastic.
    # Putting the whole model in .train() would also make BatchNorm normalise
    # with test-batch statistics and overwrite its running averages.
    model.eval()
    for module in model.modules():
        if isinstance(module, (nn.Dropout, nn.GRU)):
            module.train()

    fold_tta_preds = np.zeros(len(test_df))

    with torch.no_grad():
        for _ in range(Config.TTA_STEPS):
            batch_preds = []
            for batch in test_loader:
                ids = batch['input_ids'].to(Config.DEVICE)
                meta = batch['meta'].to(Config.DEVICE)
                out = model(ids, meta)
                # reshape(-1) keeps a 1-D array even if a batch holds one example.
                batch_preds.append(out.reshape(-1).cpu().numpy())
            fold_tta_preds += np.concatenate(batch_preds)

    # Average the TTA predictions
    final_preds += (fold_tta_preds / Config.TTA_STEPS) / Config.N_FOLDS

    # Cleanup (the scheduler holds a reference to the optimizer, so drop it too)
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

print("Training Complete.")

In [None]:
# FINAL BLEND & CALIBRATION

print("Blending Predictions...")

if os.path.exists(Config.ANCHOR_SUB_PATH):
    # Weighted Blend: 85% Anchor (Stability) + 15% New TTA (Refinement)
    # This conservative blend protects against overfitting while adding the TTA gains
    anchor_sub = pd.read_csv(Config.ANCHOR_SUB_PATH).drop_duplicates('example_id')
    # Align anchor predictions to the test rows by example_id rather than by
    # row position, so a differently ordered CSV cannot mix up examples.
    anchor_preds = (
        anchor_sub.set_index('example_id')['label']
        .reindex(test_df['example_id'])
        .to_numpy(dtype=float)
    )
    blended = (0.85 * anchor_preds) + (0.15 * final_preds)
    # Test rows missing from the anchor file fall back to the new model alone.
    final_blend = np.where(np.isnan(anchor_preds), final_preds, blended)
else:
    # Copy so the calibration below does not modify final_preds.
    final_blend = final_preds.copy()

# Calibration: Shift predictions to match the Training Mean
# This corrects systematic bias (e.g., if model consistently predicts too low)
# train_df holds only the real labeled rows, so this also works when no
# anchor submission (and hence no pseudo-labeled data) exists.
train_mean = train_df['label'].mean()
pred_mean = final_blend.mean()
diff = train_mean - pred_mean

print(f"Calibration Shift: {diff:.5f}")
final_blend = final_blend + diff

# Clip to ensure valid range
final_blend = np.clip(final_blend, 0, 1)

# Save Submission
submission = pd.DataFrame({
    'example_id': test_df['example_id'],
    'label': final_blend
})

submission.to_csv('submission.csv', index=False)
print("Submission Saved: submission.csv")