In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
# for dirname, _, filenames in os.walk('.'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# !pip install pytorch_metric_learning


# ---- Imports
import random, math
import numpy as np
import pandas as pd
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from transformers import AutoTokenizer, AutoModel
from transformers import get_linear_schedule_with_warmup


# -----------------------------
# Load and preprocess data
# -----------------------------
# Use Kaggle paths when running on Kaggle
# MODEL_PATH = "/kaggle/input/xlm-roberta-base-offline/xlm_roberta_base_offline"
# MODEL_PATH = "C:/Users/satra/Downloads/xlm_roberta_base_offline"
MODEL_PATH = "xlm-roberta-base"


# trn = "/kaggle/input/jigsaw-agile-community-rules/train.csv"
# tst = "/kaggle/input/jigsaw-agile-community-rules/test.csv"
# trn = "/content/drive/MyDrive/Colab Notebooks/train.csv"
# tst = "/content/drive/MyDrive/Colab Notebooks/test.csv"
trn = "C:/Users/satra/Downloads/jigsaw-agile-community-rules/train.csv"
tst = "C:/Users/satra/Downloads/jigsaw-agile-community-rules/test.csv"

# Training frame: read the CSV, keep a reproducible 1% random subsample
# (random_state=42), and renumber the surviving rows from 0.
df_trn = (
    pd.read_csv(trn)
    .sample(frac=.01, random_state=42)
    .reset_index(drop=True)
)

# Test frame is used in full.
df_tst = pd.read_csv(tst)


def fill_empty_examples_pandas(df):
    """
    Normalise the four example columns and backfill blanks from their sibling.

    Missing values become '' and every value is cast to str. Afterwards an
    empty ``*_example_1`` takes the value of ``*_example_2``, and then an empty
    ``*_example_2`` takes the (possibly just-filled) value of ``*_example_1``.
    When both siblings are empty they stay empty.

    The frame is modified in place and the same object is returned.
    """
    sibling_pairs = (
        ('positive_example_1', 'positive_example_2'),
        ('negative_example_1', 'negative_example_2'),
    )

    # Pass 1: NaN -> '' and force str on all four columns before any comparison.
    for column in (name for pair in sibling_pairs for name in pair):
        df[column] = df[column].fillna('').astype(str)

    # Pass 2: keep non-empty cells, otherwise borrow from the sibling column.
    for first, second in sibling_pairs:
        df[first] = df[first].where(df[first] != '', df[second])
        df[second] = df[second].where(df[second] != '', df[first])

    return df


def get_text(value):
    """Return ``value`` as a string; missing values (as judged by pandas) become ''."""
    if pd.isna(value):
        return ''
    return str(value)


def extract_texts(row):
    """
    Collect the text fields of one row into a flat dict of strings.

    Output keys (in order): body, rule, subreddit, pos1, pos2, neg1, neg2.
    Each value is the corresponding row field passed through ``get_text``,
    so missing values come out as ''.
    """
    # (output key, source column) in the order the keys must appear.
    key_to_column = (
        ("body", "body"),
        ("rule", "rule"),
        ("subreddit", "subreddit"),
        ("pos1", "positive_example_1"),
        ("pos2", "positive_example_2"),
        ("neg1", "negative_example_1"),
        ("neg2", "negative_example_2"),
    )
    return {key: get_text(row[column]) for key, column in key_to_column}

# Backfill blank example columns on both splits (train first, then test).
df_trn, df_tst = (fill_empty_examples_pandas(split) for split in (df_trn, df_tst))

# Attach the per-row text bundle produced by extract_texts to both splits.
df_trn["inputs"], df_tst["inputs"] = (
    split.apply(extract_texts, axis=1) for split in (df_trn, df_tst)
)

# Cross-validation settings.
N_EPOCHS, k_folds = 8, 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=k_folds)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ================================
# Hybrid Jigsaw ACR
# ================================

# ------------------
# CONFIG (edit here)
# ------------------
@dataclass
class CFG:
    """Hyperparameters and runtime settings read by the dataset, model, loss and training loop below."""
    # Hugging Face model id used for both the tokenizer and the encoder.
    model_name: str = "xlm-roberta-base"  # or "distilroberta-base", etc.
    # Every tokenized sequence is padded/truncated to exactly this many tokens.
    max_len: int = 256
    # Number of negative rules sampled per example.
    num_negatives: int = 4               # K (must be constant)
    # DataLoader batch size (same for train, validation and test).
    batch_size: int = 16
    # Number of training epochs run by train_hybrid.
    epochs: int = 3
    # AdamW learning rate and weight decay.
    lr: float = 2e-5
    wd: float = 0.01
    # Fraction of the computed total training steps used for linear warmup.
    warmup_ratio: float = 0.1
    # Dropout probability applied to the encoder's first-token hidden state.
    dropout: float = 0.1
    # Output width of both projection heads (pair and classification).
    proj_dim: int = 256
    # Enables CUDA autocast during training and the GradScaler.
    use_amp: bool = True
    # Max gradient norm for clipping; None skips clipping.
    grad_clip: float = 1.0
    # Margin used by pairwise_margin_loss.
    pair_margin: float = 0.2
    loss_alpha_cls: float = 0.6          # blend: alpha * BCE + (1-alpha) * pair
    # Seed for set_seed and for the dataset's negative sampling.
    seed: int = 42
    # DataLoader worker processes.
    num_workers: int = 2
    # Evaluated once, when the class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

# Rebinds the name CFG from the class to a default-constructed instance;
# all later code reads settings from this instance.
CFG = CFG()

# ----------------
# Reproducibility
# ----------------
def set_seed(s=42):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with ``s`` and force deterministic cuDNN."""
    # Same seed for every random number generator, in a fixed order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(s)

    # Deterministic kernels on, auto-tuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs once, before any model or data object is created.
set_seed(CFG.seed)

# -------------
# Tokenizer
# -------------
# Loaded from CFG.model_name (the same id the encoder uses); the MODEL_PATH
# constant defined near the top of the notebook is not consulted here.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

# ---------------------
# Dataset (triples, K-)
# ---------------------
class RuleTripleDataset(Dataset):
    """
    Map-style dataset of (anchor, positive rule, K negative rules) triples.

    Each item is a dict with:
      - anchor_input_ids / anchor_attention_mask : body + SEP + rule, shape [L]
      - pos_input_ids / pos_attention_mask       : the row's own rule, shape [L]
      - neg_input_ids / neg_attention_mask       : K other rules, shape [K, L]
      - label                                    : float32 scalar (omitted if is_test=True)

    Expects df with columns 'body', 'rule', and (unless is_test) 'rule_violation'.

    Negative sampling is done once in __init__ with a seeded generator, so the
    negatives of a given row are stable across epochs. Negatives are drawn from
    rows whose rule text differs from the row's own rule; if no such row exists
    the sampler falls back to any other row, and for a single-row frame to the
    row itself, so construction never fails on small frames.
    """
    def __init__(self, df, tokenizer, num_negatives=4, max_len=256, is_test=False, seed=42):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.num_negatives = num_negatives
        self.max_len = max_len
        self.is_test = is_test

        self.all_rules = self.df["rule"].astype(str).tolist()
        self.neg_indices = self._sample_negative_indices(np.random.default_rng(seed))

        # robust sep
        self.sep = self.tokenizer.sep_token or " </s> "

    def _sample_negative_indices(self, rng):
        """Return, for every row, a list of `num_negatives` row indices to use as negative rules."""
        n = len(self.all_rules)
        all_idx = np.arange(n)
        rules = np.asarray(self.all_rules, dtype=object)

        # Rows sharing a rule text share the same candidate pool; compute it once per rule.
        pool_by_rule = {}
        neg_indices = []
        for i, rule in enumerate(self.all_rules):
            pool = pool_by_rule.get(rule)
            if pool is None:
                # Only rows with a *different* rule text are true negatives.
                pool = all_idx[rules != rule]
                pool_by_rule[rule] = pool
            if pool.size == 0:
                # Every row has the same rule: fall back to "any other row".
                pool = np.delete(all_idx, i)
            if pool.size == 0:
                # Single-row frame: reuse the row itself rather than crash.
                pool = all_idx
            # Sample with replacement only when the pool is too small for K distinct picks.
            replace = pool.size < self.num_negatives
            chosen = rng.choice(pool, size=self.num_negatives, replace=replace)
            neg_indices.append(chosen.tolist())
        return neg_indices

    def __len__(self):
        return len(self.df)

    def _tok(self, texts):
        """Tokenize a string (-> [1, L]) or a list of strings (-> [N, L]), padded/truncated to max_len."""
        return self.tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        body = str(row["body"])
        rule = str(row["rule"])

        # Anchor: body + SEP + rule
        anc = self._tok(body + self.sep + rule)
        anchor_input_ids = anc["input_ids"].squeeze(0).to(torch.long)
        anchor_attention_mask = anc["attention_mask"].squeeze(0).to(torch.long)

        # Positive rule text
        pos = self._tok(rule)
        pos_input_ids = pos["input_ids"].squeeze(0).to(torch.long)
        pos_attention_mask = pos["attention_mask"].squeeze(0).to(torch.long)

        # Negatives: K pre-sampled rules
        neg_rules = [self.all_rules[j] for j in self.neg_indices[idx]]
        neg = self._tok(neg_rules)
        neg_input_ids = neg["input_ids"].to(torch.long)                 # [K, L]
        neg_attention_mask = neg["attention_mask"].to(torch.long)       # [K, L]

        item = {
            "anchor_input_ids": anchor_input_ids,
            "anchor_attention_mask": anchor_attention_mask,
            "pos_input_ids": pos_input_ids,
            "pos_attention_mask": pos_attention_mask,
            "neg_input_ids": neg_input_ids,
            "neg_attention_mask": neg_attention_mask,
        }
        if not self.is_test:
            label = float(row["rule_violation"])
            item["label"] = torch.tensor(label, dtype=torch.float32)

        # No per-item printing here: this runs once per sample per epoch.
        return item

# -------------
# Model
# -------------
class DualEncoderHybrid(nn.Module):
    """
    Shared-encoder model with a contrastive branch and a classification branch.

    - One transformer encoder is used for the anchor and for all rule texts.
    - Two projection heads sit on the (dropout-ed) first-token hidden state:
        * proj_pair -> L2-normalized embeddings for the pair/contrastive loss
        * proj_cls  -> unnormalized features fed to the 1-unit BCE head
    - forward returns: logits [B], anchor_emb [B, D], pos_emb [B, D], neg_emb [B, K, D]
    """
    def __init__(self, model_name, hidden_size=None, proj_dim=256, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        if hidden_size is None:
            # Read the width from the encoder config; 768 is only the fallback.
            hidden_size = getattr(self.encoder.config, "hidden_size", 768)
        self.dropout = nn.Dropout(dropout)
        self.proj_pair = nn.Linear(hidden_size, proj_dim)
        self.proj_cls  = nn.Linear(hidden_size, proj_dim)
        self.cls_head  = nn.Linear(proj_dim, 1)

    def _encode_hidden(self, input_ids, attention_mask):
        """Run the encoder and return the dropout-ed hidden state of the first token, shape [N, H]."""
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0]  # CLS
        return self.dropout(cls)

    def forward(
        self,
        anchor_input_ids,
        anchor_attention_mask,
        pos_input_ids,
        pos_attention_mask,
        neg_input_ids,
        neg_attention_mask,
    ):
        """
        Args:
            anchor_input_ids, anchor_attention_mask: [B, L]
            pos_input_ids, pos_attention_mask:       [B, L]
            neg_input_ids, neg_attention_mask:       [B, K, L]

        Returns:
            (logits [B], anchor_emb [B, D], pos_emb [B, D], neg_emb [B, K, D])

        Raises:
            ValueError: if the negatives are not a 3-D [B, K, L] tensor whose
                batch dimension matches the anchor's.
        """
        B = anchor_input_ids.size(0)

        # Validate before spending any encoder passes on a batch that will be rejected.
        if neg_input_ids.dim() != 3:
            raise ValueError("neg_input_ids must be [B, K, L]")
        if neg_input_ids.size(0) != B:
            raise ValueError(
                f"neg_input_ids batch size {neg_input_ids.size(0)} does not match anchor batch size {B}"
            )
        K = neg_input_ids.size(1)

        # Anchor & positive
        h_anchor = self._encode_hidden(anchor_input_ids, anchor_attention_mask)            # [B, H]
        h_pos    = self._encode_hidden(pos_input_ids, pos_attention_mask)                  # [B, H]

        # Negatives: fold K into the batch dimension, encode, then unfold.
        # reshape (not view) so non-contiguous inputs are accepted too.
        h_neg = self._encode_hidden(
            neg_input_ids.reshape(B * K, -1),
            neg_attention_mask.reshape(B * K, -1)
        ).reshape(B, K, -1)                                                                # [B, K, H]

        # Pair embeddings (unit length, so dot product == cosine similarity)
        anchor_emb = nn.functional.normalize(self.proj_pair(h_anchor), dim=-1)             # [B, D]
        pos_emb    = nn.functional.normalize(self.proj_pair(h_pos), dim=-1)                # [B, D]
        neg_emb    = nn.functional.normalize(self.proj_pair(h_neg), dim=-1)                # [B, K, D]

        # Classification branch (anchor only)
        cls_feat = self.proj_cls(h_anchor)
        logits = self.cls_head(cls_feat).squeeze(-1)                                       # [B]
        return logits, anchor_emb, pos_emb, neg_emb

# -------------
# Losses
# -------------
bce_loss_fn = nn.BCEWithLogitsLoss()

def pairwise_margin_loss(anchor_emb, pos_emb, neg_emb, margin=0.2):
    """
    Hinge loss pushing each anchor closer to its positive than to its negatives.

    For every (anchor, negative) pair computes
        max(0, margin + <anchor, negative> - <anchor, positive>)
    and returns the mean over the batch and the K negatives. The dot products
    are cosine similarities when the inputs are already unit-normalized.

    anchor_emb, pos_emb: [B, D]; neg_emb: [B, K, D]; returns a scalar tensor.
    """
    sim_to_pos = (anchor_emb * pos_emb).sum(dim=-1).unsqueeze(1)        # [B, 1]
    sim_to_neg = (anchor_emb.unsqueeze(1) * neg_emb).sum(dim=-1)        # [B, K]
    hinge = torch.relu(margin + sim_to_neg - sim_to_pos)                # [B, K]
    return hinge.mean()

# --------------------
# Training / Evaluation
# --------------------
def make_loader(df, is_test=False):
    """
    Wrap ``df`` in a RuleTripleDataset and return a DataLoader over it.

    Dataset and loader settings come from CFG. Labelled splits (is_test=False)
    are shuffled and keep their workers alive between epochs when workers are
    used; the test split is iterated in order. No batch is dropped.
    """
    dataset = RuleTripleDataset(
        df,
        tokenizer,
        num_negatives=CFG.num_negatives,
        max_len=CFG.max_len,
        is_test=is_test,
        seed=CFG.seed,
    )

    labelled_split = not is_test
    keep_workers = (CFG.num_workers > 0) and labelled_split

    loader_options = dict(
        batch_size=CFG.batch_size,
        shuffle=labelled_split,
        num_workers=CFG.num_workers,
        pin_memory=True,
        persistent_workers=keep_workers,
        drop_last=False,
    )
    return DataLoader(dataset, **loader_options)

def _batch_to_device(batch):
    """Move every tensor of a collated batch dict to CFG.device; other values pass through."""
    return {k: v.to(CFG.device) if torch.is_tensor(v) else v for k, v in batch.items()}


def _model_forward(model, batch):
    """Call the model with the six anchor/pos/neg tensors of a batch dict."""
    return model(
        batch["anchor_input_ids"], batch["anchor_attention_mask"],
        batch["pos_input_ids"],    batch["pos_attention_mask"],
        batch["neg_input_ids"],    batch["neg_attention_mask"],
    )


def run_one_epoch(model, loader, optimizer=None, scaler=None, scheduler=None):
    """
    Run one pass over ``loader``; trains when ``optimizer`` is given, otherwise evaluates.

    Args:
        model: module returning (logits, anchor_emb, pos_emb, neg_emb).
        loader: iterable of batch dicts as produced by RuleTripleDataset.
        optimizer: if not None the model is put in train mode and updated per batch.
        scaler: GradScaler used when CFG.use_amp is set; if None the plain
            backward/step path is used.
        scheduler: optional LR scheduler, stepped once per optimizer step
            (i.e. per batch), which is the unit its step counts are computed in.

    Returns:
        (avg_loss, auc, (all_probs, all_labels)). ``auc`` is None when there are
        no labels or only one class is present, since ROC AUC is undefined then.
    """
    is_train = optimizer is not None
    model.train(is_train)
    use_scaler = CFG.use_amp and scaler is not None

    total_loss, nsteps = 0.0, 0
    all_probs, all_labels = [], []

    for batch in loader:
        batch = _batch_to_device(batch)
        has_label = "label" in batch

        # Gradients are only tracked while training; evaluation builds no autograd graph.
        with torch.set_grad_enabled(is_train), \
                torch.cuda.amp.autocast(enabled=(CFG.use_amp and is_train)):
            logits, a_emb, p_emb, n_emb = _model_forward(model, batch)

            if has_label:
                loss_cls = bce_loss_fn(logits, batch["label"])
            else:
                # test-time: no labels; only the pair term contributes
                loss_cls = torch.zeros((), device=CFG.device)

            loss_pair = pairwise_margin_loss(a_emb, p_emb, n_emb, margin=CFG.pair_margin)
            loss = CFG.loss_alpha_cls * loss_cls + (1 - CFG.loss_alpha_cls) * loss_pair

        if is_train:
            optimizer.zero_grad(set_to_none=True)
            if use_scaler:
                scaler.scale(loss).backward()
                # Unscale first so clipping sees true gradient magnitudes.
                scaler.unscale_(optimizer)
                if CFG.grad_clip is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                if CFG.grad_clip is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
                optimizer.step()
            if scheduler is not None:
                scheduler.step()

        total_loss += loss.item()
        nsteps += 1

        # metrics buffer (only if labels exist)
        if has_label:
            all_probs.extend(torch.sigmoid(logits).detach().cpu().numpy().tolist())
            all_labels.extend(batch["label"].detach().cpu().numpy().tolist())

    avg_loss = total_loss / max(nsteps, 1)
    auc = None
    if all_labels and len(set(all_labels)) > 1:
        auc = roc_auc_score(all_labels, all_probs)
    return avg_loss, auc, (all_probs, all_labels)


def train_hybrid(df_train, df_val, df_test=None):
    """
    Train DualEncoderHybrid on ``df_train``, select the best epoch by validation AUC,
    and optionally predict on ``df_test``.

    Returns:
        (model, test_probs): the model with the best-validation-AUC weights
        restored (last-epoch weights if no AUC could be computed), and a numpy
        array of test probabilities, or None when ``df_test`` is None.
    """
    model = DualEncoderHybrid(
        model_name=CFG.model_name,
        proj_dim=CFG.proj_dim,
        dropout=CFG.dropout
    ).to(CFG.device)

    # Optimizer & Scheduler. Step counts are in optimizer steps (batches), so the
    # scheduler is advanced per batch inside run_one_epoch, not per epoch.
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.wd)
    steps_per_epoch = math.ceil(len(df_train) / CFG.batch_size)
    num_train_steps = steps_per_epoch * CFG.epochs
    num_warmup_steps = int(num_train_steps * CFG.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
    )

    scaler = torch.cuda.amp.GradScaler(enabled=CFG.use_amp)

    # Loaders
    train_loader = make_loader(df_train, is_test=False)
    val_loader   = make_loader(df_val,   is_test=False)
    test_loader  = make_loader(df_test,  is_test=True) if df_test is not None else None

    best_auc, best_state = -1.0, None

    for epoch in range(1, CFG.epochs + 1):
        tr_loss, tr_auc, _ = run_one_epoch(
            model, train_loader, optimizer=optimizer, scaler=scaler, scheduler=scheduler
        )
        val_loss, val_auc, _ = run_one_epoch(model, val_loader, optimizer=None, scaler=None)

        train_auc_txt = f" | TrainAUC {tr_auc:.4f}" if tr_auc is not None else ""
        val_auc_txt = f"{val_auc:.4f}" if val_auc is not None else "n/a"
        print(f"Epoch {epoch:02d} | TrainLoss {tr_loss:.4f}{train_auc_txt}"
              f" | ValLoss {val_loss:.4f} | ValAUC {val_auc_txt}")

        if val_auc is not None and val_auc > best_auc:
            best_auc = val_auc
            # Snapshot on CPU so the best weights do not occupy device memory.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    print(f"Best Val AUC: {best_auc:.4f}" if best_auc >= 0 else "No validation AUC computed.")
    if best_state is not None:
        model.load_state_dict(best_state)

    # Test inference
    test_probs = None
    if test_loader is not None:
        model.eval()
        all_probs = []
        with torch.no_grad():
            for batch in test_loader:
                logits = _model_forward(model, _batch_to_device(batch))[0]
                all_probs.extend(torch.sigmoid(logits).detach().cpu().numpy().tolist())
        test_probs = np.array(all_probs)

    return model, test_probs

# --------------------------
# Convenience: data splitting
# --------------------------
from sklearn.model_selection import train_test_split

def split_train_val(df, val_size=0.1, seed=42):
    """
    Split ``df`` into (train, validation) frames, each with a fresh 0..n-1 index.

    When a 'rule_violation' column is present the split is stratified on that
    column thresholded at 0.5; otherwise it is a plain random split.
    """
    if "rule_violation" in df.columns:
        strat = (df["rule_violation"] > 0.5).astype(int)
    else:
        strat = None

    train_part, val_part = train_test_split(
        df, test_size=val_size, random_state=seed, stratify=strat
    )
    return train_part.reset_index(drop=True), val_part.reset_index(drop=True)

# --------------------------
# Quick Sanity Usage Example
# --------------------------
# Expect df_train to have columns: "body", "rule", "rule_violation"
# Expect df_test  to have columns: "body", "rule" (no label)
#
# Hold out 10% of the (already subsampled) training frame for validation,
# then train and predict on the test frame in one call.
train_df, val_df = split_train_val(df_trn, val_size=0.1, seed=CFG.seed)
model, test_probs = train_hybrid(train_df, val_df, df_tst)
# Write the test probabilities to submission.csv in test-row order.
# NOTE(review): the file contains only a "prediction" column and no row identifier;
# presumably the competition expects an id column next to the score — confirm
# against the sample submission before uploading.
sub = pd.DataFrame({"prediction": test_probs})
sub.to_csv("submission.csv", index=False)
print("Saved submission.csv")


  scaler = torch.cuda.amp.GradScaler(enabled=CFG.use_amp)


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')