In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
save_path = "paraphrase-multilingual-MiniLM-L12-v2"
NLI_BACKBONE = save_path

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


('paraphrase-multilingual-MiniLM-L12-v2\\tokenizer_config.json',
 'paraphrase-multilingual-MiniLM-L12-v2\\special_tokens_map.json',
 'paraphrase-multilingual-MiniLM-L12-v2\\unigram.json',
 'paraphrase-multilingual-MiniLM-L12-v2\\added_tokens.json',
 'paraphrase-multilingual-MiniLM-L12-v2\\tokenizer.json')

In [2]:
# ========= Setup (CPU-friendly) =========
import os, numpy as np, pandas as pd
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"

from sentence_transformers import CrossEncoder
from scipy.special import softmax

# Pick an NLI cross-encoder (English):
# NLI_BACKBONE = "C:/Users/satra/PycharmProjects/kaggle-jigsaw/nli-deberta-v3-local"

# If you need multilingual, try:
# NLI_BACKBONE = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"

# ========= Data =========
TEST_CSV = "C:/Users/satra/PycharmProjects/kaggle-jigsaw/test.csv"
df_test = pd.read_csv(TEST_CSV)

def make_hypothesis(rule, subreddit=None):
    rule = str(rule)
    sub  = str(subreddit) if pd.notna(subreddit) else ""
    return f"This post violates the rule: {rule}." if not sub else \
           f"This post violates the rule: {rule} in subreddit r/{sub}."

premises   = df_test["body"].astype(str).tolist()
hypotheses = [make_hypothesis(r, s) for r, s in zip(df_test["rule"], df_test["subreddit"])]
pairs = list(zip(premises, hypotheses))

# ========= Model =========
# If you uploaded a local copy as a Kaggle Dataset, pass that folder path instead.
nli = CrossEncoder(NLI_BACKBONE)  # CPU by default

# Batched predict (adjust batch_size if RAM allows)
logits = nli.predict(pairs, convert_to_numpy=True, show_progress_bar=True, batch_size=64)

# Most NLI cross-encoders return 3 logits: [entailment, neutral, contradiction].
# If yours differs, print logits.shape and adjust the index below.
probs = softmax(logits, axis=1) if logits.ndim == 2 else np.vstack([logits, 1 - logits]).T
p_entail = probs[:, 0] if probs.shape[1] >= 3 else probs[:, 1]  # entailment column

# ========= Submission =========
sub = pd.DataFrame({"row_id": df_test["row_id"], "rule_violation": p_entail.astype(float)})
sub.to_csv("submission.csv", index=False)
print("submission.csv written", sub.shape)


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]

submission.csv written (10, 2)





In [3]:
import numpy as np
import pandas as pd

# Use the same df you used to create val_rows earlier
# Must contain: 'body', 'rule', 'subreddit', 'rule_violation'
# Example: df_val = df_trn.copy()  # or however you defined your validation slice
TRAIN_CSV = "C:/Users/satra/PycharmProjects/kaggle-jigsaw/train.csv"
df_trn = pd.read_csv(TRAIN_CSV)
df_val = df_trn.copy()  # replace with your actual validation frame

# ----- Build premise/hypothesis pairs -----
def make_hypothesis(rule, subreddit=None):
    r = str(rule)
    s = str(subreddit) if pd.notna(subreddit) else ""
    return f"This post violates the rule: {r}." if not s else \
           f"This post violates the rule: {r} in subreddit r/{s}."

premises   = df_val["body"].astype(str).tolist()
hypotheses = [make_hypothesis(r, s) for r, s in zip(df_val["rule"], df_val["subreddit"])]
pairs_val  = list(zip(premises, hypotheses))

y_val = df_val["rule_violation"].astype(int).to_numpy()

# ----- Load an NLI cross-encoder (use a local path if offline) -----
from sentence_transformers import CrossEncoder
from scipy.special import softmax

# If you’ve saved the model locally for offline use, point to that folder instead:
# NLI_BACKBONE = "/kaggle/input/nli-deberta-v3-local"
NLI_BACKBONE = "cross-encoder/nli-deberta-v3-base"

nli = CrossEncoder(NLI_BACKBONE)  # CPU by default

# Batch predict (adjust batch_size; 64–128 is fine on Kaggle CPU)
logits = nli.predict(pairs_val, convert_to_numpy=True, batch_size=64, show_progress_bar=True)

# Most cross-encoder NLI models return 3 logits [entailment, neutral, contradiction].
# Softmax to probabilities; pick the entailment column.
probs = softmax(logits, axis=1) if logits.ndim == 2 else np.vstack([logits, 1 - logits]).T
# If your model’s label order differs, change the index below accordingly:
p_entail = probs[:, 0] if probs.shape[1] >= 3 else probs[:, 1]

# ----- Evaluate -----
from sklearn.metrics import roc_auc_score, average_precision_score

roc_auc = roc_auc_score(y_val, p_entail)
pr_auc  = average_precision_score(y_val, p_entail)  # useful if class imbalance

print(f"NLI zero-shot ROC AUC: {roc_auc:.4f}")
print(f"NLI zero-shot PR  AUC: {pr_auc:.4f}")


Batches: 100%|██████████| 32/32 [10:28<00:00, 19.65s/it]

NLI zero-shot ROC AUC: 0.5803
NLI zero-shot PR  AUC: 0.5996





In [4]:
from sentence_transformers import CrossEncoder
from scipy.special import softmax
import numpy as np

import numpy as np
import pandas as pd
from sentence_transformers import CrossEncoder
from scipy.special import softmax
from sklearn.metrics import roc_auc_score, average_precision_score

# ---------- 1) Hypothesis builders ----------
def hyp_with_examples(rule, sub=None, pos1="", pos2="", neg1="", neg2="", negate=False):
    """
    Builds a short, information-dense hypothesis.
    negate=False  -> "This post violates the rule ..."
    negate=True   -> "This post does NOT violate the rule ..."
    """
    rule = str(rule or "")
    sub  = str(sub or "")
    base = "does NOT violate" if negate else "violates"
    sub_txt = f" in subreddit r/{sub}" if sub else ""
    # keep examples concise; they often help zero-shot NLI
    pos_bits = [str(x).strip() for x in (pos1, pos2) if isinstance(x, str) and x.strip()]
    neg_bits = [str(x).strip() for x in (neg1, neg2) if isinstance(x, str) and x.strip()]
    extra = []
    if pos_bits: extra.append("e.g. " + " | ".join(pos_bits))
    if neg_bits: extra.append("counterexamples: " + " | ".join(neg_bits))
    extra_txt = (" " + " ".join(extra)) if extra else ""
    return f"This post {base} the rule{sub_txt}: {rule}.{extra_txt}".strip()

# ---------- 2) Robust scorer ----------
def score_pairs(nli: CrossEncoder, premises, hypotheses, batch_size=64):
    """
    Returns (entail, neutral, contradiction) probabilities as arrays.
    Works with common SBERT CrossEncoder NLI heads.
    """
    logits = nli.predict(list(zip(premises, hypotheses)),
                         convert_to_numpy=True, show_progress_bar=True, batch_size=batch_size)

    # Convert to probs
    probs = softmax(logits, axis=1) if logits.ndim == 2 else np.vstack([logits, 1-logits]).T

    # Try to locate label indices from config if present
    entail_idx = 0
    neutral_idx = 1 if probs.shape[1] >= 2 else None
    contra_idx = 2 if probs.shape[1] >= 3 else None

    try:
        id2label = getattr(nli.model.config, "id2label", None)
        if isinstance(id2label, dict) and len(id2label) == probs.shape[1]:
            rev = {v.lower(): int(k) for k, v in id2label.items()}
            entail_idx = rev.get("entailment", entail_idx)
            neutral_idx = rev.get("neutral", neutral_idx)
            contra_idx = rev.get("contradiction", contra_idx)
    except Exception:
        pass

    entail = probs[:, entail_idx]
    neutral = probs[:, neutral_idx] if neutral_idx is not None else 1.0 - entail
    contra  = probs[:, contra_idx]  if contra_idx  is not None else 1.0 - entail
    return entail, neutral, contra

# ---------- 3) End-to-end evaluation ----------
def eval_nli_auc(
    df_val: pd.DataFrame,
    nli_model_path_or_name: str = NLI_BACKBONE,
    batch_size: int = 64,
    use_examples: bool = True,
):
    """
    df_val must contain: body, rule, subreddit, rule_violation,
                         positive_example_1/2, negative_example_1/2 (if use_examples=True)
    Returns: dict of AUCs for several combination rules, and the chosen 'p_violate' vector.
    """
    # Premises and hypotheses
    prem = df_val["body"].astype(str).tolist()
    if use_examples:
        h1 = [hyp_with_examples(r, s, pe1,   pe2,   ne1,   ne2,   negate=False)
              for r,s,pe1,pe2,ne1,ne2 in zip(
                  df_val["rule"], df_val["subreddit"],
                  df_val.get("positive_example_1",""), df_val.get("positive_example_2",""),
                  df_val.get("negative_example_1",""), df_val.get("negative_example_2",""),
              )]
        h2 = [hyp_with_examples(r, s, pe1,   pe2,   ne1,   ne2,   negate=True)
              for r,s,pe1,pe2,ne1,ne2 in zip(
                  df_val["rule"], df_val["subreddit"],
                  df_val.get("positive_example_1",""), df_val.get("positive_example_2",""),
                  df_val.get("negative_example_1",""), df_val.get("negative_example_2",""),
              )]
    else:
        h1 = [hyp_with_examples(r, s, negate=False) for r,s in zip(df_val["rule"], df_val["subreddit"])]
        h2 = [hyp_with_examples(r, s, negate=True)  for r,s in zip(df_val["rule"], df_val["subreddit"])]

    # Load model (use a local folder path here for offline Kaggle)
    nli = CrossEncoder(nli_model_path_or_name)

    # Score both templates
    e1, n1, c1 = score_pairs(nli, prem, h1, batch_size=batch_size)  # 'violates'
    e2, n2, c2 = score_pairs(nli, prem, h2, batch_size=batch_size)  # 'does NOT violate'

    y = df_val["rule_violation"].astype(int).to_numpy()

    # Combination rules → a single "violation" score in [0,1]
    p_entail_only   = e1
    p_two_template  = 0.5 * e1 + 0.5 * (1.0 - e2)           # recommended simple combiner
    p_sym_contrast  = 0.5 * (e1 + c2)                       # uses contradiction of "not violate"
    p_diff          = (e1 - e2 + 1.0) / 2.0                 # map [-1,1] → [0,1]

    # Compute metrics
    def aucs(p):
        return {
            "roc_auc": roc_auc_score(y, p),
            "pr_auc":  average_precision_score(y, p),
        }

    results = {
        "entail_only":  aucs(p_entail_only),
        "two_template": aucs(p_two_template),
        "sym_contrast": aucs(p_sym_contrast),
        "diff_scaled":  aucs(p_diff),
    }

    # Pick the best combiner by ROC AUC
    best_name = max(results, key=lambda k: results[k]["roc_auc"])
    best_p = {"entail_only": p_entail_only, "two_template": p_two_template,
              "sym_contrast": p_sym_contrast, "diff_scaled": p_diff}[best_name]

    print("NLI zero-shot AUCs:")
    for k, v in results.items():
        print(f"  {k:>12}  ROC {v['roc_auc']:.4f} | PR {v['pr_auc']:.4f}")
    print(f"→ Using '{best_name}' as the violation score.")

    return {"results": results, "p_violate": best_p, "best_name": best_name}

# ----------------- Usage -----------------
# df_val = <your validation dataframe with the required columns>
res = eval_nli_auc(df_val, nli_model_path_or_name=NLI_BACKBONE, batch_size=64, use_examples=True)
p_violate = res["p_violate"]  # single vector you can blend or submit (on test)


Batches: 100%|██████████| 32/32 [1:03:52<00:00, 119.77s/it]
Batches: 100%|██████████| 32/32 [1:08:33<00:00, 128.55s/it]

NLI zero-shot AUCs:
   entail_only  ROC 0.4522 | PR 0.4758
  two_template  ROC 0.5156 | PR 0.5042
  sym_contrast  ROC 0.4982 | PR 0.5052
   diff_scaled  ROC 0.5156 | PR 0.5042
→ Using 'two_template' as the violation score.





In [5]:
import numpy as np
import pandas as pd
from sentence_transformers import CrossEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from scipy.special import softmax

def hyp_with_examples(rule, subreddit=None, pos1="", pos2="", neg1="", neg2="", negate=False):
    base = "does NOT violate" if negate else "violates"
    sub_txt = f" in subreddit r/{subreddit}" if isinstance(subreddit, str) and len(subreddit) else ""
    pos_bits = [str(x).strip() for x in (pos1, pos2) if isinstance(x, str) and x.strip()]
    neg_bits = [str(x).strip() for x in (neg1, neg2) if isinstance(x, str) and x.strip()]
    extras = []
    if pos_bits: extras.append("e.g. " + " | ".join(pos_bits))
    if neg_bits: extras.append("counterexamples: " + " | ".join(neg_bits))
    extra_txt = (" " + " ".join(extras)) if extras else ""
    return f"This post {base} the rule{sub_txt}: {str(rule)}.{extra_txt}".strip()

def get_label_indices(nli):
    # Map entail/neutral/contradiction indices from model config if available
    e_idx, n_idx, c_idx = 0, 1, 2
    try:
        id2label = nli.model.config.id2label
        rev = {v.lower(): int(k) for k, v in id2label.items()}
        e_idx = rev.get("entailment", e_idx)
        n_idx = rev.get("neutral", n_idx)
        c_idx = rev.get("contradiction", c_idx)
    except Exception:
        pass
    return e_idx, n_idx, c_idx

def nli_logits(nli: CrossEncoder, pairs, batch_size=64):
    # CrossEncoder.predict returns raw scores (logits) by default
    return nli.predict(pairs, convert_to_numpy=True, show_progress_bar=True, batch_size=batch_size)

def build_nli_features(df: pd.DataFrame, nli: CrossEncoder, batch_size=64, use_examples=True):
    """Return X (features) and the label y if present."""
    prem = df["body"].astype(str).tolist()
    if use_examples:
        h1 = [hyp_with_examples(r,s,pe1,pe2,ne1,ne2,negate=False) for r,s,pe1,pe2,ne1,ne2 in
              zip(df["rule"], df["subreddit"],
                  df.get("positive_example_1",""), df.get("positive_example_2",""),
                  df.get("negative_example_1",""), df.get("negative_example_2",""))]
        h2 = [hyp_with_examples(r,s,pe1,pe2,ne1,ne2,negate=True) for r,s,pe1,pe2,ne1,ne2 in
              zip(df["rule"], df["subreddit"],
                  df.get("positive_example_1",""), df.get("positive_example_2",""),
                  df.get("negative_example_1",""), df.get("negative_example_2",""))]
    else:
        h1 = [hyp_with_examples(r, s, negate=False) for r,s in zip(df["rule"], df["subreddit"])]
        h2 = [hyp_with_examples(r, s, negate=True ) for r,s in zip(df["rule"], df["subreddit"])]

    L1 = nli_logits(nli, list(zip(prem, h1)), batch_size=batch_size)  # [N, C]
    L2 = nli_logits(nli, list(zip(prem, h2)), batch_size=batch_size)  # [N, C]
    e_idx, n_idx, c_idx = get_label_indices(nli)

    # Extract per-template logits
    e1, n1, c1 = L1[:, e_idx], L1[:, n_idx], L1[:, c_idx]
    e2, n2, c2 = L2[:, e_idx], L2[:, n_idx], L2[:, c_idx]

    # Simple, effective feature set (logit domain)
    X = np.column_stack([
        e1, n1, c1, e2, n2, c2,
        e1 - e2,               # entailment difference
        c2 - c1,               # contradiction supports "violate"
        (e1 + c2),             # symmetric support
        (e1 - c1) + (c2 - e2), # margin
    ])
    y = df["rule_violation"].astype(int).to_numpy() if "rule_violation" in df.columns else None
    return X, y


In [6]:
# Pick an NLI model (download once online; then save for offline)
# For English: "cross-encoder/nli-deberta-v3-base"
# For multilingual: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
NLI_PATH = NLI_BACKBONE  # or a local folder path for offline use

# 2a) Fit a tiny LR on your validation split
from sentence_transformers import CrossEncoder
nli = CrossEncoder(NLI_PATH)

X_val, y_val = build_nli_features(df_val, nli, batch_size=64, use_examples=True)
lr = LogisticRegression(max_iter=1000)
lr.fit(X_val, y_val)

# Check AUCs
val_scores = lr.predict_proba(X_val)[:, 1]
print("VAL  ROC AUC:", roc_auc_score(y_val, val_scores))
print("VAL  PR  AUC:", average_precision_score(y_val, val_scores))

# 2b) Apply to TEST (no labels)
X_test, _ = build_nli_features(df_test, nli, batch_size=64, use_examples=True)
test_scores = lr.predict_proba(X_test)[:, 1]

sub = pd.DataFrame({"row_id": df_test["row_id"], "rule_violation": test_scores})
sub.to_csv("submission.csv", index=False)
print("submission.csv written:", sub.shape)


Batches: 100%|██████████| 32/32 [55:30<00:00, 104.07s/it]
Batches: 100%|██████████| 32/32 [1:08:17<00:00, 128.05s/it]


VAL  ROC AUC: 0.5855454847619584
VAL  PR  AUC: 0.5858657099022349


Batches: 100%|██████████| 1/1 [00:15<00:00, 15.65s/it]
Batches: 100%|██████████| 1/1 [00:16<00:00, 16.09s/it]

submission.csv written: (10, 2)



