In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score

trn = "C:/Users/satra/Downloads/jigsaw-agile-community-rules/train.csv"
tst = "C:/Users/satra/Downloads/jigsaw-agile-community-rules/test.csv"
df_trn = pd.read_csv(trn)
df_tst = pd.read_csv(tst)

In [2]:
trn_rows = []
for idx, row in df_trn.iterrows():
  trn_rows.append({
    'body': row['positive_example_1'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': 1
  })

  trn_rows.append({
    'body': row['positive_example_2'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': 1
  })

  trn_rows.append({
    'body': row['negative_example_1'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': 0
  })

  trn_rows.append({
    'body': row['negative_example_2'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': 0
  })

trn_df = pd.DataFrame(trn_rows)

val_rows = []
for idx, row in df_trn.iterrows():
  val_rows.append({
    'body': row['body'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': row['rule_violation']
  })

val_df = pd.DataFrame(val_rows)

tst_rows = []
for idx, row in df_tst.iterrows():
  tst_rows.append({
    'body': row['body'],
    'rule': row['rule'],
    'subreddit': row['subreddit']
  })

tst_df = pd.DataFrame(tst_rows)
print (f'Train shape: {trn_df.shape}, Val shape: {val_df.shape}, Test shape: {tst_df.shape}')

Train shape: (8116, 4), Val shape: (2029, 4), Test shape: (10, 3)


In [3]:
import re
import numpy as np

# ---------- text normalization (urls -> LINK, unify synonyms) ----------
URL_RE     = re.compile(r'(?i)\b(?:https?://|www\.)[^\s)]+')
MD_LINK_RE = re.compile(r'\[([^\]]+)\]\((?:https?://|www\.)[^\s)]+\)')


def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        s = "" if s is None else str(s)
    s = MD_LINK_RE.sub(r'\1 LINK', s)
    s = URL_RE.sub(' LINK ', s)
    s = re.sub(r'(?i)\burls?\b', ' LINK ', s)
    s = re.sub(r'(?i)\blinks?\b', ' LINK ', s)
    s = re.sub(r'(?i)\breferences?\b', ' REFERENCE ', s)
    s = re.sub(r'(?i)\bcitations?\b', ' REFERENCE ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def clean_text_col(s: pd.Series) -> pd.Series:
    return s.astype(str).fillna("").map(normalize_text)

# ---------- extra high-signal flags (link in body vs rule forbids) ----------
FORBID_RE = re.compile(
    r'(?i)\b(?:no|without|avoid|prohibit(?:ed)?)\b.*\b(?:LINK|REFERENCE|URL|CITATION|SOURCE)s?\b'
)


def extra_link_feats(df):
    body = df["body"].astype(str).fillna("").map(normalize_text)
    rule = df["rule"].astype(str).fillna("").map(normalize_text)

    has_link_body = body.str.contains(r'\bLINK\b', regex=True, na=False).astype(np.float32)
    link_count    = body.str.count(r'\bLINK\b', flags=re.I).astype(np.float32)
    has_ref_body  = body.str.contains(r'\bREFERENCE\b', regex=True, na=False).astype(np.float32)

    # note the non-capturing group and na=False
    rule_forbids  = rule.str.contains(FORBID_RE, regex=True, na=False).astype(np.float32)
    rule_mentions = rule.str.contains(r'\b(?:LINK|REFERENCE)\b', case=False, regex=True, na=False).astype(np.float32)

    violates      = (has_link_body * rule_forbids).astype(np.float32)

    return np.vstack([
        has_link_body.values,
        link_count.values,
        has_ref_body.values,
        rule_mentions.values,
        rule_forbids.values,
        violates.values,
    ]).T


# ---------- make train examples from original rows (no leakage) ----------
def make_examples(df_rows, include_orig=False, orig_weight=3.0):
    rows, weights = [], []
    for _, r in df_rows.iterrows():
        # curated
        rows += [
          {"body": r["positive_example_1"], "rule": r["rule"], "subreddit": r["subreddit"], "label": 1},
          {"body": r["positive_example_2"], "rule": r["rule"], "subreddit": r["subreddit"], "label": 1},
          {"body": r["negative_example_1"], "rule": r["rule"], "subreddit": r["subreddit"], "label": 0},
          {"body": r["negative_example_2"], "rule": r["rule"], "subreddit": r["subreddit"], "label": 0},
        ]
        weights += [1.0, 1.0, 1.0, 1.0]
        # original body from the same row
        if include_orig:
            rows.append({"body": r["body"], "rule": r["rule"], "subreddit": r["subreddit"],
                         "label": int(r["rule_violation"])})
            weights.append(orig_weight)  # upweight to counter the 4 curated per row
    df = pd.DataFrame(rows)
    return df, np.asarray(weights, dtype=np.float32)

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

pipe_body = Pipeline([
    ("tfidf", TfidfVectorizer(
        preprocessor=normalize_text,
        ngram_range=(1,2), min_df=3, max_features=200_000,
        lowercase=True, sublinear_tf=True, norm=None  # try norm=None for trees
    )),
    ("svd", TruncatedSVD(n_components=300, random_state=42, n_iter=7)),
])

# RULE: union(word, char) -> SVD (boosts vocab so k can be >= 64)
rule_union = FeatureUnion([
    ("word", TfidfVectorizer(
        preprocessor=normalize_text,
        ngram_range=(1,2), min_df=1, lowercase=True, sublinear_tf=True,
        token_pattern=r"(?u)\b\w+\b", norm=None
    )),
    ("char", TfidfVectorizer(
        preprocessor=normalize_text,
        analyzer="char_wb", ngram_range=(3,6), min_df=1, sublinear_tf=True, norm=None
    )),
    # transformer_weights={"word":1.0, "char":0.8},  # optional downweight chars
])

pipe_rule = Pipeline([
    ("union", rule_union),
    ("svd", TruncatedSVD(n_components=128, random_state=42, n_iter=7)),
])

# pipe_sred = Pipeline([
#     # Subreddit names are short; unigrams usually suffice
#     ("tfidf", TfidfVectorizer(lowercase=False)),
#     ("svd",   TruncatedSVD(n_components=16, random_state=42, n_iter=7)),
#     # ("norm",  Normalizer(copy=False))
# ])

enc_sred = OneHotEncoder(handle_unknown="ignore", sparse_output=False)


In [5]:
# ==== RECC helpers: shared LSA + per-rule prototype bank ====
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

def fit_shared_lsa(corpus_texts, n_components=256, random_state=42):
    vec = TfidfVectorizer(
        preprocessor=normalize_text,
        ngram_range=(1,2), min_df=2,
        sublinear_tf=True, lowercase=False, norm=None
    )
    X = vec.fit_transform(corpus_texts)
    svd = TruncatedSVD(n_components=n_components, random_state=random_state, n_iter=7)
    Z = svd.fit_transform(X)
    norm = Normalizer(copy=False)  # L2 so dot ~ cosine
    norm.fit(Z)
    return vec, svd, norm

def lsa_embed(texts, vec, svd, norm):
    X = vec.transform(texts)
    Z = svd.transform(X)
    return norm.transform(Z)

def build_proto_bank(tr_rows):
    """
    Build per-rule positive/negative 'prototype' texts from the TRAIN fold only.
    Use normalized rule text as the key so matching is robust.
    """
    bank = {}
    for _, r in tr_rows.iterrows():
        key = normalize_text(str(r["rule"]))
        b = bank.setdefault(key, {"pos": [], "neg": [], "rule": str(r["rule"])})
        # collect examples if present
        for c in ("positive_example_1","positive_example_2"):
            if c in r and isinstance(r[c], str) and r[c].strip():
                b["pos"].append(r[c])
        for c in ("negative_example_1","negative_example_2"):
            if c in r and isinstance(r[c], str) and r[c].strip():
                b["neg"].append(r[c])
    return bank

def preembed_bank(bank, vec, svd, norm):
    """
    Pre-embed all rule texts and their prototypes once.
    """
    bank_emb = {}
    for key, d in bank.items():
        Er = lsa_embed([d["rule"]], vec, svd, norm)              # [1, d]
        Epos = lsa_embed(d["pos"], vec, svd, norm) if d["pos"] else np.zeros((0, Er.shape[1]))
        Eneg = lsa_embed(d["neg"], vec, svd, norm) if d["neg"] else np.zeros((0, Er.shape[1]))
        bank_emb[key] = {"Er": Er, "Epos": Epos, "Eneg": Eneg}
    return bank_emb

def recc_block(df, bank_emb, vec, svd, norm, topk=2):
    """
    Compute RECC features for each row using the bank from TRAIN fold.
    For rows whose rule isn't in the bank, we fallback to empty Epos/Eneg and embed rule on the fly.
    """

    # work on a copy with 0..N-1 index to keep positional alignment
    df_ = df.reset_index(drop=True)

    bodies_norm = [normalize_text(x) for x in df_["body"].astype(str).fillna("")]
    Eb_all = lsa_embed(bodies_norm, vec, svd, norm)  # shape [N, d]

    rule_cache = {}
    feats = []

    for j, row in df_.iterrows():        # j is 0..N-1
        Eb = Eb_all[j:j+1]               # [1, d]
        key = normalize_text(str(row["rule"]))

        if key in bank_emb:
            Er   = bank_emb[key]["Er"]   # [1, d]
            Epos = bank_emb[key]["Epos"] # [P, d]
            Eneg = bank_emb[key]["Eneg"] # [N, d]
        else:
            if key not in rule_cache:
                rule_cache[key] = lsa_embed([str(row["rule"])], vec, svd, norm)
            Er = rule_cache[key]         # [1, d]
            Epos = np.zeros((0, Er.shape[1]))
            Eneg = np.zeros((0, Er.shape[1]))

        # cosines
        s_br = float((Eb @ Er.T)[0, 0])

        if Epos.shape[0]:
            sims = (Eb @ Epos.T).ravel()
            s_bp_pos_max  = float(sims.max())
            k = min(topk, len(sims))
            s_bp_pos_topk = float(np.mean(np.sort(sims)[-k:]))
        else:
            s_bp_pos_max = s_bp_pos_topk = -1.0

        s_bp_neg_max = float((Eb @ Eneg.T).max()) if Eneg.shape[0] else -1.0
        margin_pos_minus_neg = s_bp_pos_max - max(s_bp_neg_max, -1.0)

        feats.append([s_br, s_bp_pos_max, s_bp_pos_topk, s_bp_neg_max, margin_pos_minus_neg])

    cols = [
        "recc_cos_body_rule",
        "recc_cos_body_pos_max",
        "recc_cos_body_pos_topk",
        "recc_cos_body_neg_max",
        "recc_margin_pos_minus_neg",
    ]

    return pd.DataFrame(feats, columns=cols, index=df.index)  # preserve original index


In [6]:
from sklearn.model_selection import StratifiedGroupKFold
import numpy as np
import lightgbm as lgb

groups = pd.factorize(df_trn["subreddit"].astype(str).str.lower().str.strip())[0]
cv = StratifiedGroupKFold(n_splits=min(5, len(np.unique(groups))), shuffle=True, random_state=42)

oof = np.zeros(len(df_trn), dtype=float)
fold_test_preds = []
fold_aucs = []

# for fold, (tr_idx, va_idx) in enumerate(gkf.split(df_trn, df_trn["rule_violation"], groups=df_trn["rule"])):
fold = 0
for tr_idx, va_idx in cv.split(df_trn, df_trn["rule_violation"], groups):
    fold += 1
    tr_rows, va_rows = df_trn.iloc[tr_idx], df_trn.iloc[va_idx]
    trn_df, w_trn = make_examples(tr_rows, include_orig=True, orig_weight=3.0)

    trn_df, w_trn = make_examples(tr_rows, include_orig=True, orig_weight=3.0)
    val_df = va_rows[["body","rule","subreddit","rule_violation"]].rename(columns={"rule_violation":"label"})
    tst_df = df_tst[["body","rule","subreddit"]].copy()

    # Fit per-field on TRAIN only
    Zb_trn = pipe_body.fit_transform(trn_df["body"])
    Zb_val = pipe_body.transform(val_df["body"])
    Zb_tst = pipe_body.transform(tst_df["body"])

    Zr_trn = pipe_rule.fit_transform(trn_df["rule"])
    Zr_val = pipe_rule.transform(val_df["rule"])
    Zr_tst = pipe_rule.transform(tst_df["rule"])

    Zs_trn = enc_sred.fit_transform(trn_df[["subreddit"]])
    Zs_val = enc_sred.transform(val_df[["subreddit"]])
    Zs_tst = enc_sred.transform(tst_df[["subreddit"]])

    # Optional: upweight RULE block slightly
    rule_w = 1.3
    Zr_trn *= rule_w; Zr_val *= rule_w; Zr_tst *= rule_w

    # Extra numeric flags
    F_trn = extra_link_feats(trn_df)
    F_val = extra_link_feats(val_df)
    F_tst = extra_link_feats(tst_df)

    # ... inside your CV loop, after you’ve built tr_rows, va_rows, trn_df/val_df/tst_df ...

    # --- Build prototype bank from TRAIN-FOLD ONLY (original rows) ---
    proto_bank = build_proto_bank(tr_rows)

    # --- Fit shared LSA on TRAIN-FOLD ONLY ---
    # Use a rich but leakage-safe corpus: train rules + all train prototypes + (optionally) train bodies
    lsa_corpus = []
    lsa_corpus += [normalize_text(x) for x in tr_rows["rule"].astype(str)]
    for d in proto_bank.values():
        lsa_corpus += [normalize_text(x) for x in d["pos"]]
        lsa_corpus += [normalize_text(x) for x in d["neg"]]
    # (optional) also include the train bodies
    lsa_corpus += [normalize_text(x) for x in tr_rows["body"].astype(str)]

    vec_lsa, svd_lsa, norm_lsa = fit_shared_lsa(lsa_corpus, n_components=256, random_state=42)

    # --- Pre-embed the bank once ---
    bank_emb = preembed_bank(proto_bank, vec_lsa, svd_lsa, norm_lsa)

    # --- RECC blocks (dense) for this fold ---
    R_trn = recc_block(trn_df, bank_emb, vec_lsa, svd_lsa, norm_lsa, topk=2)
    R_val = recc_block(val_df, bank_emb, vec_lsa, svd_lsa, norm_lsa, topk=2)

    # Test: you can only use what the bank knows. If a test rule isn't in the bank,
    # it falls back to empty prototypes (features become conservative).
    R_tst = recc_block(tst_df, bank_emb, vec_lsa, svd_lsa, norm_lsa, topk=2)

    # --- Concatenate with your existing blocks ---
    X_trn = np.hstack([Zb_trn, Zr_trn, Zs_trn, F_trn, R_trn.values])
    X_val = np.hstack([Zb_val, Zr_val, Zs_val, F_val, R_val.values])
    X_tst = np.hstack([Zb_tst, Zr_tst, Zs_tst, F_tst, R_tst.values])

    # # Concatenate blocks (no global L2)
    # X_trn = np.hstack([Zb_trn, Zr_trn, Zs_trn, F_trn])
    # X_val = np.hstack([Zb_val, Zr_val, Zs_val, F_val])
    # X_tst = np.hstack([Zb_tst, Zr_tst, Zs_tst, F_tst])

    y_trn = trn_df["label"].values.astype(int)
    y_val = val_df["label"].values.astype(int)

    # LightGBM (slightly regularized for dense SVD features)
    params = {
        "objective": "binary",
        "metric": "auc",
        "random_state": 42,
        "n_estimators": 4096,
        "learning_rate": 0.012,
        "num_leaves": 31,
        "min_child_samples": 40,
        "lambda_l2": 2.0,
        "feature_fraction": 0.85,
        "bagging_fraction": 0.9,
        "bagging_freq": 1,
        "force_col_wise": True,
        "verbosity": -1,
    }

    dtrn = lgb.Dataset(X_trn, label=trn_df["label"].values, weight=w_trn)
    dval = lgb.Dataset(X_val, label=val_df["label"].values)
    model = lgb.train(params, dtrn, num_boost_round=8192, valid_sets=[dtrn, dval], valid_names=["train", "val"],
                        callbacks=[lgb.early_stopping(stopping_rounds=256), lgb.log_evaluation(64)]
    )

    # Store OOF preds mapped back to original rows (val fold has one row per original)
    preds_val = model.predict(X_val, num_iteration=model.best_iteration)
    oof[va_idx] = preds_val
    fold_auc = roc_auc_score(y_val, preds_val)
    fold_aucs.append(fold_auc)
    print(f"[fold {fold}] AUC: {fold_auc:.6f}  | best_iter: {model.best_iteration}")

    # Test predictions for this fold
    fold_test_preds.append(model.predict(X_tst, num_iteration=model.best_iteration, raw_score=True))

# Overall CV
cv_auc = roc_auc_score(df_trn["rule_violation"].values.astype(int), oof)
print(f"CV AUC (10-fold): {cv_auc:.6f} | per-fold: {[round(a,6) for a in fold_aucs]}")

# Ensemble test preds
pred_test = np.mean(fold_test_preds, axis=0)
preds = 1.0 / (1.0 + np.exp(-pred_test))
sub_df = pd.DataFrame({"row_id": df_tst["row_id"], "rule_violation": preds})
sub_df.to_csv("submission.csv", index=False)
print("submission.csv written; shape:", sub_df.shape)
print(sub_df.head(10))

Training until validation scores don't improve for 256 rounds
[64]	train's auc: 0.995169	val's auc: 0.904088
[128]	train's auc: 0.997599	val's auc: 0.905112
[192]	train's auc: 0.998438	val's auc: 0.902739
[256]	train's auc: 0.998718	val's auc: 0.902142
[320]	train's auc: 0.998905	val's auc: 0.902502
Early stopping, best iteration is:
[127]	train's auc: 0.99759	val's auc: 0.905174
[fold 1] AUC: 0.905174  | best_iter: 127
Training until validation scores don't improve for 256 rounds
[64]	train's auc: 0.995854	val's auc: 0.954596
[128]	train's auc: 0.99692	val's auc: 0.954188
[192]	train's auc: 0.99777	val's auc: 0.953935
[256]	train's auc: 0.998409	val's auc: 0.953799
Early stopping, best iteration is:
[13]	train's auc: 0.993948	val's auc: 0.956354
[fold 2] AUC: 0.956354  | best_iter: 13
Training until validation scores don't improve for 256 rounds
[64]	train's auc: 0.995971	val's auc: 0.863557
[128]	train's auc: 0.99724	val's auc: 0.862852
[192]	train's auc: 0.99812	val's auc: 0.857895
