In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score

trn = "C:/Users/satra/Downloads/jigsaw-agile-community-rules/train.csv"
tst = "C:/Users/satra/Downloads/jigsaw-agile-community-rules/test.csv"
df_trn = pd.read_csv(trn)
df_tst = pd.read_csv(tst)

In [2]:
trn_rows = []
for idx, row in df_trn.iterrows():
  trn_rows.append({
    'body': row['positive_example_1'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': 1
  })

  trn_rows.append({
    'body': row['positive_example_2'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': 1
  })

  trn_rows.append({
    'body': row['negative_example_1'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': 0
  })

  trn_rows.append({
    'body': row['negative_example_2'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': 0
  })

trn_df = pd.DataFrame(trn_rows)

val_rows = []
for idx, row in df_trn.iterrows():
  val_rows.append({
    'body': row['body'],
    'rule': row['rule'],
    'subreddit': row['subreddit'],
    'label': row['rule_violation']
  })

val_df = pd.DataFrame(val_rows)

tst_rows = []
for idx, row in df_tst.iterrows():
  tst_rows.append({
    'body': row['body'],
    'rule': row['rule'],
    'subreddit': row['subreddit']
  })

tst_df = pd.DataFrame(tst_rows)
print (f'Train shape: {trn_df.shape}, Val shape: {val_df.shape}, Test shape: {tst_df.shape}')

Train shape: (8116, 4), Val shape: (2029, 4), Test shape: (10, 3)


In [3]:
import re
import numpy as np

# ---------- text normalization (urls -> LINK, unify synonyms) ----------
URL_RE     = re.compile(r'(?i)\b(?:https?://|www\.)[^\s)]+')
MD_LINK_RE = re.compile(r'\[([^\]]+)\]\((?:https?://|www\.)[^\s)]+\)')


def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        s = "" if s is None else str(s)
    s = MD_LINK_RE.sub(r'\1 LINK', s)
    s = URL_RE.sub(' LINK ', s)
    s = re.sub(r'(?i)\burls?\b', ' LINK ', s)
    s = re.sub(r'(?i)\blinks?\b', ' LINK ', s)
    s = re.sub(r'(?i)\breferences?\b', ' REFERENCE ', s)
    s = re.sub(r'(?i)\bcitations?\b', ' REFERENCE ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def clean_text_col(s: pd.Series) -> pd.Series:
    return s.astype(str).fillna("").map(normalize_text)

# ---------- extra high-signal flags (link in body vs rule forbids) ----------
FORBID_RE = re.compile(
    r'(?i)\b(?:no|without|avoid|prohibit(?:ed)?)\b.*\b(?:LINK|REFERENCE|URL|CITATION|SOURCE)s?\b'
)


def extra_link_feats(df):
    body = df["body"].astype(str).fillna("").map(normalize_text)
    rule = df["rule"].astype(str).fillna("").map(normalize_text)

    has_link_body = body.str.contains(r'\bLINK\b', regex=True, na=False).astype(np.float32)
    link_count    = body.str.count(r'\bLINK\b', flags=re.I).astype(np.float32)
    has_ref_body  = body.str.contains(r'\bREFERENCE\b', regex=True, na=False).astype(np.float32)

    # note the non-capturing group and na=False
    rule_forbids  = rule.str.contains(FORBID_RE, regex=True, na=False).astype(np.float32)
    rule_mentions = rule.str.contains(r'\b(?:LINK|REFERENCE)\b', case=False, regex=True, na=False).astype(np.float32)

    violates      = (has_link_body * rule_forbids).astype(np.float32)

    return np.vstack([
        has_link_body.values,
        link_count.values,
        has_ref_body.values,
        rule_mentions.values,
        rule_forbids.values,
        violates.values,
    ]).T


# ---------- make train examples from original rows (no leakage) ----------
def make_examples(df_rows: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for _, r in df_rows.iterrows():
        rows += [
            {"body": r["positive_example_1"], "rule": r["rule"], "subreddit": r["subreddit"], "label": 1},
            {"body": r["positive_example_2"], "rule": r["rule"], "subreddit": r["subreddit"], "label": 1},
            {"body": r["negative_example_1"], "rule": r["rule"], "subreddit": r["subreddit"], "label": 0},
            {"body": r["negative_example_2"], "rule": r["rule"], "subreddit": r["subreddit"], "label": 0},
        ]
    return pd.DataFrame(rows)


In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

pipe_body = Pipeline([
    ("tfidf", TfidfVectorizer(
        preprocessor=normalize_text,
        ngram_range=(1,2), min_df=3, max_features=200_000,
        lowercase=True, sublinear_tf=True, norm=None  # try norm=None for trees
    )),
    ("svd", TruncatedSVD(n_components=300, random_state=42, n_iter=7)),
])

# RULE: union(word, char) -> SVD (boosts vocab so k can be >= 64)
rule_union = FeatureUnion([
    ("word", TfidfVectorizer(
        preprocessor=normalize_text,
        ngram_range=(1,2), min_df=1, lowercase=True, sublinear_tf=True,
        token_pattern=r"(?u)\b\w+\b", norm=None
    )),
    ("char", TfidfVectorizer(
        preprocessor=normalize_text,
        analyzer="char_wb", ngram_range=(3,6), min_df=1, sublinear_tf=True, norm=None
    )),
    # transformer_weights={"word":1.0, "char":0.8},  # optional downweight chars
])

pipe_rule = Pipeline([
    ("union", rule_union),
    ("svd", TruncatedSVD(n_components=128, random_state=42, n_iter=7)),
])

# pipe_sred = Pipeline([
#     # Subreddit names are short; unigrams usually suffice
#     ("tfidf", TfidfVectorizer(lowercase=False)),
#     ("svd",   TruncatedSVD(n_components=16, random_state=42, n_iter=7)),
#     # ("norm",  Normalizer(copy=False))
# ])

enc_sred = OneHotEncoder(handle_unknown="ignore", sparse_output=False)


In [5]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
import lightgbm as lgb

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

oof = np.zeros(len(df_trn), dtype=float)
fold_test_preds = []
fold_aucs = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(df_trn, df_trn["rule_violation"])):
    tr_rows = df_trn.iloc[tr_idx].copy()
    va_rows = df_trn.iloc[va_idx].copy()

    trn_df = make_examples(tr_rows)
    val_df = va_rows[["body","rule","subreddit","rule_violation"]].rename(columns={"rule_violation":"label"})
    tst_df = df_tst[["body","rule","subreddit"]].copy()

    # Fit per-field on TRAIN only
    Zb_trn = pipe_body.fit_transform(trn_df["body"])
    Zb_val = pipe_body.transform(val_df["body"])
    Zb_tst = pipe_body.transform(tst_df["body"])

    Zr_trn = pipe_rule.fit_transform(trn_df["rule"])
    Zr_val = pipe_rule.transform(val_df["rule"])
    Zr_tst = pipe_rule.transform(tst_df["rule"])

    Zs_trn = enc_sred.fit_transform(trn_df[["subreddit"]])
    Zs_val = enc_sred.transform(val_df[["subreddit"]])
    Zs_tst = enc_sred.transform(tst_df[["subreddit"]])

    # Optional: upweight RULE block slightly
    rule_w = 1.3
    Zr_trn *= rule_w; Zr_val *= rule_w; Zr_tst *= rule_w

    # Extra numeric flags
    F_trn = extra_link_feats(trn_df)
    F_val = extra_link_feats(val_df)
    F_tst = extra_link_feats(tst_df)

    # Concatenate blocks (no global L2)
    X_trn = np.hstack([Zb_trn, Zr_trn, Zs_trn, F_trn])
    X_val = np.hstack([Zb_val, Zr_val, Zs_val, F_val])
    X_tst = np.hstack([Zb_tst, Zr_tst, Zs_tst, F_tst])

    y_trn = trn_df["label"].values.astype(int)
    y_val = val_df["label"].values.astype(int)

    # LightGBM (slightly regularized for dense SVD features)
    params = {
        "objective": "binary",
        "metric": "auc",
        "random_state": 42,
        "n_estimators": 4096,
        "learning_rate": 0.012,
        "num_leaves": 31,
        "min_child_samples": 40,
        "lambda_l2": 2.0,
        "feature_fraction": 0.85,
        "bagging_fraction": 0.9,
        "bagging_freq": 1,
        "force_col_wise": True,
        # "verbosity": -1,
    }
    dtrn = lgb.Dataset(X_trn, label=y_trn)
    dval = lgb.Dataset(X_val, label=y_val)
    model = lgb.train(params, dtrn, num_boost_round=4096, valid_sets=[dval],
                      callbacks=[lgb.early_stopping(stopping_rounds=64)])

    # Store OOF preds mapped back to original rows (val fold has one row per original)
    preds_val = model.predict(X_val, num_iteration=model.best_iteration)
    oof[va_idx] = preds_val
    fold_auc = roc_auc_score(y_val, preds_val)
    fold_aucs.append(fold_auc)
    print(f"[fold {fold}] AUC: {fold_auc:.6f}  | best_iter: {model.best_iteration}")

    # Test predictions for this fold
    fold_test_preds.append(model.predict(X_tst, num_iteration=model.best_iteration))

# Overall CV
cv_auc = roc_auc_score(df_trn["rule_violation"].values.astype(int), oof)
print(f"CV AUC (10-fold): {cv_auc:.6f} | per-fold: {[round(a,6) for a in fold_aucs]}")

# Ensemble test preds
pred_test = np.mean(fold_test_preds, axis=0)
sub_df = pd.DataFrame({"row_id": df_tst["row_id"], "rule_violation": pred_test})
sub_df.to_csv("submission_cv_ens.csv", index=False)
print("submission_cv_ens.csv written; shape:", sub_df.shape)
print(sub_df.head(10))

[LightGBM] [Info] Number of positive: 3652, number of negative: 3652
[LightGBM] [Info] Total Bins 76793
[LightGBM] [Info] Number of data points in the train set: 7304, number of used features: 475
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 64 rounds
Early stopping, best iteration is:
[46]	valid_0's auc: 0.921262
[fold 0] AUC: 0.921262  | best_iter: 46
[LightGBM] [Info] Number of positive: 3652, number of negative: 3652
[LightGBM] [Info] Total Bins 76793
[LightGBM] [Info] Number of data points in the train set: 7304, number of used features: 472
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 64 rounds
Early stopping, best iteration is:
[166]	valid_0's auc: 0.915146
[fold 1] AUC: 0.915146  | best_iter: 166
[LightGBM] [Info] Number of positive: 3652, number of negative: 3652
[LightGBM] [Info] Total Bins 76803
[LightGBM] [