#### Importing libraries

In [2]:
import re
import numpy as np
import pandas as pd
from scipy.sparse import hstack

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [4]:
from xgboost import XGBClassifier

#### Loading Data

In [5]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

#### Remove white spaces

In [6]:
def clean(text):
    cl_text = text.astype(str).str.strip()
    return cl_text.apply(lambda x: re.sub(r"\s+", " ", x))


In [7]:
# Concatenate comment + rule so features can “see” both sides
train_text = clean(train["body"]) + " [RULE] " + clean(train["rule"])
test_text  = clean(test["body"])  + " [RULE] " + clean(test["rule"])

In [8]:
y = train["rule_violation"].astype(int).values

#### Defining TF-IDF vectorization hyperparameters

In [9]:
WORD_NGRAMS = (1,2) # Use unigrams (single words) and bigrams
MIN_DF = 2 # Ignore terms that occur in fewer than 2 documents
MAX_DF = 0.97 # Ignore terms that appear in more than 97% of documents.
MAXF_WORD = 150_000 # Cap the vocabulary size for word-based TF-IDF features to 150k most
K_SELECT = 200_000 # After combining word, keep only the top 200k most discriminative features using χ² (Chi-squared) selection.
C_GRID = [1.0, 3.0, 6.0]  # Regularization strengths for Logistic Regression
N_SPLITS = 5 # Number of folds for cross-validation.
SEED = 42

#### Defining XGBOOST parameters

In [10]:
# XGBoost params kept simple & robust for sparse TF‑IDF
XGB_PARAMS = dict(
    n_estimators=400,            # enough trees; early stopping will cut this if needed
    learning_rate=0.08,          # conservative LR
    max_depth=6,                 # moderate depth
    subsample=0.8,               # regularization
    colsample_bytree=0.8,        # regularization
    reg_lambda=1.0,              # L2
    eval_metric="auc",           # AUC across folds
    tree_method="hist",          # fast CPU histogram algorithm
    random_state=SEED,
    n_jobs=-1
)


#### Defining 2 feature variants to compare under CV protocol

In [11]:
variants = {
    "A_word_only_noSW": {"stopwords": None},
    "B_word_only_SW"  : {"stopwords": "english"},
}

In [12]:
def build_vectors(stopwords):

    # Word TF‑IDF
    word_vec = TfidfVectorizer(
        analyzer="word", 
        ngram_range=WORD_NGRAMS,
        min_df=MIN_DF, 
        max_df=MAX_DF, 
        max_features=MAXF_WORD,
        lowercase=True, 
        strip_accents="unicode", 
        sublinear_tf=True,
        stop_words=stopwords
    )

    Xw_tr = word_vec.fit_transform(train_text)
    Xw_te = word_vec.transform(test_text)

    # Stack channels horizontally → [word]
    X_tr = hstack([Xw_tr], format="csr")
    X_te = hstack([Xw_te], format="csr")
    return X_tr, X_te

def select_top_k(X_tr, X_te):
    """χ²: keep top‑K label‑associated features (works with non‑negative TF‑IDF)."""
    k_eff = min(K_SELECT, X_tr.shape[1])         # don’t request more than exist
    selector = SelectKBest(chi2, k=k_eff)        # fit on TRAIN only
    Xtr_sel = selector.fit_transform(X_tr, y)    # select best columns on train
    Xte_sel = selector.transform(X_te)           # apply same mask to test
    return Xtr_sel, Xte_sel

def cv_auc_xgb(X):
    """5‑fold CV for XGBClassifier with early stopping; return mean/std AUC and best iter per fold."""
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    scores, best_iters = [], []
    for tr_idx, va_idx in skf.split(X, y):
        model = XGBClassifier(**XGB_PARAMS)
        # train with early stopping on the validation fold (50 rounds patience)
        model.fit(
            X[tr_idx], y[tr_idx],
            eval_set=[(X[va_idx], y[va_idx])],
            verbose=False
        )
        p = model.predict_proba(X[va_idx])[:, 1]
        scores.append(roc_auc_score(y[va_idx], p))
        # model.best_iteration is set after early stopping; fallback to n_estimators if None
        best_iters.append(int(getattr(model, "best_iteration", XGB_PARAMS["n_estimators"])))
    return {
        "auc_mean": float(np.mean(scores)),
        "auc_std":  float(np.std(scores)),
        "best_iters": best_iters
    }

def evaluate_variant(name, stopwords):
    """Vectorize → χ² select → CV XGBoost → return results and selected features for final fit."""
    # 1) Vectorize
    X_tr, X_te = build_vectors(stopwords)

    # 2) χ² select
    Xtr_sel, Xte_sel = select_top_k(X_tr, X_te)

    # 3) CV with XGBoost
    res = cv_auc_xgb(Xtr_sel)
    print(f"[{name}] XGB mean AUC={res['auc_mean']:.4f} (±{res['auc_std']:.4f}); "
          f"best iters per fold: {res['best_iters']}")
    # keep artifacts for winner
    res.update({
        "name": name,
        "Xtr_sel": Xtr_sel,
        "Xte_sel": Xte_sel,
        "stopwords": stopwords,
    })
    return res









# ---- Evaluate all variants; pick the best by mean CV AUC ----
results = [evaluate_variant(n, v["stopwords"]) for n, v in variants.items()]
winner = max(results, key=lambda r: r["auc_mean"])
print(f"\n>>> WINNER: {winner['name']} | XGB CV mean AUC = {winner['auc_mean']:.4f} (±{winner['auc_std']:.4f})")

# ---- Train final XGBoost on FULL train (use median of best_iterations from CV) ----
best_n_estimators = int(np.median(winner["best_iters"])) if len(winner["best_iters"]) > 0 else XGB_PARAMS["n_estimators"]
final_params = {**XGB_PARAMS, "n_estimators": max(best_n_estimators, 50)}  # ensure >0 trees

final = XGBClassifier(**final_params)
final.fit(winner["Xtr_sel"], y, verbose=False)             # fit once on all train features
test_prob = final.predict_proba(winner["Xte_sel"])[:, 1]   # probability of rule_violation

# ---- Build submission: row_id + probability ----
submission = pd.DataFrame({
    "row_id": test["row_id"], 
    "rule_violation": test_prob
})
out_path = "../outputs/sub_v2_tfidf_xgb.csv"
submission.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
print(submission.head())

[A_word_only_noSW] XGB mean AUC=0.7825 (±0.0179); best iters per fold: [400, 400, 400, 400, 400]
[B_word_only_SW] XGB mean AUC=0.7937 (±0.0185); best iters per fold: [400, 400, 400, 400, 400]

>>> WINNER: B_word_only_SW | XGB CV mean AUC = 0.7937 (±0.0185)
Saved: ../outputs/sub_v2_tfidf_xgb.csv
   row_id  rule_violation
0    2029        0.142839
1    2030        0.195030
2    2031        0.839390
3    2032        0.480992
4    2033        0.787956
