#### Importing libraries

In [88]:
import os, json, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
from pathlib import Path

#### Defining config constants

In [89]:
RANDOM_STATE = 42 # Sets a fixed random seed so results are reproducible
N_SPLITS = 5 # Number of folds for K-Fold Cross Validation
MAX_FEATURES = 100_000 # Maximum number of features (words or tokens) the TF-IDF vectorizer will keep
LR_C = 4.0 # A hyperparameter for Logistic Regression that controls regularization strength.

#### Loading data

In [90]:
data = Path("../data")
train_path = data/"train.csv"
test_path = data/"test.csv"
sample_submission_path = data/"sample_submission.csv"

In [91]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)

In [92]:
train.shape

(2029, 9)

#### Checking columns in ***train*** dataset

In [93]:
train.columns

Index(['row_id', 'body', 'rule', 'subreddit', 'positive_example_1',
       'positive_example_2', 'negative_example_1', 'negative_example_2',
       'rule_violation'],
      dtype='object')

#### Build modeling text

In [94]:
def build_text(df):
    return (df["body"].astype(str) + " [RULE] " + df["rule"].astype(str)).str.strip()

train["comment_text"] = build_text(train)
test["comment_text"] = build_text(test)

#### Setting ***rule_violation*** as target label

<ul>
    <li>Ensuring the data type of target variable is integer</li>
    <li>Extracts the column as a NumPy array</li>
</ul>

In [95]:
target = train["rule_violation"].astype(int).values

#### Checking the comment_text and rule_violation in train dataset

In [96]:
train[["comment_text", "rule_violation"]].head()

Unnamed: 0,comment_text,rule_violation
0,Banks don't want you to know this! Click here ...,0
1,SD Stream [ ENG Link 1] (http://www.sportsstre...,0
2,Lol. Try appealing the ban and say you won't d...,1
3,she will come your home open her legs with an...,1
4,code free tyrande --->>> [Imgur](http://i.imgu...,1


#### TF-IDF with 1-2 grams

In [97]:
vectorizer = TfidfVectorizer(
    ngram_range = (1,2),
    max_features = MAX_FEATURES,
    min_df = 2,
    strip_accents = "unicode",
    lowercase = True,
    sublinear_tf = True

)

#### fit the vectorizer on train, transform test

In [98]:
# fit_transform --> Learn vocabulary + transform
# fit looks at all the training text and learns the vocabulary & statistics it needs.
# transform → Converts each training text into its TF-IDF vector using that learned vocabulary.
X = vectorizer.fit_transform(train["comment_text"].values)


In [99]:
# transform --> Reuse learned vocabulary; no leakage
# Use the vocabulary learned from training to convert test comments into vectors.
X_test = vectorizer.transform(test["comment_text"].values)
feature_names = np.array(vectorizer.get_feature_names_out()) 

In [100]:
X.shape

(2029, 10252)

In [101]:
X_test.shape

(10, 10252)

#### K-Fold Split

<ul>
    <li>Create a K-fold splitter with the preservation of class ratio in rule_violations</li>
    <li>n_splits define number of folds</li>
</ul>

In [102]:
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

In [103]:

# Pre-allocate an array for "out-of-fold" predictions (OOF) for every train row.
# We'll fill the validation predictions for each fold at the appropriate indices.
oof = np.zeros(len(train), dtype=float)

In [104]:
# Lists to collect per-fold AUCs and ROC curves for later reporting/plotting
fold_aucs, roc_curves = [], []

In [105]:
# Loop over each fold - skf.split(X,y) yields (train_indices, valid_indices) for that fold.

for fold, (tr_idx, va_idx) in enumerate(skf.split(X,target), 1):
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = target[tr_idx], target[va_idx]

    # Define a simple logistic Regression classifier for sparse data

    clf = LogisticRegression(
        solver="liblinear",
        C=LR_C,
        max_iter=200,
        random_state = RANDOM_STATE
    )

    # Fit the model on this fold's training split
    clf.fit(X_tr, y_tr)

    # Get predicted probabilities for the positive class on the validation split
    va_prob = clf.predict_proba(X_va)[:, 1]

    # Store the validation predictions into the OOF array at the proper indices
    # After the loop, oof contains a prediction for every training row 
    oof[va_idx] = va_prob

    # Compute ROC-AUC for this fold's validation set 
    auc = roc_auc_score(y_va, va_prob)

    # Also compute the ROC curve points (FPR, TPR) to visualize later if desired
    fpr, tpr, _ = roc_curve(y_va, va_prob)

    # Keep the fold’s AUC and ROC points for reporting/plots
    fold_aucs.append(float(auc))
    roc_curves.append({"fold": fold, "fpr": fpr.tolist(), "tpr": tpr.tolist(), "auc": float(auc)})
    
    # Print a quick fold-level score for visibility during training
    print(f"[Fold {fold}] AUC = {auc:.4f}")

[Fold 1] AUC = 0.8381
[Fold 2] AUC = 0.7757
[Fold 3] AUC = 0.8294
[Fold 4] AUC = 0.8476
[Fold 5] AUC = 0.7912


In [106]:
cv_mean, cv_std = float(np.mean(fold_aucs)), float(np.std(fold_aucs))
oof_auc = float(roc_auc_score(target, oof))
print(f"\nCV mean AUC = {cv_mean:.3f} ± {cv_std:.3f}")
print(f"OOF AUC     = {oof_auc:.3f}")



CV mean AUC = 0.816 ± 0.028
OOF AUC     = 0.815


#### Train on full train & predict test

In [107]:
clf_full = LogisticRegression(
    solver="liblinear",
    C=LR_C,
    max_iter=200,
    random_state=RANDOM_STATE
)
clf_full.fit(X, target)

test_prob = clf_full.predict_proba(X_test)[:, 1]
test_prob[:5], test_prob.min(), test_prob.max()


(array([0.25230146, 0.49921108, 0.72459443, 0.75605797, 0.82902639]),
 np.float64(0.084185754265707),
 np.float64(0.8422189769754849))

In [108]:
# Create submission DataFrame

submission = pd.DataFrame({
    "row_id": test["row_id"],        # use row_id from test dataset
    "rule_violation": test_prob      # predictions from model
})

In [110]:
submission.to_csv("submission.csv", index=False)