#### Import Libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

#### Import Data

In [48]:
# Directory containing the input CSV files (relative path, resolved against the working directory)
data = Path("../data")

In [49]:
# Paths of the training set, test set, and sample submission inside `data`
train_path = data/"train.csv"
test_path = data/"test.csv"
sample_submission_path = data/"sample_submission.csv"

In [50]:
# Load the training and test CSV files into DataFrames
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

#### Setting Hyperparameters

In [51]:
MODEL_NAME = "distilbert-base-uncased"  # Pretrained model from the HuggingFace Hub
MAX_LEN = 256                           # Maximum sequence length (in tokens) for the tokenizer
RANDOM_SEED = 42                        # random_state used for the train/validation split

#### Preparing Input 

In [None]:
from typing import List, Dict, Tuple

def _clean(s): # Returns a trimmed string; if the value is missing/NaN it returns ""
    if pd.isna(s):
        return ""
    return str(s).strip()

def _to_list(*vals) -> List[str]:  # Accepts any number of arguments & Returns a list of examples
    out = []
    for v in vals:
        v = _clean(v)
        if v:
            out.append(v)
    return out

# Format strings for the labelled text segments that make up one model input.
# Each template has a single named placeholder that is filled via str.format().
TEMPLATE_RULE = "RULE: {rule}"           # the rule text
TEMPLATE_POS = "POS: {examples}"         # joined positive examples
TEMPLATE_NEG = "NEG: {examples}"         # joined negative examples
TEMPLATE_COMMENT = "COMMENT: {comment}"  # the comment body

def join_examples(examples: List[str], max_examples: int) -> str:
    """Join the first ``max_examples`` entries of ``examples`` with ``" | "``.

    Only the leading ``examples[:max_examples]`` slice is considered. Entries
    that are falsy or consist solely of whitespace are skipped, and the
    remaining ones are stripped before joining.
    """
    kept = []
    for example in examples[:max_examples]:
        if not example:
            continue
        trimmed = example.strip()
        if trimmed:
            kept.append(trimmed)
    return " | ".join(kept)

def shorten_text_by_chars(txt: str, max_chars: int) -> str:
    """Shorten ``txt`` to at most ``max_chars`` characters.

    Text that already fits is returned unchanged. Otherwise the middle of the
    text is replaced by ``"..."``, keeping ``max_chars // 2`` characters from
    the start and the remaining budget from the end.

    Very small budgets are handled explicitly so the result never exceeds
    ``max_chars``:
      * ``max_chars <= 0``  -> empty string
      * ``max_chars <= 3``  -> hard cut of the head, no ellipsis
      * no room for a tail  -> head followed by ``"..."``

    Args:
        txt: Text to shorten.
        max_chars: Maximum length of the returned string.

    Returns:
        A string of length ``<= max_chars``.
    """
    if len(txt) <= max_chars:
        return txt
    if max_chars <= 0:
        return ""

    marker = "..."
    if max_chars <= len(marker):
        # Not even enough room for the ellipsis plus one character of text.
        return txt[:max_chars]

    head = max_chars // 2
    tail = max_chars - head - len(marker)
    if tail <= 0:
        # txt[-0:] would return the whole string and a negative tail would
        # slice from the wrong side, so drop the tail and keep head + marker.
        return txt[:max_chars - len(marker)] + marker
    return txt[:head] + marker + txt[-tail:]

def make_segments(rule: str,
                  pos: List[str],
                  neg: List[str],
                  comment: str,
                  pos_keep: int = 1,
                  neg_keep: int = 1,
                  extra_pos_keep: int = 1,
                  extra_neg_keep: int = 1) -> Tuple[List[str], List[str]]:
    """Build the labelled text segments for one example.

    Blank examples are discarded and the rest are shortened to 120 characters.

    Returns:
        A ``(primary, extras)`` tuple of string lists.
        ``primary`` holds, in order: the RULE segment, a POS segment with up to
        ``pos_keep`` examples (if any positives exist), a NEG segment with up
        to ``neg_keep`` examples (if any negatives exist), and the COMMENT
        segment.
        ``extras`` holds a POS and/or NEG segment built from the examples after
        the first ``pos_keep`` / ``neg_keep`` ones, limited to
        ``extra_pos_keep`` / ``extra_neg_keep`` entries.
    """
    def _compact(items):
        # Drop None/blank entries and cap every remaining example at 120 chars.
        return [
            shorten_text_by_chars(item or "", 120)
            for item in (items or [])
            if (item or "").strip()
        ]

    groups = (
        (TEMPLATE_POS, _compact(pos), pos_keep, extra_pos_keep),
        (TEMPLATE_NEG, _compact(neg), neg_keep, extra_neg_keep),
    )

    primary = [TEMPLATE_RULE.format(rule=(rule or "").strip())]
    extras = []
    for template, items, keep, extra_keep in groups:
        if items:
            primary.append(template.format(examples=join_examples(items, keep)))
        if len(items) > keep:
            extras.append(template.format(examples=join_examples(items[keep:], extra_keep)))
    primary.append(TEMPLATE_COMMENT.format(comment=(comment or "").strip()))

    return primary, extras

def encode_with_budget(tokenizer: AutoTokenizer,
                       rule: str,
                       pos: List[str],
                       neg: List[str],
                       comment: str,
                       max_length: int = 256) -> Dict[str, List[int]]:
    """Assemble one example's text under a token budget and tokenize it.

    Segments come from ``make_segments`` and are joined with newlines.

    Budgeting:
      1) Primary segments (RULE, POS, NEG, COMMENT) are tried in order; a
         segment is kept only if the text so far plus that segment tokenizes
         to at most ``max_length`` ids (special tokens included).
      2) A COMMENT that does not fit is retried with progressively smaller
         character budgets (800 down to 60); if none fits it is left out.
      3) Extra POS/NEG segments are appended while they fit; the first one
         that does not fit stops the process.

    Returns:
        The tokenizer output for the assembled text, tokenized with
        ``truncation=True`` and ``max_length=max_length``.
    """
    core_segments, optional_segments = make_segments(rule, pos, neg, comment)
    kept: List[str] = []

    def _token_count(candidate: str) -> int:
        # Length of the untruncated encoding of everything kept so far plus `candidate`.
        joined = "\n".join(kept + [candidate])
        encoded = tokenizer(joined, add_special_tokens=True, truncation=False)
        return len(encoded["input_ids"])

    comment_prefix = "COMMENT: "
    for segment in core_segments:
        if _token_count(segment) <= max_length:
            kept.append(segment)
        elif segment.startswith("COMMENT:"):
            # Only the comment is shrunk; other oversized segments are skipped.
            raw_comment = segment[len(comment_prefix):]
            for char_budget in (800, 600, 400, 300, 200, 160, 120, 100, 80, 60):
                candidate = comment_prefix + shorten_text_by_chars(raw_comment, char_budget)
                if _token_count(candidate) <= max_length:
                    kept.append(candidate)
                    break
            # If no budget fits, the comment is omitted.

    # Use any remaining room for the optional extra examples.
    for segment in optional_segments:
        if _token_count(segment) > max_length:
            break
        kept.append(segment)

    return tokenizer("\n".join(kept), add_special_tokens=True, truncation=True, max_length=max_length)

In [33]:
# Columns both the train and the test frame must provide
needed_cols = [
    "row_id",
    "body",
    "rule",
    "positive_example_1",
    "positive_example_2",
    "negative_example_1",
    "negative_example_2",
]

# Fail early, listing the absent columns in the order given above
missing = [name for name in needed_cols if name not in train_df.columns]
if len(missing) > 0:
    raise ValueError(f"Train is missing: {missing}")

missing_test = [name for name in needed_cols if name not in test_df.columns]
if len(missing_test) > 0:
    raise ValueError(f"Test is missing: {missing_test}")

# The label column is only required for training data
if not ("rule_violation" in train_df.columns):
    raise ValueError("Train set must contain 'rule_violation' as the label.")


In [34]:
# Load the tokenizer for MODEL_NAME (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# 80/20 train/validation split, always stratified on the label column and seeded with RANDOM_SEED
train_split_df, val_split_df = train_test_split(
    train_df, test_size=0.2, random_state=RANDOM_SEED, stratify=train_df["rule_violation"]
)

# Build HF Datasets from the DataFrames; reset_index(drop=True) discards the old pandas index first
hf_train = Dataset.from_pandas(train_split_df.reset_index(drop=True))
hf_val   = Dataset.from_pandas(val_split_df.reset_index(drop=True))
hf_test  = Dataset.from_pandas(test_df.reset_index(drop=True))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def map_fn_train(batch):
    """Encode one labelled row for training/validation.

    ``batch`` is read with ``.get`` as a mapping of column name to value.
    The rule, the comment body, and up to two positive and two negative
    examples are cleaned and passed to ``encode_with_budget``; the integer
    ``rule_violation`` value is stored under ``"labels"``.
    """
    positives = _to_list(
        *(batch.get(column) for column in ("positive_example_1", "positive_example_2"))
    )
    negatives = _to_list(
        *(batch.get(column) for column in ("negative_example_1", "negative_example_2"))
    )
    rule_text = _clean(batch.get("rule"))
    comment_text = _clean(batch.get("body"))

    encoded = encode_with_budget(
        tokenizer=tokenizer,
        rule=rule_text,
        pos=positives,
        neg=negatives,
        comment=comment_text,
        max_length=MAX_LEN,
    )
    encoded["labels"] = int(batch.get("rule_violation"))
    return encoded

def map_fn_test(batch):
    """Encode one test row, keeping its ``row_id``.

    The text is built exactly as for training rows. ``"row_id"`` is stored as
    an int (``-1`` when the value is missing) so predictions can be written
    back later. ``"labels"`` is added only when the row carries a non-missing
    ``rule_violation`` value.
    """
    positives = _to_list(
        *(batch.get(column) for column in ("positive_example_1", "positive_example_2"))
    )
    negatives = _to_list(
        *(batch.get(column) for column in ("negative_example_1", "negative_example_2"))
    )
    encoded = encode_with_budget(
        tokenizer=tokenizer,
        rule=_clean(batch.get("rule")),
        pos=positives,
        neg=negatives,
        comment=_clean(batch.get("body")),
        max_length=MAX_LEN,
    )

    # Missing ids fall back to the -1 sentinel.
    row_id = batch.get("row_id")
    encoded["row_id"] = int(row_id) if pd.notna(row_id) else -1

    # Labels are optional on the test set.
    has_label = "rule_violation" in batch and pd.notna(batch["rule_violation"])
    if has_label:
        encoded["labels"] = int(batch["rule_violation"])
    return encoded

# Encode every split with its mapping function. The original columns are removed,
# so each resulting dataset contains only the fields returned by the mapping function.
tokenized_train = hf_train.map(map_fn_train, remove_columns=hf_train.column_names)
tokenized_val   = hf_val.map(map_fn_train,   remove_columns=hf_val.column_names)
tokenized_test  = hf_test.map(map_fn_test,   remove_columns=hf_test.column_names)


Map:   0%|          | 0/1623 [00:00<?, ? examples/s]

Map:   0%|          | 0/406 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the argmax over the last logits axis. Precision/recall/F1 use
    ``average="binary"`` with ``zero_division=0``.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average="binary", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

from transformers import set_seed

# Seed the Python/NumPy/PyTorch RNGs *before* building the model: the classification
# head is freshly initialised inside from_pretrained, so seeding afterwards would not
# make its starting weights reproducible.
set_seed(RANDOM_SEED)

# Pretrained encoder with a new 2-class classification head
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer=` to `processing_class=` — confirm
# against the installed version before changing them.
args = TrainingArguments(
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=20,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,   # restore the checkpoint with the best validation F1
    metric_for_best_model="f1",
    seed=RANDOM_SEED,              # same seed as the data split, for reproducible training
    report_to=[],  # no external loggers
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then report the validation metrics of the final (best) model
trainer.train()
val_metrics = trainer.evaluate()
val_metrics


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/406 [00:00<?, ?it/s]



{'loss': 0.6959, 'grad_norm': 1.4487887620925903, 'learning_rate': 4.753694581280788e-05, 'epoch': 0.1}
{'loss': 0.6901, 'grad_norm': 1.358240008354187, 'learning_rate': 4.507389162561577e-05, 'epoch': 0.2}
{'loss': 0.685, 'grad_norm': 1.738308310508728, 'learning_rate': 4.261083743842365e-05, 'epoch': 0.3}
{'loss': 0.6844, 'grad_norm': 1.7866308689117432, 'learning_rate': 4.014778325123153e-05, 'epoch': 0.39}
{'loss': 0.6996, 'grad_norm': 2.337038516998291, 'learning_rate': 3.768472906403941e-05, 'epoch': 0.49}
{'loss': 0.698, 'grad_norm': 1.5404223203659058, 'learning_rate': 3.522167487684729e-05, 'epoch': 0.59}
{'loss': 0.6886, 'grad_norm': 1.7242196798324585, 'learning_rate': 3.275862068965517e-05, 'epoch': 0.69}
{'loss': 0.6656, 'grad_norm': 1.6680041551589966, 'learning_rate': 3.0295566502463057e-05, 'epoch': 0.79}
{'loss': 0.6776, 'grad_norm': 3.2148971557617188, 'learning_rate': 2.7832512315270936e-05, 'epoch': 0.89}
{'loss': 0.7019, 'grad_norm': 1.259026050567627, 'learning_ra

  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.6799321174621582, 'eval_accuracy': 0.6083743842364532, 'eval_precision': 0.6082949308755761, 'eval_recall': 0.6407766990291263, 'eval_f1': 0.624113475177305, 'eval_runtime': 3.7038, 'eval_samples_per_second': 109.619, 'eval_steps_per_second': 13.77, 'epoch': 1.0}




{'loss': 0.6769, 'grad_norm': 2.310183048248291, 'learning_rate': 2.29064039408867e-05, 'epoch': 1.08}
{'loss': 0.6798, 'grad_norm': 5.456427097320557, 'learning_rate': 2.0443349753694584e-05, 'epoch': 1.18}
{'loss': 0.6616, 'grad_norm': 3.5358917713165283, 'learning_rate': 1.7980295566502463e-05, 'epoch': 1.28}
{'loss': 0.6162, 'grad_norm': 4.295467376708984, 'learning_rate': 1.5517241379310346e-05, 'epoch': 1.38}
{'loss': 0.6569, 'grad_norm': 5.367374897003174, 'learning_rate': 1.3054187192118228e-05, 'epoch': 1.48}
{'loss': 0.6411, 'grad_norm': 3.9822044372558594, 'learning_rate': 1.0591133004926108e-05, 'epoch': 1.58}
{'loss': 0.634, 'grad_norm': 3.223264694213867, 'learning_rate': 8.12807881773399e-06, 'epoch': 1.67}
{'loss': 0.6451, 'grad_norm': 4.546552658081055, 'learning_rate': 5.665024630541872e-06, 'epoch': 1.77}
{'loss': 0.586, 'grad_norm': 2.9161946773529053, 'learning_rate': 3.201970443349754e-06, 'epoch': 1.87}
{'loss': 0.7026, 'grad_norm': 1.7037737369537354, 'learning_



  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.6149668097496033, 'eval_accuracy': 0.6674876847290641, 'eval_precision': 0.6263345195729537, 'eval_recall': 0.8543689320388349, 'eval_f1': 0.7227926078028748, 'eval_runtime': 3.6399, 'eval_samples_per_second': 111.54, 'eval_steps_per_second': 14.011, 'epoch': 2.0}
{'train_runtime': 109.302, 'train_samples_per_second': 29.698, 'train_steps_per_second': 3.714, 'train_loss': 0.6700526105946508, 'epoch': 2.0}




  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.6149668097496033,
 'eval_accuracy': 0.6674876847290641,
 'eval_precision': 0.6263345195729537,
 'eval_recall': 0.8543689320388349,
 'eval_f1': 0.7227926078028748,
 'eval_runtime': 3.7981,
 'eval_samples_per_second': 106.894,
 'eval_steps_per_second': 13.428,
 'epoch': 2.0}

In [37]:
# Run inference on the test split, passing only the columns the model consumes
preds = trainer.predict(tokenized_test.remove_columns(
    [c for c in tokenized_test.column_names if c not in ["input_ids", "attention_mask", "labels"]]
))
logits = preds.predictions

# Numerically stable softmax: subtracting the per-row maximum leaves the result
# unchanged mathematically but keeps np.exp from overflowing on large logits.
exp_logits = np.exp(logits - logits.max(axis=1, keepdims=True))
probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

# Predicted class = index of the largest logit in each row
pred_labels = logits.argmax(axis=1)




  0%|          | 0/2 [00:00<?, ?it/s]