In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

.\Jigsaw.ipynb
.\Jigsaw.py
.\Jigsaw_v14.py
.\jigsaw_v15.ipynb
.\jigsaw_v15_contrastive.ipynb
.\jigsaw_v15_contrastive.py
.\jigsaw_v17.ipynb
.\jigsaw_v17_contrastive.ipynb
.\output.log
.\README.md
.\sample_submission.csv
.\StackTrace
.\submission.csv
.\test.csv
.\train.csv
.\.git\COMMIT_EDITMSG
.\.git\config
.\.git\description
.\.git\FETCH_HEAD
.\.git\HEAD
.\.git\index
.\.git\ORIG_HEAD
.\.git\packed-refs
.\.git\hooks\applypatch-msg.sample
.\.git\hooks\commit-msg.sample
.\.git\hooks\fsmonitor-watchman.sample
.\.git\hooks\post-update.sample
.\.git\hooks\pre-applypatch.sample
.\.git\hooks\pre-commit.sample
.\.git\hooks\pre-merge-commit.sample
.\.git\hooks\pre-push.sample
.\.git\hooks\pre-rebase.sample
.\.git\hooks\pre-receive.sample
.\.git\hooks\prepare-commit-msg.sample
.\.git\hooks\push-to-checkout.sample
.\.git\hooks\sendemail-validate.sample
.\.git\hooks\update.sample
.\.git\info\exclude
.\.git\logs\HEAD
.\.git\logs\refs\heads\main
.\.git\logs\refs\remotes\origin\HEAD
.\.git\logs\refs\

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

# -----------------------------
# Paths and Hyperparameters
# -----------------------------
MODEL_PATH = "C:/Users/satra/Downloads/xlm_roberta_base_offline"
TRAIN_PATH = "C:/Users/satra/Downloads/jigsaw-agile-community-rules/train.csv"
TEST_PATH = "C:/Users/satra/Downloads/jigsaw-agile-community-rules/test.csv"

N_EPOCHS = 3
k_folds = 5
BATCH_SIZE = 8
MAX_LEN = 128

# -----------------------------
# Load and preprocess data
# -----------------------------
df_trn = pd.read_csv(TRAIN_PATH)
df_trn = df_trn.sample(frac=0.01, random_state=42).reset_index(drop=True)
df_tst = pd.read_csv(TEST_PATH)

def fill_empty_examples_pandas(df):
    example_cols = ['positive_example_1', 'positive_example_2', 'negative_example_1', 'negative_example_2']
    for col in example_cols:
        df[col] = df[col].fillna('').astype(str)

    df['positive_example_1'] = df['positive_example_1'].mask(df['positive_example_1'] == '', df['positive_example_2'])
    df['positive_example_2'] = df['positive_example_2'].mask(df['positive_example_2'] == '', df['positive_example_1'])
    df['negative_example_1'] = df['negative_example_1'].mask(df['negative_example_1'] == '', df['negative_example_2'])
    df['negative_example_2'] = df['negative_example_2'].mask(df['negative_example_2'] == '', df['negative_example_1'])
    return df

def extract_texts(row):
    return {
        "body": row["body"],
        "rule": row["rule"],
        "subreddit": row["subreddit"],
        "pos1": row['positive_example_1'],
        "pos2": row['positive_example_2'],
        "neg1": row['negative_example_1'],
        "neg2": row['negative_example_2'],
    }

df_trn = fill_empty_examples_pandas(df_trn)
df_tst = fill_empty_examples_pandas(df_tst)
df_trn["inputs"] = df_trn.apply(extract_texts, axis=1)
df_tst["inputs"] = df_tst.apply(extract_texts, axis=1)

# -----------------------------
# Dataset
# -----------------------------
class MultiInputDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=MAX_LEN, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        item = {}
        for field in ["body", "rule", "subreddit", "pos1", "pos2", "neg1", "neg2"]:
            encoded = self.tokenizer(row["inputs"][field], truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")
            for key in encoded:
                item[f"{field}_{key}"] = encoded[key].squeeze(0)
        if not self.is_test:
            item["label"] = torch.tensor(row["rule_violation"], dtype=torch.float32)
        return item

# -----------------------------
# Contrastive Loss
# -----------------------------
class SupConLoss(nn.Module):
    def __init__(self, temperature=0.07):
        super().__init__()
        self.temperature = temperature

    def forward(self, features, labels):
        device = features.device
        features = nn.functional.normalize(features, dim=1)
        similarity_matrix = torch.matmul(features, features.T) / self.temperature

        logits_mask = torch.eye(features.size(0), device=device).bool()
        similarity_matrix.masked_fill_(logits_mask, -9e15)

        labels = labels.contiguous().view(-1, 1)
        mask = torch.eq(labels, labels.T).float().to(device)
        mask.fill_diagonal_(0)

        exp_sim = torch.exp(similarity_matrix)
        denom = exp_sim.sum(dim=1, keepdim=True)

        log_prob = similarity_matrix - torch.log(denom + 1e-12)
        loss = - (mask * log_prob).sum(dim=1) / (mask.sum(dim=1) + 1e-12)
        return loss.mean()

# -----------------------------
# Model
# -----------------------------
class MultiInputBERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained(MODEL_PATH)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Sequential(
            nn.Linear(768 * 7, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, inputs, return_cls_only=False):
        cls_outputs = []
        for field in ["body", "rule", "subreddit", "pos1", "pos2", "neg1", "neg2"]:
            out = self.bert(input_ids=inputs[f"{field}_input_ids"], attention_mask=inputs[f"{field}_attention_mask"])
            cls_outputs.append(out.last_hidden_state[:, 0])

        x = torch.cat(cls_outputs, dim=1)
        if return_cls_only:
            return x
        x = self.dropout(x)
        return self.classifier(x)

# -----------------------------
# Training
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
oof_preds = np.zeros(len(df_trn))
test_preds_folds = []

for fold, (train_idx, val_idx) in enumerate(skf.split(df_trn, df_trn['rule_violation'])):
    print(f"\n--- Fold {fold + 1} ---")
    train_df = df_trn.iloc[train_idx].reset_index(drop=True)
    val_df = df_trn.iloc[val_idx].reset_index(drop=True)

    train_loader = DataLoader(MultiInputDataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(MultiInputDataset(val_df, tokenizer), batch_size=BATCH_SIZE)
    test_loader = DataLoader(MultiInputDataset(df_tst, tokenizer, is_test=True), batch_size=BATCH_SIZE)

    model = MultiInputBERT().to(device)
    optimizer = AdamW(model.parameters(), lr=5e-6)
    scheduler = get_linear_schedule_with_warmup(optimizer, int(len(train_loader) * N_EPOCHS * 0.05), len(train_loader) * N_EPOCHS)

    contrastive_loss_fn = SupConLoss()
    bce_loss_fn = nn.BCEWithLogitsLoss()

    best_auc = -1
    best_state = None

    for epoch in range(N_EPOCHS):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            cls_feats = model(inputs, return_cls_only=True)
            logits = model(inputs).squeeze(-1)

            loss_bce = bce_loss_fn(logits, labels)
            loss_con = contrastive_loss_fn(cls_feats, labels)
            loss = 0.7 * loss_bce + 0.3 * loss_con

            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        print(f"Loss: {total_loss / len(train_loader):.4f}")

        # Validation
        model.eval()
        val_logits, val_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
                labels = batch["label"].to(device)
                logits = model(inputs).squeeze(-1)
                probs = torch.sigmoid(logits).cpu().numpy()
                val_logits.extend(probs)
                val_labels.extend(labels.cpu().numpy())

        val_auc = roc_auc_score(val_labels, val_logits)
        print(f"Val AUC: {val_auc:.4f}")

        if val_auc > best_auc:
            best_auc = val_auc
            best_state = model.state_dict()

    model.load_state_dict(best_state)
    print(f"Fold {fold + 1} Best AUC: {best_auc:.4f}")
    oof_preds[val_idx] = np.array(val_logits)

    # Test prediction
    model.eval()
    test_probs = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            logits = model(inputs).squeeze(-1)
            probs = torch.sigmoid(logits).cpu().numpy()
            test_probs.extend(probs)
    test_preds_folds.append(test_probs)

# -----------------------------
# Final output
# -----------------------------
overall_auc = roc_auc_score(df_trn['rule_violation'], oof_preds)
print(f"\nOverall OOF AUC: {overall_auc:.4f}")

submission = pd.DataFrame({
    "row_id": df_tst["row_id"],
    "rule_violation": np.mean(test_preds_folds, axis=0)
})
submission.to_csv("submission.csv", index=False)
print("submission.csv created.")


Some weights of XLMRobertaModel were not initialized from the model checkpoint at C:/Users/satra/Downloads/xlm_roberta_base_offline and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Fold 1 ---


  return forward_call(*args, **kwargs)
Epoch 1: 100%|██████████| 2/2 [02:06<00:00, 63.18s/it]


Loss: 1.1271
Val AUC: 0.7500


  return forward_call(*args, **kwargs)
Epoch 2: 100%|██████████| 2/2 [02:08<00:00, 64.38s/it]


Loss: 1.0878
Val AUC: 0.7500


  return forward_call(*args, **kwargs)
Epoch 3: 100%|██████████| 2/2 [02:37<00:00, 78.73s/it]


Loss: 1.0932
Val AUC: 0.7500
Fold 1 Best AUC: 0.7500


  return forward_call(*args, **kwargs)
Some weights of XLMRobertaModel were not initialized from the model checkpoint at C:/Users/satra/Downloads/xlm_roberta_base_offline and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Fold 2 ---


Epoch 1: 100%|██████████| 2/2 [03:40<00:00, 110.30s/it]


Loss: 1.1275
Val AUC: 1.0000


  return forward_call(*args, **kwargs)
Epoch 2: 100%|██████████| 2/2 [02:16<00:00, 68.19s/it]


Loss: 1.0869
Val AUC: 1.0000


  return forward_call(*args, **kwargs)
Epoch 3: 100%|██████████| 2/2 [02:35<00:00, 77.62s/it]


Loss: 1.1019
Val AUC: 1.0000
Fold 2 Best AUC: 1.0000


  return forward_call(*args, **kwargs)
Some weights of XLMRobertaModel were not initialized from the model checkpoint at C:/Users/satra/Downloads/xlm_roberta_base_offline and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Fold 3 ---


Epoch 1: 100%|██████████| 2/2 [02:49<00:00, 84.97s/it]


Loss: 1.1114
Val AUC: 0.7500


  return forward_call(*args, **kwargs)
Epoch 2: 100%|██████████| 2/2 [02:41<00:00, 80.99s/it]


Loss: 1.1390
Val AUC: 0.7500


  return forward_call(*args, **kwargs)
Epoch 3: 100%|██████████| 2/2 [02:37<00:00, 78.65s/it]


Loss: 1.0870
Val AUC: 0.7500
Fold 3 Best AUC: 0.7500


  return forward_call(*args, **kwargs)
Some weights of XLMRobertaModel were not initialized from the model checkpoint at C:/Users/satra/Downloads/xlm_roberta_base_offline and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Fold 4 ---


Epoch 1: 100%|██████████| 2/2 [02:58<00:00, 89.47s/it]


Loss: 1.1292
Val AUC: 0.7500


  return forward_call(*args, **kwargs)
Epoch 2: 100%|██████████| 2/2 [02:50<00:00, 85.47s/it]


Loss: 1.1108
Val AUC: 0.7500


  return forward_call(*args, **kwargs)
Epoch 3: 100%|██████████| 2/2 [03:00<00:00, 90.13s/it]


Loss: 1.1330
Val AUC: 0.7500
Fold 4 Best AUC: 0.7500


  return forward_call(*args, **kwargs)
Some weights of XLMRobertaModel were not initialized from the model checkpoint at C:/Users/satra/Downloads/xlm_roberta_base_offline and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Fold 5 ---


Epoch 1: 100%|██████████| 2/2 [02:05<00:00, 62.73s/it]


Loss: 1.1709
Val AUC: 1.0000


  return forward_call(*args, **kwargs)
Epoch 2: 100%|██████████| 2/2 [02:00<00:00, 60.07s/it]


Loss: 1.1649
Val AUC: 0.6667


  return forward_call(*args, **kwargs)
Epoch 3: 100%|██████████| 2/2 [01:55<00:00, 57.88s/it]


Loss: 1.1602
Val AUC: 0.6667
Fold 5 Best AUC: 1.0000


  return forward_call(*args, **kwargs)



Overall OOF AUC: 0.5152
submission.csv created.
