In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
# for dirname, _, filenames in os.walk('.'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pytorch_metric_learning

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from pytorch_metric_learning.losses import SupConLoss

# -----------------------------
# Load and preprocess data
# -----------------------------
# Checkpoint used for the encoder. The commented alternatives are local/offline
# copies of the same model.
# Use Kaggle paths when running on Kaggle
# MODEL_PATH = "/kaggle/input/xlm-roberta-base-offline/xlm_roberta_base_offline"
# MODEL_PATH = "C:/Users/satra/Downloads/xlm_roberta_base_offline"
MODEL_PATH = "xlm-roberta-base"


# trn = "/kaggle/input/jigsaw-agile-community-rules/train.csv"
# tst = "/kaggle/input/jigsaw-agile-community-rules/test.csv"
trn = "/content/drive/MyDrive/Colab Notebooks/train.csv"
tst = "/content/drive/MyDrive/Colab Notebooks/test.csv"

# The paths above live on Google Drive, which must be mounted before the CSVs can
# be read. Mount it here (only when the file is not reachable yet) so the notebook
# survives "Restart & Run All" without relying on a later cell having been run.
DRIVE_MOUNT_POINT = "/content/drive"
if trn.startswith(DRIVE_MOUNT_POINT) and not os.path.exists(trn):
    try:
        from google.colab import drive
    except ImportError:
        # Not running on Colab: fall through and let read_csv report the missing file.
        pass
    else:
        drive.mount(DRIVE_MOUNT_POINT)

df_trn = pd.read_csv(trn)
# df_trn = df_trn.sample(frac=.05, random_state=42).reset_index(drop=True)

df_tst = pd.read_csv(tst)


def get_device():
    """Pick the compute device: NVIDIA CUDA first, then DirectML, then CPU.

    Returns:
        ``torch.device("cuda")`` when CUDA is available. Otherwise the device
        returned by ``torch_directml.device()`` when that optional package can be
        imported, reports itself available and can hold a small test tensor.
        Otherwise ``torch.device("cpu")``.

    A short message describing the choice is printed in every case.
    """
    # Try to detect NVIDIA CUDA GPU first
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using NVIDIA CUDA GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.2f} GB")
        return device

    # If no NVIDIA CUDA GPU, try to detect DirectML GPU
    try:
        # torch_directml is an optional dependency, so it is imported lazily here.
        # Without this import the name is undefined and the DirectML probe can
        # never succeed (nor can the ImportError branch below ever be reached).
        import torch_directml

        if torch_directml.is_available():
            device = torch_directml.device()
            print(f"Using DirectML GPU: {device}")
            # Add a small test to ensure it's truly usable
            try:
                _ = torch.tensor([1], device=device)
            except Exception as e:
                print(f"Warning: DirectML device found but not usable ({e}). Falling back to CPU.")
                return torch.device("cpu")
            return device
        else:
            print("DirectML is NOT available.")
    except ImportError:
        print("torch_directml not installed.")
    except Exception as e:
        # Best-effort probe: any other failure just means "no DirectML".
        print(f"Error checking DirectML: {e}. Falling back to CPU.")

    # If neither GPU is found, fall back to CPU
    device = torch.device("cpu")
    print("No GPU (NVIDIA CUDA or DirectML) found. Using CPU.")
    return device


def fill_empty_examples_pandas(df):
    """Back-fill blank example columns from their sibling column.

    The four example columns are first normalised to strings (missing values
    become ''). Then, within the positive pair and within the negative pair, a
    blank entry is replaced by the value of the other column of the same pair.
    If both entries of a pair are blank they stay blank.

    The frame is modified in place and also returned.
    """
    example_pairs = (
        ('positive_example_1', 'positive_example_2'),
        ('negative_example_1', 'negative_example_2'),
    )

    # Pass 1: make every example column a plain string column without NaNs.
    for pair in example_pairs:
        for name in pair:
            df[name] = df[name].fillna('').astype(str)

    # Pass 2: keep non-blank values, borrow from the sibling where blank.
    # The second assignment deliberately sees the already-updated first column.
    for first, second in example_pairs:
        df[first] = df[first].where(df[first] != '', df[second])
        df[second] = df[second].where(df[second] != '', df[first])

    return df


def get_text(value):
    """Return ``value`` as a string, or '' when pandas considers it missing."""
    if pd.notna(value):
        return str(value)
    return ''


def extract_texts(row):
    """Collect the text fields of one row into a dict of plain strings.

    Each value is passed through ``get_text``, so missing cells become ''.
    The example columns are exposed under the short keys pos1/pos2/neg1/neg2.
    """
    # output key -> source column, in the order the keys appear in the result
    source_column = {
        "body": "body",
        "rule": "rule",
        "subreddit": "subreddit",
        "pos1": "positive_example_1",
        "pos2": "positive_example_2",
        "neg1": "negative_example_1",
        "neg2": "negative_example_2",
    }
    return {key: get_text(row[column]) for key, column in source_column.items()}

# Back-fill blank example columns in both splits.
df_trn = fill_empty_examples_pandas(df_trn)
df_tst = fill_empty_examples_pandas(df_tst)

# Attach a per-row dict of cleaned text fields to train and test alike.
for frame in (df_trn, df_tst):
    frame["inputs"] = frame.apply(extract_texts, axis=1)

# Training length and cross-validation layout.
N_EPOCHS = 8
k_folds = 5
skf = StratifiedKFold(
    n_splits=k_folds,
    shuffle=True,
    random_state=42,
)

# -----------------------------
# Dataset
# -----------------------------
class MultiInputDataset(Dataset):
    """Dataset that tokenises several text columns of a DataFrame row separately.

    For every row, each column listed in ``TEXT_FIELDS`` is tokenised on its own
    (truncated / padded to ``max_len``) and the tokenizer outputs are stored under
    ``"<field>_<tokenizer key>"``. Unless ``is_test`` is set, the row's
    ``rule_violation`` value is added as a float32 ``"label"`` tensor.

    Args:
        df: DataFrame providing the ``TEXT_FIELDS`` columns (and ``rule_violation``
            when ``is_test`` is False).
        tokenizer: Callable with the Hugging Face tokenizer call signature.
        max_len: Length every field is truncated / padded to.
        is_test: When True no ``"label"`` entry is produced.
    """

    # Text columns encoded for every row, in this order.
    TEXT_FIELDS = ("text_to_classify", "rule", "subreddit")

    def __init__(self, df, tokenizer, max_len=128, is_test=False): # Renamed df_trn to df for generality
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string; missing values (NaN/None) become ''."""
        return str(value) if pd.notna(value) else ''

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        item = {}
        for field in self.TEXT_FIELDS:
            encoded = self.tokenizer(
                # Cells can be NaN or non-string (e.g. a missing subreddit);
                # coerce to text so the tokenizer always receives a str.
                self._as_text(row[field]),
                truncation=True,
                padding='max_length',
                max_length=self.max_len,
                return_tensors="pt"
            )

            for key in encoded:
                # return_tensors="pt" yields a leading batch dim of 1; drop it so the
                # DataLoader can stack items into (batch, max_len).
                item[f"{field}_{key}"] = encoded[key].squeeze(0)
        if not self.is_test:
            item["label"] = torch.tensor(row["rule_violation"], dtype=torch.float32)
        return item

# -----------------------------
# Model
# -----------------------------
class MultiInputBERT(nn.Module):
    """Shared encoder over three text fields with a projection head and a classifier.

    Each field in ``TEXT_FIELDS`` is run through the same pretrained encoder; the
    first-token hidden states are concatenated, passed through dropout and a
    two-layer projection head, and the resulting embedding feeds a single-logit
    linear classifier.

    Args:
        model_name: Checkpoint name or path given to ``AutoModel.from_pretrained``.
        embedding_dim: Size of the projected embedding used for the contrastive loss.
    """

    # Input fields encoded by forward(), in concatenation order.
    TEXT_FIELDS = ("text_to_classify", "rule", "subreddit")

    def __init__(self, model_name=MODEL_PATH, embedding_dim=256): # Added embedding_dim
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)

        # Read the encoder width from the loaded checkpoint instead of assuming 768,
        # so checkpoints with a different hidden size work as well.
        hidden_size = self.bert.config.hidden_size

        # Projection head for contrastive learning
        self.projection_head = nn.Sequential(
            nn.Linear(hidden_size * len(self.TEXT_FIELDS), 512), # Input is concatenated CLS tokens
            nn.ReLU(),
            nn.Linear(512, embedding_dim) # Output embedding for contrastive loss
        )

        # Original classifier for downstream task (can be re-attached or fine-tuned)
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim, 1) # Classifier takes the projected embedding
        )

    def forward(self, inputs):
        """Encode all fields and return ``(logits, embeddings)``.

        Args:
            inputs: Mapping holding ``"<field>_input_ids"`` and
                ``"<field>_attention_mask"`` tensors for every field in ``TEXT_FIELDS``.

        Returns:
            Tuple of the classifier output (last dimension 1) and the projected
            embedding (last dimension ``embedding_dim``).
        """
        cls_outputs = []
        for field in self.TEXT_FIELDS:
            out = self.bert(
                input_ids=inputs[f"{field}_input_ids"],
                attention_mask=inputs[f"{field}_attention_mask"]
            )
            cls_outputs.append(out.last_hidden_state[:, 0])  # CLS token

        x = torch.cat(cls_outputs, dim=1)
        x = self.dropout(x)

        # Get embeddings from projection head
        embeddings = self.projection_head(x)

        # Get logits from classifier (for downstream task)
        logits = self.classifier(embeddings)

        return logits, embeddings # Return both


# -----------------------------
# Training and Evaluation
# -----------------------------
device = get_device()
# Load the tokenizer from the same checkpoint as the model so the two cannot
# drift apart when MODEL_PATH is switched to an offline/local copy.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)


# Out-of-fold predictions, one slot per training row (filled fold by fold).
oof_preds = np.zeros(len(df_trn))
# One list of test-set probabilities per fold; averaged after training.
test_preds_folds = [] # This is correct

# K-fold training: for every fold a fresh model is trained with a combined
# supervised-contrastive + BCE loss, the epoch with the best validation AUC is
# kept, and that checkpoint produces the fold's out-of-fold and test predictions.

# The test set does not depend on the fold, so its loader is built once up front
# (using original body).
fold_tst_df_for_model = df_tst.copy()
fold_tst_df_for_model['text_to_classify'] = fold_tst_df_for_model['body']
test_loader = DataLoader(
    MultiInputDataset(fold_tst_df_for_model, tokenizer, is_test=True),
    batch_size=8,
    shuffle=False,
)

for fold, (train_idx, val_idx) in enumerate(skf.split(df_trn, df_trn["rule_violation"])):
    print(f"----- Fold {fold+1} -----")
    train_df = df_trn.iloc[train_idx].reset_index(drop=True)
    val_df = df_trn.iloc[val_idx].reset_index(drop=True)

    # Use original training data without expansion
    fold_train_df_for_model = train_df.copy()
    fold_train_df_for_model['text_to_classify'] = fold_train_df_for_model['body']

    # Prepare the VALIDATION data for this fold (using original body)
    fold_val_df_for_model = val_df.copy()
    fold_val_df_for_model['text_to_classify'] = fold_val_df_for_model['body']

    train_dataset = MultiInputDataset(fold_train_df_for_model, tokenizer)
    val_dataset = MultiInputDataset(fold_val_df_for_model, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    # Not shuffled, so predictions come back in val_idx order.
    val_loader = DataLoader(val_dataset, batch_size=8)

    model = MultiInputBERT().to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # Define the loss functions
    contrastive_criterion = SupConLoss(temperature=0.07)
    classification_criterion = nn.BCEWithLogitsLoss()

    # The scheduler is stepped once per batch, so its horizon is batches * epochs.
    num_training_steps_per_fold = len(train_loader) * N_EPOCHS
    num_warmup_steps_per_fold = int(num_training_steps_per_fold * 0.05)

    # Initialize the scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps_per_fold,
        num_training_steps=num_training_steps_per_fold
    )

    # Training Loop for this fold
    best_auc = -1.0 # Track best AUC for this fold
    best_model_state = None # To save the best model for this fold

    for epoch in range(N_EPOCHS):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)

            optimizer.zero_grad()

            # Get both logits and embeddings
            logits, embeddings = model(inputs)

            # --- Combined Loss ---
            # 1. Contrastive Loss
            loss_supcon = contrastive_criterion(embeddings, labels)

            # 2. Classification Loss
            loss_bce = classification_criterion(logits.squeeze(-1), labels)

            # Combine the two losses. You can experiment with weighting them, e.g., loss = 0.5 * loss_supcon + 0.5 * loss_bce
            loss = loss_supcon + loss_bce

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

        # Eval (for monitoring, still using AUC on classification task)
        model.eval()
        preds_raw, labels_all = [], []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch+1}"):
                inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
                labels = batch["label"].to(device)

                # Get logits for evaluation
                logits, _ = model(inputs)
                logits = logits.squeeze(-1) # Squeeze to [batch_size]

                probs = torch.sigmoid(logits).detach().cpu().tolist()
                preds_raw.extend(probs)
                labels_all.extend(labels.cpu().tolist())

            # Hard labels (for classification report, optional)
            preds = [int(p > 0.5) for p in preds_raw]

            # Print metrics
            print(classification_report(labels_all, preds, digits=3, zero_division=0))

            curr_auc = roc_auc_score(labels_all, preds_raw)
            print(f"AUC Score: {curr_auc:.4f}")

            # Add sanity check for labels (shows the last validation batch)
            print(f"Sample of labels in validation batch: {labels.cpu().numpy()[:5]}")

            # Save the best model for this fold based on validation AUC
            if curr_auc > best_auc:
                best_auc = curr_auc
                # state_dict() returns references to the live parameter tensors, which
                # later optimizer steps update in place. Clone them so the snapshot
                # really holds this epoch's weights rather than the final ones.
                best_model_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }
                print(f"  -> New best Val AUC for Fold {fold+1}: {best_auc:.4f}")

    # 6. Load best model state for this fold
    # (None only if no epoch ran, in which case the current weights are kept.)
    if best_model_state is not None:
        model.load_state_dict(best_model_state) # Use best_model_state
    print(f"Fold {fold+1} Best Val AUC: {best_auc:.4f}")

    # Make OOF predictions for this fold's validation set
    model.eval()
    fold_val_preds_list = []
    fold_val_true_list = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Fold {fold+1} OOF Prediction"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)
            logits, _ = model(inputs)
            logits = logits.squeeze(-1) # Squeeze to [batch_size]
            probs = torch.sigmoid(logits).detach().cpu().tolist()
            fold_val_preds_list.extend(probs)
            fold_val_true_list.extend(labels.cpu().tolist())

    oof_fold_auc_check = roc_auc_score(fold_val_true_list, fold_val_preds_list)
    print(f"Fold {fold+1} OOF AUC Check: {oof_fold_auc_check:.4f} (Must match Best Val AUC)")

    oof_preds[val_idx] = np.array(fold_val_preds_list) # Use val_idx from skf.split

    # Make predictions on the TEST set using this fold's best model
    test_fold_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Fold {fold+1} Test Prediction"):
            inputs = {k: v.to(device) for k, v in batch.items()}
            logits, _ = model(inputs)
            logits = logits.squeeze(-1) # Squeeze to [batch_size]
            probs = torch.sigmoid(logits).detach().cpu().tolist()
            test_fold_preds.extend(probs)

    test_preds_folds.append(test_fold_preds) # Store test predictions from this fold


# -----------------------------
# Final Calculation and Submission
# -----------------------------
# Score the stitched-together out-of-fold predictions against the true labels.
overall_oof_auc = roc_auc_score(df_trn['rule_violation'], oof_preds)
print(f"Overall {k_folds}-Fold OOF AUC: {overall_oof_auc:.4f} ---")

# Ensemble the folds: element-wise mean of the per-fold test probabilities.
final_test_predictions = np.asarray(test_preds_folds).mean(axis=0)

# Assemble the submission table (row ids plus averaged probabilities) and write it out.
submission = pd.DataFrame({"row_id": df_tst["row_id"]}).assign(
    rule_violation=final_test_predictions
)
submission.to_csv("submission.csv", index=False)
print("K-Fold multi-input submission.csv created successfully!")
print(submission.head(10))


In [None]:
# Mount Google Drive at /content/drive, the location the train/test CSV paths
# of the data-loading cell point to.
# NOTE(review): this cell sits after the cell that reads those CSVs; the mount
# has to exist before that read happens.
from google.colab import drive
drive.mount('/content/drive')