In [10]:
from sklearn.metrics import classification_report
from transformers import BertTokenizer
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import os
import numpy as np
from tqdm import tqdm

# Load the test set
Data_dir="../jigsaw-dataset"
model_checkpoint = "./bert-model_checkpoint_10p"

# Load the fine-tuned model
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained(model_checkpoint)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
class TestDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Include labels in the output
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # Add labels for evaluation
        return item

In [12]:
test = pd.read_csv(os.path.join(Data_dir, "test_public_expanded.csv"))

# Clean the text data
test['comment_text'] = test['comment_text'].replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\n', ' ', regex=True)

# Add class labels (binary: 0 for non-toxic, 1 for toxic)
test['label'] = np.where(test['toxicity'] >= 0.5, 1, 0)


In [13]:
# Tokenize the test set
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Use the tokenizer from the saved checkpoint
test_encodings = tokenizer(
    list(test['comment_text']),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Create test dataset and DataLoader
test_labels = test['label'].values
test_dataset = TestDataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [14]:
# Evaluate the model with tqdm
model.eval()
test_predictions, test_true_labels = [], []

with torch.no_grad():
    loop = tqdm(test_loader, desc="Evaluating", leave=True)
    for batch in loop:
        # Move inputs to the device
        batch = {key: val.to(device) for key, val in batch.items()}
        
        # Extract labels from the batch
        labels = batch.pop("labels")  # Remove 'labels' from the batch for model input
        
        # Perform inference
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        # Store predictions and true labels
        test_predictions.extend(preds.cpu().numpy())
        test_true_labels.extend(labels.cpu().numpy())


# Generate classification report
print(classification_report(test_true_labels, test_predictions, target_names=["Non-Toxic", "Toxic"]))

# Save predictions with test data
test['predicted_label'] = test_predictions
test.to_csv("test_predictions.csv", index=False)
print("Test predictions saved to 'test_predictions.csv'.")

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Evaluating: 100%|██████████| 6083/6083 [05:24<00:00, 18.73it/s]


              precision    recall  f1-score   support

   Non-Toxic       0.96      0.99      0.97     89649
       Toxic       0.76      0.51      0.61      7671

    accuracy                           0.95     97320
   macro avg       0.86      0.75      0.79     97320
weighted avg       0.94      0.95      0.94     97320

Test predictions saved to 'test_predictions.csv'.


In [15]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import roc_auc_score

Data_dir="./jigsaw-dataset"

# Define your helper functions for computing metrics
def compute_auc(y_true, y_pred):
    """Compute ROC-AUC score."""
    try:
        return roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_bias_aucs(y_true, y_pred, subgroup_mask):
    """
    Compute Bias AUCs: Subgroup AUC, BPSN AUC, BNSP AUC.
    
    Args:
        y_true (np.array): True binary labels (0/1).
        y_pred (np.array): Predicted scores or probabilities.
        subgroup_mask (np.array): Mask for the specific subgroup.
    
    Returns:
        dict: Subgroup AUC, BPSN AUC, and BNSP AUC.
    """
    subgroup_true = y_true[subgroup_mask]
    subgroup_pred = y_pred[subgroup_mask]

    background_mask = ~subgroup_mask
    background_true = y_true[background_mask]
    background_pred = y_pred[background_mask]

    # Subgroup AUC
    subgroup_auc = compute_auc(subgroup_true, subgroup_pred)

    # BPSN AUC (Background Positive, Subgroup Negative)
    bpsn_mask = background_true == 1
    bpsn_auc = compute_auc(
        np.concatenate([subgroup_true, background_true[bpsn_mask]]),
        np.concatenate([subgroup_pred, background_pred[bpsn_mask]])
    )

    # BNSP AUC (Background Negative, Subgroup Positive)
    bnsp_mask = background_true == 0
    bnsp_auc = compute_auc(
        np.concatenate([subgroup_true, background_true[bnsp_mask]]),
        np.concatenate([subgroup_pred, background_pred[bnsp_mask]])
    )

    return {"subgroup_auc": subgroup_auc, "bpsn_auc": bpsn_auc, "bnsp_auc": bnsp_auc}

def generalized_mean(values, p):
    """Compute the generalized mean."""
    values = np.array(values)
    values = values[~np.isnan(values)]  # Ignore NaNs
    return (np.mean(values**p))**(1/p)

def compute_final_metric(overall_auc, bias_aucs, p=-5, weight=0.25):
    """
    Compute the final metric combining overall AUC and bias AUCs.
    
    Args:
        overall_auc (float): ROC-AUC score for the entire dataset.
        bias_aucs (list of dict): List of AUC dictionaries for each subgroup.
        p (float): Power for the generalized mean.
        weight (float): Weight for each component in the final score.
    
    Returns:
        float: Final metric score.
    """
    subgroup_aucs = [bias["subgroup_auc"] for bias in bias_aucs]
    bpsn_aucs = [bias["bpsn_auc"] for bias in bias_aucs]
    bnsp_aucs = [bias["bnsp_auc"] for bias in bias_aucs]

    gm_subgroup_auc = generalized_mean(subgroup_aucs, p)
    gm_bpsn_auc = generalized_mean(bpsn_aucs, p)
    gm_bnsp_auc = generalized_mean(bnsp_aucs, p)

    bias_score = (weight * gm_subgroup_auc +
                  weight * gm_bpsn_auc +
                  weight * gm_bnsp_auc)

    return (weight * overall_auc) + bias_score

# Example usage with your dataframe
def evaluate_metrics(dataframe, subgroups, true_label_col, predicted_label_col):
    """
    Evaluate metrics on a dataframe with true and predicted labels.
    
    Args:
        dataframe (pd.DataFrame): Dataframe with predictions and subgroups.
        subgroups (list): List of subgroup column names.
        true_label_col (str): Column name for true labels.
        predicted_label_col (str): Column name for predicted probabilities/scores.
    
    Returns:
        dict: A dictionary with overall AUC, bias AUCs, and final metric.
    """
    y_true = dataframe[true_label_col].values
    y_pred = dataframe[predicted_label_col].values

    # Compute overall AUC
    overall_auc = compute_auc(y_true, y_pred)

    # Compute bias AUCs for each subgroup
    bias_aucs = []
    for subgroup in subgroups:
        if subgroup in dataframe:
            subgroup_mask = dataframe[subgroup].values > 0.3  # Assuming float subgroup values
            bias_auc = compute_bias_aucs(y_true, y_pred, subgroup_mask)
            bias_aucs.append(bias_auc)

    # Compute final metric
    final_metric = compute_final_metric(overall_auc, bias_aucs)

    return {
        "overall_auc": overall_auc,
        "bias_aucs": bias_aucs,
        "final_metric": final_metric
    }

def merge_train_val(train_df, val_df, id_col, predicted_label_col):
    """
    Merge the train and validation datasets based on the id column.
    
    Args:
        train_df (pd.DataFrame): The train dataset with subgroup information.
        val_df (pd.DataFrame): The validation dataset with predictions.
        id_col (str): The name of the id column common to both datasets.
        predicted_label_col (str): The name of the column in val_df with predicted labels.
    
    Returns:
        pd.DataFrame: A merged DataFrame containing rows from train_df with matching ids in val_df,
                      and an additional column for predicted labels.
    """
    # Merge the two datasets on the id column
    merged_df = train_df.merge(
        val_df[[id_col, predicted_label_col]],
        on=id_col,
        how="inner"  # Keep only rows from train_df that have matching ids in val_df
    )
    return merged_df

# Ensure both IDs are of the same type (e.g., integers or strings)
test_result_data = pd.read_csv("test_predictions.csv")

subgroups = [
    "asian", "atheist", "bisexual", "black", "buddhist", "christian",
    "female", "heterosexual", "hindu", "homosexual_gay_or_lesbian",
    "intellectual_or_learning_disability", "jewish", "latino", "male",
    "muslim", "other_disability", "other_gender", "other_race_or_ethnicity",
    "other_religion", "other_sexual_orientation", "physical_disability",
    "psychiatric_or_mental_illness", "transgender", "white"
]
metrics = evaluate_metrics(test_result_data, subgroups, "label", "predicted_label")
print(metrics)



{'overall_auc': 0.7482583326439677, 'bias_aucs': [{'subgroup_auc': 0.6837535014005602, 'bpsn_auc': 0.7530907083208723, 'bnsp_auc': 0.6789211257236556}, {'subgroup_auc': 0.7646627565982405, 'bpsn_auc': 0.7469869765055656, 'bnsp_auc': 0.7659341127366426}, {'subgroup_auc': 0.562962962962963, 'bpsn_auc': 0.7180144555975607, 'bnsp_auc': 0.5932068400093699}, {'subgroup_auc': 0.6547459332550729, 'bpsn_auc': 0.7269616049941484, 'bnsp_auc': 0.6760426609048923}, {'subgroup_auc': 0.6, 'bpsn_auc': 0.7550514926345978, 'bnsp_auc': 0.5932068400093699}, {'subgroup_auc': 0.6334967973298481, 'bpsn_auc': 0.7490134062435156, 'bnsp_auc': 0.6327417237303001}, {'subgroup_auc': 0.6751279863481229, 'bpsn_auc': 0.7426794789827207, 'bnsp_auc': 0.6807068400093699}, {'subgroup_auc': 0.6011904761904762, 'bpsn_auc': 0.731241968825074, 'bnsp_auc': 0.6182068400093699}, {'subgroup_auc': 0.75, 'bpsn_auc': 0.7550514926345978, 'bnsp_auc': 0.7432068400093699}, {'subgroup_auc': 0.6324123989218329, 'bpsn_auc': 0.726749605842