see second cell for AUC-ROC / AUC-PR
	False Positive Rate (FPR) & False Negative Rate (FNR)


In [None]:
# --- 1. Data Preparation ---
file_path = "/content/final_labels.csv"  # Update with your actual file path
df = pd.read_csv(file_path)

# If your CSV does not have 'toxicity_level' but has 'level_1', rename it.
if 'toxicity_level' not in df.columns and 'level_1' in df.columns:
    df = df.rename(columns={'level_1': 'toxicity_level'})

# Filter and drop missing values
df = df[['body', 'toxicity_level', 'split']].dropna()

# Encode the toxicity labels (the available labels should be: ['Misogynistic', 'Nonmisogynistic'])
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['toxicity_level'])

# Print available labels to verify
print("Available labels:", list(label_encoder.classes_))
# Set the target label to one of the available labels (case-sensitive)
target_label = "Misogynistic"  # Use the exact available label

# Split into training and testing datasets
train_df = df[df['split'] == 'train']
test_df = df[df['split'] == 'test']

# --- 2. Custom Dataset Class ---
import torch
from torch.utils.data import Dataset

class ToxicityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# --- 3. Model and Tokenizer Setup ---
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(label_encoder.classes_)  # Should be 2 in this case
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Create dataset objects
train_dataset = ToxicityDataset(train_df["body"], train_df["label"], tokenizer)
test_dataset = ToxicityDataset(test_df["body"], test_df["label"], tokenizer)

# --- 4. Metrics Calculation ---
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# --- 5. Training Arguments and Trainer ---
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# --- 6. Fine-Tune the Classifier ---
trainer.train()

# Evaluate the model on the test set
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)
# Use .loc to avoid SettingWithCopyWarning
test_df.loc[:, 'predicted_label'] = pred_labels

# --- 7. Detect Toxic/Harmful Comments ---
try:
    toxic_label_index = list(label_encoder.classes_).index(target_label)
except ValueError:
    raise ValueError(f"The label '{target_label}' is not present in your dataset labels.")

toxic_comments_df = test_df[test_df['predicted_label'] == toxic_label_index]

print("Detected Toxic/Harmful Comments:")
for idx, row in toxic_comments_df.iterrows():
    print(f"Index: {idx}")
    print(row['body'])
    print("------")

# --- 8. Generate Alternative Phrasings for Toxic Comments ---
from transformers import pipeline
generation_model = pipeline("text2text-generation", model="t5-base")

def generate_alternative(text):
    prompt = f"Rewrite the following comment to be more respectful and constructive: {text}"
    result = generation_model(prompt, max_length=128, truncation=True)
    alternative = result[0]['generated_text']
    return alternative

print("\nAlternative Phrasings for Toxic/Harmful Comments:")
for idx, row in toxic_comments_df.iterrows():
    original_comment = row['body']
    alt = generate_alternative(original_comment)
    print(f"Index: {idx}")
    print("Original:", original_comment)
    print("Alternative:", alt)
    print("------")


Available labels: ['Misogynistic', 'Nonmisogynistic']


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2435,0.344589,0.900846,0.811523,0.900846,0.853854
2,0.318,0.387605,0.900846,0.811523,0.900846,0.853854
3,0.3229,0.333406,0.900846,0.870206,0.900846,0.872946


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, 'predicted_label'] = pred_labels


Detected Toxic/Harmful Comments:
Index: 253
This situation has upset me greatly.  Not necessarily because it's because JBP (whom I have a lot of respect for), but because the reaction to this news from the social justice crowd has been _sheer glee_.  "I hope he dies a long and painful death" is a legitimate quote I've seen in response to JBP getting ill.  Sentiments expressing that he 'deserves it' and 'is a hypocrite' are widespread.

How fucking terrible do you have to be as a person to _actively revel in someone's misery_, especially when that someone has had nearly both himself and his wife die in short order?

It really concerns me that people have always been this awful and that social media has just amplified the capability of being a piece of pond scum. 

I guess this is a major problem of 'woke culture' - the desire of death to your political opponents seems to be an inherently "progressive" position nowadays.  Which is absolutely sickening.
------
Index: 439
I should create o

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Alternative Phrasings for Toxic/Harmful Comments:
Index: 253
Original: This situation has upset me greatly.  Not necessarily because it's because JBP (whom I have a lot of respect for), but because the reaction to this news from the social justice crowd has been _sheer glee_.  "I hope he dies a long and painful death" is a legitimate quote I've seen in response to JBP getting ill.  Sentiments expressing that he 'deserves it' and 'is a hypocrite' are widespread.

How fucking terrible do you have to be as a person to _actively revel in someone's misery_, especially when that someone has had nearly both himself and his wife die in short order?

It really concerns me that people have always been this awful and that social media has just amplified the capability of being a piece of pond scum. 

I guess this is a major problem of 'woke culture' - the desire of death to your political opponents seems to be an inherently "progressive" position nowadays.  Which is absolutely sickening.
Alterna

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Index: 2459
Original: A Man's class depends on his job or inheritance. 
A Woman's class depends on the Beta Cucks she Dates. 
That's the truth about female nature. I don't know about All men, but every female I've ever seen has left her friends and relatives to upgrade her "Class" and "Standards" which derived in the first place because of some Rich Guy .  
Men are such simple minded humans and don't always "Choose" friends based on discrimination. Females are the exact opposite. 
The Guy above seriously needs to tell her to FUCK Off!! The class she's talking about changing in him won't even matter when she'll monkey branch on some other dude. It's his money, time and standard and he can't even have conversations with real interesting people other than the shallow WHORES? That's why MGTOW  forever guys.
Alternative: : A Man's class depends on his job or inheritance. A Woman's class depends on the Beta Cucks she Dates. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [2]:
# Install required libraries if not already installed
# !pip install transformers torch datasets scikit-learn shap

import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy.special import softmax
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix

# --- 1. Data Preparation ---
file_path = "/content/final_labels.csv"  # Update with your actual file path
df = pd.read_csv(file_path)

# If the CSV does not have 'toxicity_level' but has 'level_1', rename it.
if 'toxicity_level' not in df.columns and 'level_1' in df.columns:
    df = df.rename(columns={'level_1': 'toxicity_level'})

# Filter to required columns and drop missing values
df = df[['body', 'toxicity_level', 'split']].dropna()

# Encode the toxicity labels (available labels: ['Misogynistic', 'Nonmisogynistic'])
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['toxicity_level'])

# Print available labels to verify
print("Available labels:", list(label_encoder.classes_))
# Set the target label to one of the available labels (case-sensitive)
target_label = "Misogynistic"  # Use the exact available label

if target_label not in label_encoder.classes_:
    raise ValueError(f"The label '{target_label}' is not present in your dataset labels.")

# Determine the index of the target label
target_index = list(label_encoder.classes_).index(target_label)

# Split into training and testing datasets
train_df = df[df['split'] == 'train']
test_df = df[df['split'] == 'test']

# --- 2. Custom Dataset Class ---
class ToxicityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),   # Remove extra dimensions
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# --- 3. Model and Tokenizer Setup ---
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(label_encoder.classes_)  # e.g., 2 for Misogynistic and Nonmisogynistic
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Create dataset objects
train_dataset = ToxicityDataset(train_df["body"], train_df["label"], tokenizer)
test_dataset = ToxicityDataset(test_df["body"], test_df["label"], tokenizer)

# --- 4. Metrics Calculation ---
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Compute basic metrics: accuracy, precision, recall, and f1
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # Compute probabilities for the target class using softmax
    probabilities = softmax(pred.predictions, axis=1)[:, target_index]
    # Create binary labels: 1 if the true label equals target_index, else 0
    binary_labels = (labels == target_index).astype(int)

    try:
        roc_auc = roc_auc_score(binary_labels, probabilities)
    except ValueError:
        roc_auc = 0.0
    try:
        pr_auc = average_precision_score(binary_labels, probabilities)
    except ValueError:
        pr_auc = 0.0

    # Compute confusion matrix for binary classification
    # True: label == target_index, False otherwise
    cm = confusion_matrix(binary_labels, (preds == target_index).astype(int))
    if cm.shape == (2, 2):
        TN, FP, FN, TP = cm.ravel()
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0.0
        fnr = FN / (FN + TP) if (FN + TP) > 0 else 0.0
    else:
        fpr = 0.0
        fnr = 0.0

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "pr_auc": pr_auc,
        "fpr": fpr,
        "fnr": fnr
    }

# --- 5. Training Arguments and Trainer ---
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# --- 6. Fine-Tune the Classifier ---
trainer.train()

# Evaluate the model on the test set
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)
# Use .loc to avoid SettingWithCopyWarning
test_df.loc[:, 'predicted_label'] = pred_labels

# --- 7. Detect Toxic/Harmful Comments ---
try:
    toxic_label_index = list(label_encoder.classes_).index(target_label)
except ValueError:
    raise ValueError(f"The label '{target_label}' is not present in your dataset labels.")

toxic_comments_df = test_df[test_df['predicted_label'] == toxic_label_index]

print("Detected Toxic/Harmful Comments:")
for idx, row in toxic_comments_df.iterrows():
    print(f"Index: {idx}")
    print(row['body'])
    print("------")


Available labels: ['Misogynistic', 'Nonmisogynistic']


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc,Pr Auc,Fpr,Fnr
1,0.2497,0.334153,0.900846,0.811523,0.900846,0.853854,0.690686,0.202113,0.0,1.0
2,0.3025,0.345108,0.900846,0.811523,0.900846,0.853854,0.40321,0.084406,0.0,1.0
3,0.3864,0.34118,0.900846,0.811523,0.900846,0.853854,0.400425,0.08466,0.0,1.0


Detected Toxic/Harmful Comments:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:, 'predicted_label'] = pred_labels
