In [21]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
import re
import os
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
EPOCHS = 1
Data_dir="./jigsaw-dataset"
WORK_DIR = "../working/"
num_to_load=1000000                         #Train size to match time limit
valid_size= 100000                          #Validation Size
TOXICITY_COLUMN = 'target'

In [4]:
def convert_lines(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    for text in tqdm_notebook(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [16]:
# Load the training data
train = pd.read_csv(os.path.join(Data_dir, "train.csv"))

# Clean the text data
train['comment_text'] = train['comment_text'].replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\n', ' ', regex=True)

# Add class labels (binary: 0 for non-toxic, 1 for toxic)
train['label'] = np.where(train['target'] >= 0.5, 1, 0)

# Retain only necessary columns
train = train[['id', 'comment_text', 'label']]

# Sample a 10% subset for demonstration
train_subset = train.sample(frac=0.1, random_state=42)

# Split into training and validation sets (80/20 split)
train_data, val_data = train_test_split(train_subset, test_size=0.2, stratify=train_subset['label'], random_state=42)

# Print data stats
print(f"Subset Training set size: {len(train_data)}")
print(f"Subset Validation set size: {len(val_data)}")

# Save the sampled training and validation sets
train_data.to_csv("train_subset.csv", index=False)
val_data.to_csv("val_subset.csv", index=False)

Subset Training set size: 144389
Subset Validation set size: 36098


In [18]:
# Drop rows with null comment_text
train_data = train_data.dropna(subset=['comment_text'])
val_data = val_data.dropna(subset=['comment_text'])

# Ensure all values are strings
train_data['comment_text'] = train_data['comment_text'].astype(str)
val_data['comment_text'] = val_data['comment_text'].astype(str)

# Check data again
print("Number of null values after cleaning:")
print("Train set:", train_data['comment_text'].isnull().sum())
print("Validation set:", val_data['comment_text'].isnull().sum())



Number of null values after cleaning:
Train set: 0
Validation set: 0


In [19]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize data function
def tokenize_data(data, text_column, label_column):
    encodings = tokenizer(
        list(data[text_column]),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    labels = data[label_column].values
    return encodings, labels

# Tokenize training and validation data
train_encodings, train_labels = tokenize_data(train_data, "comment_text", "label")
val_encodings, val_labels = tokenize_data(val_data, "comment_text", "label")


In [25]:
import torch
from torch.utils.data import Dataset, DataLoader

class ToxicCommentsDataset(Dataset):
    def __init__(self, ids, encodings, labels):
        self.ids = ids  # Store ids for reference
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Only return id for reference, not for the model
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # Add labels for the model
        item['id'] = self.ids[idx]  # Keep 'id' for external use only
        return item


# Create datasets
train_dataset = ToxicCommentsDataset(train_data['id'].values, train_encodings, train_labels)
val_dataset = ToxicCommentsDataset(val_data['id'].values, val_encodings, val_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [29]:
from transformers import BertForSequenceClassification, AdamW, get_scheduler
from tqdm import tqdm

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
num_training_steps = len(train_loader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
epochs = 1
model.train()

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    loop = tqdm(train_loader, desc="Training", leave=True)
    for batch in loop:
        # Move inputs to the device
        batch = {k: v.to(device) for k, v in batch.items() if k != "id"}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        # Update tqdm with the current loss
        loop.set_postfix(loss=loss.item())



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/1


Training: 100%|██████████| 9025/9025 [51:58<00:00,  2.89it/s, loss=0.811]  


In [30]:
from tqdm import tqdm
from sklearn.metrics import classification_report

# Evaluation loop
model.eval()
val_predictions, val_true_labels, val_ids = [], [], []  # Include val_ids to track original IDs

with torch.no_grad():
    loop = tqdm(val_loader, desc="Evaluating", leave=True)
    for batch in loop:
        # Move inputs to the device, exclude 'id'
        batch_inputs = {k: v.to(device) for k, v in batch.items() if k != "id"}
        outputs = model(**batch_inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        # Collect predictions, true labels, and IDs
        val_predictions.extend(preds.cpu().numpy())
        val_true_labels.extend(batch['labels'].cpu().numpy())
        val_ids.extend(batch['id'].cpu().numpy())  # Track the original IDs

# Generate classification report
print(classification_report(val_true_labels, val_predictions, target_names=["Non-Toxic", "Toxic"]))

# Add predictions and true labels to the validation DataFrame for analysis
val_data['id'] = val_ids
val_data['true_label'] = val_true_labels
val_data['predicted_label'] = val_predictions
val_data['predicted_label'] = val_data['predicted_label'].map({0: "non-toxic", 1: "toxic"})

# Save validation results
val_data.to_csv("val_with_predictions.csv", index=False)
print("Validation predictions saved to 'val_with_predictions.csv'.")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Evaluating: 100%|██████████| 2257/2257 [04:38<00:00,  8.10it/s]


              precision    recall  f1-score   support

   Non-Toxic       0.96      0.98      0.97     33207
       Toxic       0.73      0.55      0.63      2891

    accuracy                           0.95     36098
   macro avg       0.85      0.77      0.80     36098
weighted avg       0.94      0.95      0.94     36098

Validation predictions saved to 'val_with_predictions.csv'.


In [31]:
# Load original dataset
original_data = pd.read_csv(os.path.join(Data_dir, "train.csv"))

# Load validation predictions
val_predictions = pd.read_csv("val_with_predictions.csv")

In [33]:
# Merge validation predictions with the original dataset
merged_data = val_predictions.merge(original_data, on="id", how="left")

In [34]:
# Define subgroup columns
subgroups = [
    "asian", "atheist", "bisexual", "black", "buddhist", "christian",
    "female", "heterosexual", "hindu", "homosexual_gay_or_lesbian",
    "intellectual_or_learning_disability", "jewish", "latino", "male",
    "muslim", "other_disability", "other_gender", "other_race_or_ethnicity",
    "other_religion", "other_sexual_orientation", "physical_disability",
    "psychiatric_or_mental_illness", "transgender", "white"
]


In [38]:
# Initialize dictionary to store results
subgroup_accuracies = {}

# Calculate accuracy for each subgroup
for subgroup in subgroups:
    # Filter rows where the subgroup column is 1 (indicating membership in the subgroup)
    subgroup_data = merged_data[merged_data[subgroup] == 1]
    
    if not subgroup_data.empty:
        # Calculate accuracy: (correct predictions / total predictions for the subgroup)
        correct_predictions = (subgroup_data['true_label'] == subgroup_data['predicted_label']).sum()
        total_predictions = len(subgroup_data)
        accuracy = correct_predictions / total_predictions
        subgroup_accuracies[subgroup] = {
            "accuracy": accuracy,
            "support": total_predictions
        }
    else:
        # Handle cases where there are no members in the subgroup
        subgroup_accuracies[subgroup] = {
            "accuracy": None,
            "support": 0
        }

# Convert results to a DataFrame for better visualization
subgroup_accuracies_df = pd.DataFrame.from_dict(subgroup_accuracies, orient="index")
subgroup_accuracies_df.reset_index(inplace=True)
subgroup_accuracies_df.columns = ["Subgroup", "Accuracy", "Support"]

# Sort the DataFrame by accuracy
subgroup_accuracies_df = subgroup_accuracies_df.sort_values(by="Accuracy", ascending=False)

# Display the results
print(subgroup_accuracies_df)

# Save the results to a CSV file
subgroup_accuracies_df.to_csv("subgroup_accuracies.csv", index=False)
print("Subgroup accuracies saved to 'subgroup_accuracies.csv'.")

                               Subgroup  Accuracy  Support
0                                 asian       0.0       19
1                               atheist       0.0        6
2                              bisexual       0.0        1
3                                 black       0.0      171
4                              buddhist       0.0        1
5                             christian       0.0      335
6                                female       0.0      593
7                          heterosexual       0.0        7
8                                 hindu       0.0        3
9             homosexual_gay_or_lesbian       0.0      100
11                               jewish       0.0       74
12                               latino       0.0       15
13                                 male       0.0      413
14                               muslim       0.0      266
18                       other_religion       0.0        2
21        psychiatric_or_mental_illness       0.0       

In [41]:
# Check the unique values of true_label and predicted_label
print("Unique values in true_label:", merged_data['true_label'].unique())
print("Unique values in predicted_label:", merged_data['predicted_label'].unique())

Unique values in true_label: [0 1]
Unique values in predicted_label: ['non-toxic' 'toxic']


In [42]:
# Ensure labels are numeric
merged_data['true_label'] = merged_data['true_label'].map({"non-toxic": 0, "toxic": 1})
merged_data['predicted_label'] = merged_data['predicted_label'].map({"non-toxic": 0, "toxic": 1})

In [43]:
# Check the counts of members in each subgroup
subgroup_counts = merged_data[subgroups].sum()
print(subgroup_counts)


asian                                    80.784453
atheist                                  19.800560
bisexual                                 14.468329
black                                   300.233333
buddhist                                  9.066667
christian                               766.314440
female                                 1047.201103
heterosexual                             29.401111
hindu                                    13.383893
homosexual_gay_or_lesbian               212.602765
intellectual_or_learning_disability       8.571671
jewish                                  143.715631
latino                                   51.514194
male                                    939.515413
muslim                                  407.021212
other_disability                          8.553876
other_gender                              6.668338
other_race_or_ethnicity                  71.263584
other_religion                           57.530384
other_sexual_orientation       

In [44]:
# Debug accuracy for the 'asian' subgroup
asian_data = merged_data[merged_data['asian'] == 1]
print("Number of samples in 'asian' subgroup:", len(asian_data))
print("Correctly predicted samples:", (asian_data['true_label'] == asian_data['predicted_label']).sum())


Number of samples in 'asian' subgroup: 19
Correctly predicted samples: 0


In [48]:
# Recalculate subgroup accuracies
subgroup_accuracies = {}

for subgroup in subgroups:
    subgroup_data = merged_data[merged_data[subgroup] == 1]
    if not subgroup_data.empty:
        correct_predictions = (subgroup_data['label'] == subgroup_data['predicted_label']).sum()
        total_predictions = len(subgroup_data)
        accuracy = correct_predictions / total_predictions
        subgroup_accuracies[subgroup] = {
            "accuracy": accuracy,
            "support": total_predictions
        }
    else:
        subgroup_accuracies[subgroup] = {
            "accuracy": None,
            "support": 0
        }

# Convert to DataFrame
subgroup_accuracies_df = pd.DataFrame.from_dict(subgroup_accuracies, orient="index")
subgroup_accuracies_df.reset_index(inplace=True)
subgroup_accuracies_df.columns = ["Subgroup", "Accuracy", "Support"]
print(subgroup_accuracies_df)


                               Subgroup  Accuracy  Support
0                                 asian  0.789474       19
1                               atheist  0.833333        6
2                              bisexual  1.000000        1
3                                 black  0.725146      171
4                              buddhist  1.000000        1
5                             christian  0.901493      335
6                                female  0.917369      593
7                          heterosexual  0.857143        7
8                                 hindu  1.000000        3
9             homosexual_gay_or_lesbian  0.770000      100
10  intellectual_or_learning_disability       NaN        0
11                               jewish  0.878378       74
12                               latino  0.933333       15
13                                 male  0.907990      413
14                               muslim  0.830827      266
15                     other_disability       NaN       

In [50]:
# Initialize a dictionary to store subgroup FPRs
subgroup_fpr = {}

for subgroup in subgroups:
    # Filter rows where the subgroup column is 1
    subgroup_data = merged_data[merged_data[subgroup] == 1]
    
    if not subgroup_data.empty:
        # Calculate False Positives (FP) and True Negatives (TN)
        fp = ((subgroup_data['label'] == 0) & (subgroup_data['predicted_label'] == 1)).sum()
        tn = ((subgroup_data['label'] == 0) & (subgroup_data['predicted_label'] == 0)).sum()
        
        # Calculate FPR: FP / (FP + TN)
        actual_negatives = fp + tn
        fpr = fp / actual_negatives if actual_negatives > 0 else None
        subgroup_fpr[subgroup] = {
            "FPR": fpr,
            "Support": actual_negatives
        }
    else:
        # Handle cases where there are no members of the subgroup
        subgroup_fpr[subgroup] = {
            "FPR": None,
            "Support": 0
        }

# Convert to DataFrame for visualization
subgroup_fpr_df = pd.DataFrame.from_dict(subgroup_fpr, orient="index")
subgroup_fpr_df.reset_index(inplace=True)
subgroup_fpr_df.columns = ["Subgroup", "FPR", "Support"]

# Sort by FPR
subgroup_fpr_df = subgroup_fpr_df.sort_values(by="FPR", ascending=False)

# Display the results
print(subgroup_fpr_df)

# Save results to a CSV
subgroup_fpr_df.to_csv("subgroup_fpr.csv", index=False)
print("Subgroup FPR saved to 'subgroup_fpr.csv'.")


                               Subgroup       FPR  Support
7                          heterosexual  0.166667        6
3                                 black  0.121495      107
22                          transgender  0.066667       15
9             homosexual_gay_or_lesbian  0.054054       74
23                                white  0.042105      190
21        psychiatric_or_mental_illness  0.035714       28
13                                 male  0.031977      344
5                             christian  0.026936      297
14                               muslim  0.019608      204
6                                female  0.017308      520
1                               atheist  0.000000        5
0                                 asian  0.000000       15
2                              bisexual  0.000000        1
4                              buddhist  0.000000        1
8                                 hindu  0.000000        3
12                               latino  0.000000       

In [51]:
import matplotlib.pyplot as plt

# Plot subgroup FPRs
plt.figure(figsize=(12, 8))
plt.bar(subgroup_fpr_df['Subgroup'], subgroup_fpr_df['FPR'], color='lightcoral')
plt.xlabel('Subgroup', fontsize=14)
plt.ylabel('False Positive Rate (FPR)', fontsize=14)
plt.title('Subgroup False Positive Rates', fontsize=16)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'matplotlib'

In [52]:
# Directory to save the model
output_dir = "bert_model_checkpoint"

# Save the model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to '{output_dir}'")

Model and tokenizer saved to 'bert_model_checkpoint'
