# Libraries

In [None]:
# Standard Libraries
import re
import statistics as stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Deep Learning and Data Handling
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Data Augmentation (will be used later)
# We can use nlpaug, but for now, let's focus on a simple synonym replacement as an example.
from nltk.corpus import wordnet

# Check for GPU (query TensorFlow once and reuse the result for the device flag)
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource that newer NLTK releases look up in
# word_tokenize; 'punkt' is kept for older releases. With the default
# arguments a resource that cannot be fetched is reported, not raised.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Define device ('cuda' here only means "TensorFlow sees at least one GPU")
device = 'cuda' if gpu_devices else 'cpu'
print(f"Using device: {device}")

# Dataset input

In [None]:
# Load the dataset
file_path = "/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv"
data = pd.read_csv(file_path)

# Display initial information
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()

# Data pre-processing

In [None]:
# Discard the columns that are not used further and expose 'class' as 'label'
data_cleaned = (
    data
    .drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])
    .rename(columns={'class': 'label'})
)

# Human-readable name for every numeric label
label_mapping = {0: 'Hate Speech', 1: 'Offensive Language', 2: 'Neither'}
data_cleaned['label_desc'] = data_cleaned['label'].map(label_mapping)

# Quick look at the result and at how many rows each class has
print("\nCleaned Data Head:")
print(data_cleaned.head())
print("\nValue Counts for labels:")
print(data_cleaned['label_desc'].value_counts())

In [None]:
# Bar chart of the number of samples per class
class_counts = data_cleaned['label_desc'].value_counts()

plt.figure(figsize=(8, 6))
class_counts.plot(kind='bar', color=['skyblue', 'salmon', 'lightgreen'])
plt.title('Class Distribution')
plt.xlabel('Class')
plt.ylabel('Number of Samples')
plt.xticks(rotation=0)  # keep the class names horizontal
plt.show()

In [None]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw tweet into a space-separated string of content tokens.

    Steps, in order: strip URLs, strip @mentions, strip punctuation, digits
    and underscores, lower-case, tokenise with NLTK, then drop English
    stopwords and one-character tokens.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example a
            missing value coming from pandas) is treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; may be empty.
    """
    if not isinstance(text, str):
        return ""

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove special characters, numbers and punctuation, but keep spaces.
    # \w also matches digits and '_', so those are listed explicitly;
    # [^\w\s] on its own would leave them in the text.
    text = re.sub(r'[^\w\s]|[\d_]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize and remove stopwords / single-character leftovers
    tokens = word_tokenize(text)
    return " ".join(
        token for token in tokens
        if token not in stop_words and len(token) > 1
    )

# Overwrite the raw tweets with their cleaned versions
cleaned_tweets = data_cleaned['tweet'].apply(clean_text)
data_cleaned['tweet'] = cleaned_tweets

print("\nData after cleaning:")
print(data_cleaned.head())

In [None]:
def get_synonyms(word):
    """Return the WordNet synonyms of ``word``.

    Every lemma name of every synset found for ``word`` is collected,
    underscores are turned into spaces and the result is lower-cased.

    Args:
        word (str): The word to look up.

    Returns:
        list: Unique synonyms in sorted order, never containing ``word``
        itself (compared case-insensitively). Empty when WordNet has no
        entry for the word.
    """
    synonyms = {
        lemma.name().replace("_", " ").lower()
        for syn in wordnet.synsets(word)
        for lemma in syn.lemmas()
    }
    # Candidates are lower-cased, so the word must be too, or a capitalised
    # input would be offered as a synonym of itself.
    synonyms.discard(word.lower())
    # Sorted so the order is the same on every run; the iteration order of a
    # set of strings can differ between interpreter runs, which would make
    # seeded random picks from this list irreproducible.
    return sorted(synonyms)

def augment_data(text, num_augmentations=1):
    """
    Augments a text by replacing one randomly chosen word with a synonym.

    For every augmented sample the word positions are visited in random
    order and the first word that has at least one synonym is replaced by a
    randomly chosen one. A sample is an unchanged copy of ``text`` only when
    no word in it has any synonym.

    Args:
        text (str): The input text (words separated by whitespace).
        num_augmentations (int): The number of new augmented samples to create.

    Returns:
        list: ``num_augmentations`` augmented texts, or an empty list when
        ``text`` contains no words.
    """
    words = text.split()
    if not words:
        return []

    # Each distinct word is looked up once, however many samples are built.
    synonym_cache = {}
    augmented_texts = []

    for _ in range(num_augmentations):
        new_words = list(words)

        # Trying the positions in random order (rather than a single random
        # position) avoids giving up when the first pick has no synonyms.
        for position in np.random.permutation(len(new_words)):
            candidate = new_words[position]
            if candidate not in synonym_cache:
                synonym_cache[candidate] = get_synonyms(candidate)
            synonyms = synonym_cache[candidate]
            if synonyms:
                # np.random.choice returns a numpy string; keep plain str
                new_words[position] = str(np.random.choice(synonyms))
                break

        augmented_texts.append(" ".join(new_words))

    return augmented_texts

# Rows of the minority class (label 0)
hate_speech_data = data_cleaned[data_cleaned['label'] == 0].copy()

# Three synonym-replaced variants for every tweet of that class
augmented_samples = [
    {'tweet': aug_text, 'label': 0, 'label_desc': 'Hate Speech'}
    for original_tweet in hate_speech_data['tweet']
    for aug_text in augment_data(original_tweet, num_augmentations=3)
]

# Append the new rows to the cleaned data
augmented_df = pd.DataFrame(augmented_samples)
data_augmented = pd.concat([data_cleaned, augmented_df], ignore_index=True)

# Show how the class sizes changed
print("\nClass Distribution after Augmentation:")
print(data_augmented['label_desc'].value_counts())

# Balancing the dataset

In [None]:
# One independent dataframe per class label
label_column = data_augmented['label']
df_hate_speech = data_augmented[label_column == 0].copy()
df_offensive_lang = data_augmented[label_column == 1].copy()
df_neither = data_augmented[label_column == 2].copy()

# Every class is brought to the size of the "Offensive Language" frame,
# which is taken to be the majority class
target_count = df_offensive_lang.shape[0]

print(f"Target count for each class: {target_count}")

# Function to augment a class to a target count
def augment_to_target(df_class, target):
    """Resize one class to exactly ``target`` rows.

    A class that already has at least ``target`` rows is down-sampled without
    replacement. A smaller class is kept in full and topped up with
    synonym-augmented copies of rows drawn from it with replacement.

    Args:
        df_class (pd.DataFrame): Rows of a single class with the columns
            'tweet', 'label' and 'label_desc'.
        target (int): Number of rows the result must have.

    Returns:
        pd.DataFrame: ``target`` rows. When the class was topped up, the
        index is reset.

    Raises:
        ValueError: If rows must be added but no tweet of the class contains
            any word to augment.
    """
    current_count = len(df_class)
    if current_count >= target:
        return df_class.sample(n=target, replace=False, random_state=42)

    # augment_data returns an empty list for text without words, so indexing
    # its result with [0] would raise IndexError for such rows. Only tweets
    # that contain at least one word are eligible as augmentation sources.
    has_words = df_class['tweet'].map(
        lambda tweet: isinstance(tweet, str) and bool(tweet.split())
    ).astype(bool)
    candidates = df_class[has_words]
    if candidates.empty:
        raise ValueError(
            "Cannot augment class to target size: no tweet with at least one word."
        )

    # Calculate how many new samples are needed and draw their source rows
    samples_to_add = target - current_count
    samples_to_augment = candidates.sample(n=samples_to_add, replace=True, random_state=42)

    augmented_samples = [
        {
            # Using the previously defined augment_data function
            'tweet': augment_data(tweet, num_augmentations=1)[0],
            'label': label,
            'label_desc': label_desc,
        }
        for tweet, label, label_desc in zip(
            samples_to_augment['tweet'],
            samples_to_augment['label'],
            samples_to_augment['label_desc'],
        )
    ]

    # Concatenate the original and augmented data
    augmented_df = pd.DataFrame(augmented_samples)
    return pd.concat([df_class, augmented_df], ignore_index=True)

# Bring the two smaller classes up to the target size
balanced_hate_speech = augment_to_target(df_hate_speech, target_count)
balanced_neither = augment_to_target(df_neither, target_count)

# Stack the three classes, then shuffle the rows and renumber the index
class_frames = [df_offensive_lang, balanced_hate_speech, balanced_neither]
data_balanced = (
    pd.concat(class_frames, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("\nFinal Class Distribution after Strategic Augmentation:")
balanced_counts = data_balanced['label_desc'].value_counts()
print(balanced_counts)

# Bar chart of the balanced class sizes
plt.figure(figsize=(8, 6))
data_balanced['label_desc'].value_counts().plot(
    kind='bar',
    color=['skyblue', 'salmon', 'lightgreen'],
)
plt.title('Class Distribution after Strategic Oversampling')
plt.xlabel('Class')
plt.ylabel('Number of Samples')
plt.xticks(rotation=0)
plt.show()

# Overall size of the balanced dataset
print(f"\nTotal samples in balanced dataset: {len(data_balanced)}")

# Model building

In [None]:
# Install the transformers library if you haven't already
!pip install transformers

# Import necessary libraries
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 80/20 split of the balanced data; stratifying on the label keeps the class
# proportions the same in both parts
tweets = data_balanced['tweet']
targets = data_balanced['label']
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

# One-hot versions of the integer labels (three classes)
y_train_encoded, y_test_encoded = (
    to_categorical(split_labels, num_classes=3)
    for split_labels in (y_train, y_test)
)
max_seq_len = 128

# Dataset splitting

In [None]:
def tokenize_data(texts, tokenizer, max_len=128):
    """Encode texts one by one and stack the results into two TensorFlow tensors.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``encode_plus`` (a BERT tokenizer in this
            notebook).
        max_len (int): Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks)``, each the per-text encodings
        concatenated along axis 0.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,      # wrap with '[CLS]' / '[SEP]'
            max_length=max_len,           # common sequence length
            padding='max_length',         # pad shorter sequences up to it
            return_attention_mask=True,   # also return the padding mask
            return_tensors='tf',          # TensorFlow tensors
            truncation=True               # cut longer sequences down to it
        )
        for text in texts
    ]

    # Presumably each encoding has a leading batch axis of size 1, so
    # concatenating along axis 0 yields one row per text.
    input_ids = tf.concat([enc['input_ids'] for enc in encodings], axis=0)
    attention_masks = tf.concat([enc['attention_mask'] for enc in encodings], axis=0)

    return input_ids, attention_masks

# Encode both splits with the shared tokenizer and sequence length
X_train_input_ids, X_train_attention_masks = tokenize_data(X_train.tolist(), tokenizer, max_seq_len)
X_test_input_ids, X_test_attention_masks = tokenize_data(X_test.tolist(), tokenizer, max_seq_len)

# Training pipeline: (features dict, one-hot label) pairs, shuffled with a
# 100-element buffer and grouped into batches of 32
train_features = {'input_ids': X_train_input_ids, 'attention_mask': X_train_attention_masks}
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train_encoded))
train_dataset = train_dataset.shuffle(100).batch(32)

# Validation pipeline: same structure, original order, batches of 32
val_features = {'input_ids': X_test_input_ids, 'attention_mask': X_test_attention_masks}
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, y_test_encoded))
val_dataset = val_dataset.batch(32)

# Load BERT model

In [None]:
from transformers import BertForSequenceClassification
from torch.optim import AdamW

# PyTorch BERT with a classification head for the three classes
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# AdamW over all model parameters
optimizer = AdamW(params=model.parameters(), lr=3e-5)

print("BERT PyTorch model loaded successfully.")

# Tokenizing

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer

# Lower-casing tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

# Function to tokenize and prepare data for BERT
def prepare_data(texts, labels, tokenizer, max_len=128):
    """Encode texts for BERT and return them with their labels as PyTorch tensors.

    Args:
        texts: Iterable of strings (for example a pandas Series or a list).
        labels: Integer class ids, one per text. A pandas Series, a numpy
            array or a plain sequence are all accepted.
        tokenizer: Object providing ``encode_plus`` (a BERT tokenizer in this
            notebook).
        max_len (int): Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_masks, labels)``. The first two are
        the per-text encodings concatenated along dim 0; ``labels`` is a
        ``torch.long`` tensor.

    Raises:
        ValueError: If the number of labels differs from the number of texts.
    """
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # np.asarray handles Series, ndarray and list alike; `.values` exists only
    # on pandas objects and raised AttributeError for the other two.
    label_array = np.asarray(labels)
    if label_array.shape[:1] != (input_ids.shape[0],):
        raise ValueError(
            f"Got {input_ids.shape[0]} texts but labels of shape {label_array.shape}."
        )
    # Class ids are forced to long, the integer type cross-entropy expects.
    labels = torch.tensor(label_array, dtype=torch.long)

    return input_ids, attention_masks, labels

# Encode the train/test split produced earlier (X_train, y_train, X_test, y_test)
train_input_ids, train_attention_masks, train_labels = prepare_data(X_train, y_train, tokenizer)
test_input_ids, test_attention_masks, test_labels = prepare_data(X_test, y_test, tokenizer)

# Bundle ids, masks and labels of each split into a PyTorch dataset
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Batched loaders: random order for training, fixed order for validation
batch_size = 32
train_sampler = RandomSampler(train_dataset)
validation_sampler = SequentialSampler(test_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
validation_dataloader = DataLoader(test_dataset, sampler=validation_sampler, batch_size=batch_size)

print("DataLoaders prepared.")

# Model training

In [None]:
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

# Training parameters
epochs = 10   # Upper bound; early stopping may end training sooner
patience = 3  # Epochs without validation-loss improvement before stopping
best_val_loss = float("inf")
patience_counter = 0
best_model_path = "best_model.pt"
# Predictions/labels of the best epoch, so that reports produced after
# training describe the checkpoint that is actually kept.
best_preds = None
best_labels = None

total_steps = len(train_dataloader) * epochs
optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# Not used by the loop below (the model returns its own loss when it is given
# labels); kept because the name is defined at module level.
loss_fn = CrossEntropyLoss()

# Trackers for plotting
train_losses = []
val_losses = []
val_accuracies = []

# NOTE(review): validation_dataloader is built from the test split, so early
# stopping selects the model on the same data the final report uses; the
# reported metrics are therefore optimistic. A separate validation split
# would fix this but needs changes outside this cell.

# Training loop with Early Stopping
for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
    print("Training...")

    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        if step % 50 == 0 and step != 0:
            avg_train_loss = total_train_loss / (step + 1)
            print(f"  Batch {step} of {len(train_dataloader)}. Loss: {avg_train_loss:.4f}")

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"\n  Average training loss: {avg_train_loss:.4f}")

    # Validation
    print("\nRunning Validation...")
    model.eval()

    total_eval_loss = 0.0
    correct_predictions = 0
    all_preds = []
    all_labels = []

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels = b_labels.cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

        # Accumulate per sample instead of averaging per-batch averages: the
        # last batch is usually smaller and would otherwise be over-weighted.
        # (Assumes outputs.loss is the mean over the batch - TODO confirm.)
        total_eval_loss += loss.item() * len(labels)
        correct_predictions += int(np.sum(preds == labels))

    num_val_samples = len(all_labels)
    avg_val_loss = total_eval_loss / num_val_samples
    avg_val_accuracy = correct_predictions / num_val_samples

    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_accuracy)

    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Validation Accuracy: {avg_val_accuracy:.4f}")

    # ---- Early Stopping Logic ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), best_model_path)  # Save best model
        best_preds = list(all_preds)
        best_labels = list(all_labels)
        print("  Validation loss improved, model saved!")
    else:
        patience_counter += 1
        print(f"  No improvement in validation loss. Patience counter: {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("\nEarly stopping triggered! Training stopped.")
            break

# Restore the checkpoint with the lowest validation loss. Without this the
# model (and all_preds / all_labels) would reflect the last epoch, which is
# up to `patience` epochs past the best one.
if best_preds is not None:
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    all_preds, all_labels = best_preds, best_labels

print("\nTraining complete!")

# Report generation

In [None]:
# Classification report
# The class order is pinned explicitly so the report rows, the confusion
# matrix and the tick labels always agree, even when a class is missing from
# the labels or the predictions. Names come from label_mapping.
class_ids = sorted(label_mapping)
class_names = [label_mapping[class_id] for class_id in class_ids]

print("\nClassification Report:")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names))

# Confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Plot Loss & Accuracy curves
epochs_range = range(1, len(train_losses) + 1)  # Use actual completed epochs

plt.figure(figsize=(12,5))

# Left panel: training vs. validation loss per epoch
plt.subplot(1,2,1)
plt.plot(epochs_range, train_losses, label='Training Loss')
plt.plot(epochs_range, val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss Curve')
plt.legend()

# Right panel: validation accuracy per epoch
plt.subplot(1,2,2)
plt.plot(epochs_range, val_accuracies, label='Validation Accuracy', color='green')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Validation Accuracy')
plt.legend()

plt.show()
