In [None]:
pip install torch transformers datasets textattack textblob matplotlib scikit-learn
from datasets import load_dataset
imdb_reviews = load_dataset("imdb")
train_reviews, train_labels = imdb_reviews['train']['text'], imdb_reviews['train']['label']
test_reviews, test_labels = imdb_reviews['test']['text'], imdb_reviews['test']['label']
print(f"Train reviews from IMDB: {len(train_reviews)}")
print(f"Test reviews from IMDB: {len(test_reviews)}")
from transformers import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
import torch
from transformers import BertForSequenceClassification, pipeline
device = 0 if torch.cuda.is_available() else -1  # 0 = GPU, -1 = CPU

model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-imdb")
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=bert_tokenizer, device=device)

In [1]:
# Perform batched inference
from torch.utils.data import DataLoader, TensorDataset

batch_size = 32
tokenized_data = preprocess_texts(test_reviews, bert_tokenizer)

# Create DataLoader for batched processing
test_dataset = TensorDataset(
    tokenized_data['input_ids'], tokenized_data['attention_mask']
)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Evaluate the model
from tqdm import tqdm
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

all_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask = [b.to("cuda" if torch.cuda.is_available() else "cpu") for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        all_preds.extend(preds)

# Convert predictions to labels
all_preds_labels = ["POSITIVE" if pred == 1 else "NEGATIVE" for pred in all_preds]

In [None]:
from sklearn.metrics import accuracy_score
clean_accuracy = accuracy_score(test_labels, all_preds)
print(f"Clean data accuracy: {clean_accuracy * 100:.2f}%")

In [None]:
## DEFENSE

In [2]:
!pip install autocorrect

import re
import random
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification

# Ensure NLTK resources are downloaded
import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from autocorrect import Speller

Defaulting to user installation because normal site-packages is not writeable


2025-01-31 11:27:17.908565: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-31 11:27:18.051285: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-31 11:27:20.010865: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2025-01-31 11:27:20.010981

In [4]:
import os
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

# Function to load data
def load_data(directory):
    data = []
    labels = []
    for label_type in ['pos', 'neg']:
        dir_path = os.path.join(directory, label_type)
        for file in os.listdir(dir_path):
            with open(os.path.join(dir_path, file), 'r', encoding='utf-8') as f:
                data.append(f.read())
            labels.append(1 if label_type == 'pos' else 0)  # Positive: 1, Negative: 0
    return data, labels

# Load training and test data
train_dir = '/home/jovyan/DL/aclImdb/train'
test_dir = '/home/jovyan/DL/aclImdb/test'
train_reviews, train_labels = load_data(train_dir)
test_reviews, test_labels = load_data(test_dir)

# Slice the data to use only 1500 samples
train_reviews = train_reviews[:1500]
train_labels = train_labels[:1500]
test_reviews = test_reviews[:1500]
test_labels = test_labels[:1500]
print(f"Train reviews from IMDB: {len(train_reviews)}")
print(f"Test reviews from IMDB: {len(test_reviews)}")

Train reviews from IMDB: 1500
Test reviews from IMDB: 1500


In [5]:
def clean_text(text):
    """
    Clean and sanitize text data.
    - Converts to lowercase
    - Removes URLs, punctuation, and extra spaces
    - Corrects spelling errors
    """
    spell = Speller(lang='en')
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = spell(text)  # Correct spelling
    return text

def preprocess_data(texts):
    """
    Apply text cleaning to the dataset.
    """
    return [clean_text(text) for text in tqdm(texts, desc="Cleaning Data")]

In [6]:
from tqdm import tqdm
# Preprocess the data
train_reviews = preprocess_data(train_reviews)
test_reviews = preprocess_data(test_reviews)


Cleaning Data: 100%|██████████| 1500/1500 [18:43<00:00,  1.34it/s]
Cleaning Data: 100%|██████████| 1500/1500 [19:17<00:00,  1.30it/s]


In [7]:
def generate_pgd_adversarial_examples(model, tokenizer, text, label, epsilon=0.1, alpha=0.02, num_iter=5):
    """
    Generate adversarial examples using PGD on embeddings.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.eval()

    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    label_tensor = torch.tensor([label]).to(device)

    # Extract embeddings
    embeddings = model.bert.embeddings.word_embeddings(inputs["input_ids"]).detach().clone()
    perturbed_embeddings = embeddings.clone().detach().requires_grad_(True)

    for _ in range(num_iter):
        # Forward pass with perturbed embeddings
        attention_mask = inputs["attention_mask"]
        outputs = model(inputs_embeds=perturbed_embeddings, attention_mask=attention_mask, labels=label_tensor)
        loss = outputs.loss

        # Backward pass
        model.zero_grad()
        loss.backward()

        # Gradient step
        grad_sign = perturbed_embeddings.grad.sign()
        perturbed_embeddings = perturbed_embeddings + alpha * grad_sign

        # Project back to the epsilon-ball
        perturbation = torch.clamp(perturbed_embeddings - embeddings, -epsilon, epsilon)
        perturbed_embeddings = torch.clamp(embeddings + perturbation, -1, 1).detach().requires_grad_(True)

    # Convert perturbed embeddings back to tokens
    with torch.no_grad():
        logits = model(inputs_embeds=perturbed_embeddings, attention_mask=attention_mask).logits
        perturbed_input_ids = torch.argmax(logits, dim=-1)

    perturbed_text = tokenizer.decode(perturbed_input_ids[0], skip_special_tokens=True)
    return perturbed_text


In [8]:
def adversarial_training_with_augmentation(model, tokenizer, train_texts, train_labels, num_epochs=3, epsilon=0.1, alpha=0.02, num_iter=5):
    """
    Train a model with adversarial training.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.train()

    # Optimizer and loss function
    optimizer = optim.AdamW(model.parameters(), lr=5e-5)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for text, label in tqdm(zip(train_texts, train_labels), total=len(train_texts), desc=f"Epoch {epoch+1}/{num_epochs}"):
            # Generate adversarial examples
            adversarial_text = generate_pgd_adversarial_examples(model, tokenizer, text, label, epsilon, alpha, num_iter)

            # Tokenize original and adversarial text
            inputs_original = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
            inputs_adversarial = tokenizer(adversarial_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
            label_tensor = torch.tensor([label]).to(device)

            # Forward pass with original and adversarial inputs
            outputs_original = model(**inputs_original)
            outputs_adversarial = model(**inputs_adversarial)

            # Compute loss
            loss_original = loss_fn(outputs_original.logits, label_tensor)
            loss_adversarial = loss_fn(outputs_adversarial.logits, label_tensor)

            loss = (loss_original + loss_adversarial) / 2  # Combine losses

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_texts):.4f}")

In [9]:
def evaluate_defense(model, tokenizer, test_texts, test_labels, epsilon=0.1, alpha=0.02, num_iter=5):
    """
    Evaluate the defense against adversarial examples.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.eval()

    clean_preds = []
    adv_preds = []
    attack_success_count = 0

    for text, label in tqdm(zip(test_texts, test_labels), total=len(test_texts), desc="Evaluating Defense"):
        # Clean prediction
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        logits = model(**inputs).logits
        clean_pred = torch.argmax(logits, dim=-1).item()
        clean_preds.append(clean_pred)

        # Generate adversarial example
        adv_text = generate_pgd_adversarial_examples(model, tokenizer, text, label, epsilon, alpha, num_iter)

        # Adversarial prediction
        adv_inputs = tokenizer(adv_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        adv_logits = model(**adv_inputs).logits
        adv_pred = torch.argmax(adv_logits, dim=-1).item()
        adv_preds.append(adv_pred)

        # Check if the attack was successful
        if clean_pred == label and adv_pred != label:
            attack_success_count += 1

    # Compute Metrics
    clean_accuracy = accuracy_score(test_labels, clean_preds) * 100
    adversarial_accuracy = accuracy_score(test_labels, adv_preds) * 100
    performance_drop = clean_accuracy - adversarial_accuracy
    attack_success_rate = (attack_success_count / len(test_texts)) * 100

    return clean_accuracy, adversarial_accuracy, performance_drop, attack_success_rate

In [10]:
model_name = "textattack/bert-base-uncased-SST-2"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [11]:
# Adversarial Training
adversarial_training_with_augmentation(model, tokenizer, train_reviews, train_labels, num_epochs=1, epsilon=0.1, alpha=0.02, num_iter=3)

# Evaluate Defense
clean_acc, adv_acc, perf_drop, attack_success_rate = evaluate_defense(
    model, tokenizer, test_reviews, test_labels, epsilon=0.1, alpha=0.02, num_iter=5
)

# Print Results
print("\n--- Defense Evaluation Metrics ---")
print(f"Clean Accuracy: {clean_acc:.2f}%")
print(f"Adversarial Accuracy: {adv_acc:.2f}%")
print(f"Performance Drop: {perf_drop:.2f}%")
print(f"Attack Success Rate: {attack_success_rate:.2f}%")

Epoch 1/1:   0%|          | 0/1500 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Epoch 1/1: 100%|██████████| 1500/1500 [07:22<00:00,  3.39it/s]


Epoch 1 Loss: 0.0013


Evaluating Defense: 100%|██████████| 1500/1500 [07:59<00:00,  3.13it/s]


--- Defense Evaluation Metrics ---
Clean Accuracy: 100.00%
Adversarial Accuracy: 100.00%
Performance Drop: 0.00%
Attack Success Rate: 0.00%



