# One-Pixel Attack on ResNet-18 (CIFAR-10)
## Part 3: Adversarial Attack Implementation

**Paper:** "One pixel attack for fooling deep neural networks" by Su, Vargas & Sakurai (2019)

**Objective:** Implement a differential-evolution-based one-pixel attack to fool ResNet-18 on the CIFAR-10 dataset.

---

## HOUR 1: SETUP & ENVIRONMENT

### Step 1: Import Libraries

In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import numpy as np
import matplotlib.pyplot as plt
import cv2
from pathlib import Path
import os
import json
from tqdm import tqdm

# Fix the NumPy and PyTorch random seeds so repeated runs draw the same numbers
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

### Step 2: Load Pre-trained ResNet-18

In [None]:
# Build ResNet-18 from the torchvision ImageNet-1k checkpoint, move it to the
# selected device and switch it to evaluation (inference) mode in one chain.
# NOTE(review): these weights were trained on ImageNet, while the rest of this
# notebook compares predictions against CIFAR-10 labels (indices 0-9). Confirm
# whether a CIFAR-10 fine-tuned checkpoint/classifier head was intended here.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1).to(device).eval()

print("✓ ResNet-18 loaded successfully")
print(f"Model moved to device: {device}")

### Step 3: Setup Data Pipeline with CIFAR-10

In [None]:
# Preprocessing: convert PIL images to [0, 1] tensors, then standardise each
# channel with the ImageNet mean/std (the same constants the attack helpers use).
# NOTE(review): images are fed at CIFAR-10's native 32x32 resolution with no
# resize — confirm this matches what the loaded checkpoint expects.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# CIFAR-10 test split (downloaded to ./data on first use), served in fixed order
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
)

print(f"✓ CIFAR-10 test set loaded: {len(test_dataset)} images")
print(f"CIFAR-10 classes: {test_dataset.classes}")

### Step 4: Verify Baseline Accuracy on Clean Images

In [None]:
def evaluate_model_accuracy(model, data_loader, device, max_samples=1000):
    """
    Evaluate top-1 accuracy of a model on clean images.

    Args:
        model: Callable mapping an image batch to per-class scores
        data_loader: Iterable yielding (images, labels) batches
        device: torch device the images are moved to before inference
        max_samples: Maximum number of samples to score; the last batch is
            trimmed so the cap is respected exactly. None means no limit.

    Returns:
        (accuracy_percent, correct, total). accuracy_percent is 0.0 when no
        sample was evaluated (empty loader or max_samples <= 0).
    """
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in data_loader:
            if max_samples is not None:
                remaining = max_samples - total
                if remaining <= 0:
                    break
                # Trim the final batch so we never score more than max_samples
                images, labels = images[:remaining], labels[:remaining]

            outputs = model(images.to(device))
            predicted = outputs.argmax(dim=1).cpu()
            correct += (predicted == labels.cpu()).sum().item()
            total += labels.size(0)

    # Guard against division by zero when nothing was evaluated
    accuracy = (correct / total) * 100 if total else 0.0
    return accuracy, correct, total

# Measure clean-image accuracy on (roughly) the first 1000 test samples
baseline_accuracy, correct, total = evaluate_model_accuracy(
    model,
    test_loader,
    device,
    max_samples=1000,
)
print(f"✓ Baseline accuracy on {total} clean CIFAR-10 images: {baseline_accuracy:.2f}%")

---

## HOUR 2: IMPLEMENT CORE ATTACK

### Step 5: Utility Functions for Attack

In [None]:
def apply_pixel_change(image_tensor, pixel_x, pixel_y, r, g, b, device):
    """
    Return a copy of an image tensor with one pixel overwritten.

    Args:
        image_tensor: Original image tensor (channel-first, already normalized
            with the ImageNet mean/std used by the data pipeline)
        pixel_x: X (column) coordinate of the pixel
        pixel_y: Y (row) coordinate of the pixel
        r, g, b: New colour as raw RGB intensities in [0, 1]; they are
            normalized here before being written into the tensor
        device: torch device (unused; kept for interface compatibility)

    Returns:
        Modified image tensor (the input tensor is left untouched)
    """
    # Same per-channel statistics as the Normalize transform in the data pipeline
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)

    modified_image = image_tensor.clone()
    # Apply the forward normalization (value - mean) / std to each channel so the
    # new pixel lives in the same space as the rest of the image.
    for channel, (value, mean, std) in enumerate(zip((r, g, b), channel_mean, channel_std)):
        modified_image[channel, pixel_y, pixel_x] = (float(value) - mean) / std

    return modified_image

def predict_class(model, image_tensor, device):
    """
    Run the model on a single image and report its top class.

    Args:
        model: PyTorch model
        image_tensor: Single image tensor without a batch dimension
        device: torch device the image is moved to before inference

    Returns:
        (predicted_class, probabilities): the argmax class index as an int and
        the softmax probabilities for the image as a 1-D NumPy array
    """
    with torch.no_grad():
        # Add the batch dimension the model expects
        batch = image_tensor.unsqueeze(0).to(device)
        logits = model(batch)
        class_probs = torch.softmax(logits, dim=1)
        top_class = logits.argmax(dim=1).item()

    return top_class, class_probs[0].cpu().numpy()

def fitness_function(model, image_tensor, original_class, device):
    """
    Calculate fitness score for attack (higher = better attack).

    While the model still predicts the original class, the score is the
    negated probability of that class, so candidates that erode the model's
    confidence rank higher and the search has a signal to follow before the
    first misclassification appears. Once the prediction is wrong, the score
    is the confidence of the (wrong) predicted class.

    Args:
        model: PyTorch model
        image_tensor: Modified image tensor
        original_class: True class label
        device: torch device

    Returns:
        fitness_score (float): in [-1, 0) while the prediction is still the
        original class, in (0, 1] once the model is fooled
    """
    predicted_class, probabilities = predict_class(model, image_tensor, device)

    # Still correct: reward lower confidence in the true class (always negative,
    # so it can never be mistaken for a successful attack by a threshold check)
    if predicted_class == original_class:
        return -float(probabilities[original_class])

    # Fooled: return confidence of the wrong class
    return float(probabilities[predicted_class])

# Cell-level confirmation message printed after the helper definitions above
print("✓ Utility functions defined successfully")

### Step 6: Implement Genetic Algorithm (Differential Evolution)

In [None]:
class DifferentialEvolutionAttack:
    """
    Differential Evolution based one-pixel attack.

    Each individual in the population is a length-5 float array
    [pixel_x, pixel_y, r, g, b]. Coordinates are kept as floats during the
    search and truncated to int when the pixel is applied.
    """

    def __init__(self, model, device, pop_size=400, max_iterations=300, F=0.8, CR=0.5,
                 image_size=32, success_threshold=0.5):
        """
        Args:
            model: Model under attack
            device: torch device passed through to the helper functions
            pop_size: Number of candidate pixels per generation
            max_iterations: Maximum number of generations that are evaluated
            F: Differential weight applied to the donor difference
            CR: Per-gene crossover probability
            image_size: Side length of the (square) images; coordinates are
                drawn from and clipped to [0, image_size - 1]
            success_threshold: Fitness above which the attack counts as successful
        """
        self.model = model
        self.device = device
        self.pop_size = pop_size
        self.max_iterations = max_iterations
        self.F = F  # Differential weight
        self.CR = CR  # Crossover probability
        self.image_size = image_size
        self.success_threshold = success_threshold

    def initialize_population(self):
        """
        Initialize a random population.

        Returns:
            List of pop_size arrays [x, y, r, g, b] with integer-valued x, y in
            [0, image_size - 1] and r, g, b drawn uniformly from [0, 1).
        """
        coords = np.random.randint(0, self.image_size, size=(self.pop_size, 2))
        colors = np.random.uniform(0, 1, size=(self.pop_size, 3))
        # hstack promotes the integer coordinates to float alongside the colours
        return [row.copy() for row in np.hstack([coords, colors])]

    def evaluate_population(self, population, image_tensor, original_class):
        """
        Evaluate fitness of all individuals in population.

        Returns:
            NumPy array with one fitness score per individual (higher = better).
        """
        fitness_scores = []
        for individual in population:
            x, y, r, g, b = individual
            modified_image = apply_pixel_change(image_tensor, int(x), int(y), r, g, b, self.device)
            fitness = fitness_function(self.model, modified_image, original_class, self.device)
            fitness_scores.append(fitness)

        return np.array(fitness_scores)

    def mutate(self, population, fitness_scores, best_individual):
        """
        Build one trial vector per individual with DE/best/1 mutation and
        binomial (per-gene) crossover.

        Args:
            population: Current list of individuals
            fitness_scores: Unused; kept for interface compatibility
            best_individual: Best individual of the current population

        Returns:
            List of trial individuals, clipped to valid coordinate/colour ranges.
        """
        pop_count = len(population)
        n_genes = len(best_individual)
        max_coord = self.image_size - 1
        new_population = []

        for i in range(pop_count):
            # DE/best/1 needs two distinct donors: v = best + F * (a - b)
            idx_a, idx_b = np.random.choice(pop_count, 2, replace=False)
            mutant = best_individual + self.F * (population[idx_a] - population[idx_b])

            # Binomial crossover: each gene comes from the mutant with probability
            # CR; one randomly chosen gene is forced so the trial differs from
            # its parent.
            take_mutant = np.random.rand(n_genes) < self.CR
            take_mutant[np.random.randint(n_genes)] = True
            trial = np.where(take_mutant, mutant, population[i])

            # Clip values to valid ranges
            trial[0] = np.clip(trial[0], 0, max_coord)  # x
            trial[1] = np.clip(trial[1], 0, max_coord)  # y
            trial[2:5] = np.clip(trial[2:5], 0, 1)  # r, g, b

            new_population.append(trial)

        return new_population

    def attack(self, image_tensor, original_class):
        """
        Run the attack on a single image.

        Each generation the best individual is checked against the success
        threshold; otherwise trial vectors are generated and each one replaces
        its parent only if it scores at least as well (greedy selection), so
        the best solution found so far is never lost.

        Returns:
            success (bool), iterations (int), best_solution (array), best_fitness (float)
            On failure, best_solution is the individual that achieved
            best_fitness, the highest score seen during the run.
        """
        population = self.initialize_population()
        fitness_scores = self.evaluate_population(population, image_tensor, original_class)

        for iteration in range(1, self.max_iterations + 1):
            best_idx = int(np.argmax(fitness_scores))
            best_individual = population[best_idx]
            best_fitness = float(fitness_scores[best_idx])

            # Check for successful attack
            if best_fitness > self.success_threshold:
                return True, iteration, best_individual.copy(), best_fitness

            # The budget counts evaluated generations; do not breed after the last one
            if iteration == self.max_iterations:
                break

            # Mutation/crossover, then greedy one-to-one selection
            trials = self.mutate(population, fitness_scores, best_individual)
            trial_scores = self.evaluate_population(trials, image_tensor, original_class)
            for i, (trial, score) in enumerate(zip(trials, trial_scores)):
                if score >= fitness_scores[i]:
                    population[i] = trial
                    fitness_scores[i] = score

        # Return best solution found (even if unsuccessful)
        best_idx = int(np.argmax(fitness_scores))
        return False, self.max_iterations, population[best_idx].copy(), float(fitness_scores[best_idx])

# Cell-level confirmation message printed after the class definition above
print("✓ Differential Evolution Attack class defined successfully")

---

## HOUR 3: TEST & EVALUATE

### Step 7: Run Attack on Test Images

In [None]:
# Create output directories
Path('visualizations').mkdir(exist_ok=True)
Path('results').mkdir(exist_ok=True)


def _class_name(class_idx):
    """Return the dataset class name, or a placeholder for out-of-range indices."""
    # The model may emit indices beyond the dataset's label set (e.g. a
    # classifier head with more outputs than CIFAR-10 has classes).
    if 0 <= class_idx < len(test_dataset.classes):
        return test_dataset.classes[class_idx]
    return f"class_{class_idx}"


# Initialize attack
attack = DifferentialEvolutionAttack(model, device, pop_size=400, max_iterations=300)

# Run attack on first 10 images
num_images_to_test = 10
attack_results = []

# Collect images batch by batch, then trim: a whole batch (32 images) is
# appended at once, so without the slice more images than requested are attacked.
test_data = []
for images, labels in test_loader:
    for image, label in zip(images, labels):
        test_data.append((image, label.item()))
    if len(test_data) >= num_images_to_test:
        break
test_data = test_data[:num_images_to_test]

print(f"Running attacks on {len(test_data)} images...\n")

for idx, (image_tensor, true_label) in enumerate(test_data):
    print(f"[Image {idx+1}/{len(test_data)}] Original class: {_class_name(true_label)}")

    # Prediction on the unmodified image, to tell real attacks apart from
    # images the model already gets wrong
    pred_original, _ = predict_class(model, image_tensor, device)
    clean_prediction_correct = pred_original == true_label
    if not clean_prediction_correct:
        print(f"  Note: clean image is already misclassified as {_class_name(pred_original)}")

    # Run attack
    success, iterations, best_solution, best_fitness = attack.attack(image_tensor, true_label)

    # Apply best solution to get adversarial image
    x, y, r, g, b = best_solution
    adversarial_image = apply_pixel_change(image_tensor, int(x), int(y), r, g, b, device)

    # Get prediction on the adversarial image
    pred_adversarial, probs = predict_class(model, adversarial_image, device)

    # Store results (plain Python scalars so the record serializes cleanly)
    result = {
        'image_idx': idx,
        'original_class': true_label,
        'predicted_original': pred_original,
        'predicted_adversarial': pred_adversarial,
        'clean_prediction_correct': bool(clean_prediction_correct),
        'attack_success': bool(success),
        'iterations': int(iterations),
        'pixel_x': int(x),
        'pixel_y': int(y),
        'pixel_rgb': [float(r), float(g), float(b)],
        'best_fitness': float(best_fitness)
    }
    attack_results.append(result)

    status = "✓ SUCCESS" if success else "✗ FAILED"
    print(f"  {status} | Iterations: {iterations} | Confidence: {best_fitness:.4f}")
    print(f"  Adversarial class: {_class_name(pred_adversarial)} (prob: {probs[pred_adversarial]:.4f})")
    print()

print("\n" + "="*60)
print("Attack Phase Complete!")

### Step 8: Analyze Results

In [None]:
# Calculate statistics
successful_attacks = [r for r in attack_results if r['attack_success']]
# Guard against an empty result list so the summary never divides by zero
success_rate = (len(successful_attacks) / len(attack_results)) * 100 if attack_results else 0.0

# Averages are only meaningful over successful attacks; default to 0 otherwise
if successful_attacks:
    avg_iterations = np.mean([r['iterations'] for r in successful_attacks])
    avg_confidence = np.mean([r['best_fitness'] for r in successful_attacks])
else:
    avg_iterations = 0
    avg_confidence = 0

print("\n" + "="*60)
print("RESULTS SUMMARY")
print("="*60)
print(f"Total images tested: {len(attack_results)}")
print(f"Successful attacks: {len(successful_attacks)}/{len(attack_results)}")
print(f"Success Rate: {success_rate:.2f}%")
if successful_attacks:
    print(f"Average iterations (successful only): {avg_iterations:.1f}")
    print(f"Average confidence (successful only): {avg_confidence:.4f}")
print("="*60)

---

## HOUR 4: ANALYSIS & REPORTING

### Step 9: Visualize Attack Results

In [None]:
def denormalize_image(image_tensor):
    """
    Convert a normalized channel-first tensor back to a uint8 image.

    Args:
        image_tensor: Image tensor in (channels, height, width) layout that was
            normalized with the ImageNet mean/std used by the data pipeline

    Returns:
        NumPy uint8 array in (height, width, channels) layout with values in [0, 255]
    """
    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    # Channel-first tensor -> channel-last array; detach in case it tracks gradients
    image = image_tensor.detach().cpu().numpy().transpose(1, 2, 0)
    # Undo the normalization: x * std + mean
    image = image * channel_std + channel_mean
    # Round before casting: astype(uint8) truncates, so float round-off such as
    # 253.99999 would otherwise come back one intensity level too low
    return np.clip(np.rint(image * 255), 0, 255).astype(np.uint8)

def _class_name(class_idx):
    """Return the dataset class name, or a placeholder for out-of-range indices."""
    if 0 <= class_idx < len(test_dataset.classes):
        return test_dataset.classes[class_idx]
    return f"class_{class_idx}"


# Visualize first 5 successful attacks (or the first 5 attempts if none succeeded).
# The selection must happen BEFORE the figure is created so the grid has the
# right number of rows.
if len(successful_attacks) == 0:
    print("No successful attacks to visualize. Showing first 5 attempts instead.")
    results_to_show = attack_results[:5]
else:
    results_to_show = successful_attacks[:5]

if results_to_show:
    # squeeze=False keeps `axes` two-dimensional even when there is a single row
    fig, axes = plt.subplots(
        len(results_to_show), 2,
        figsize=(10, 4 * len(results_to_show)),
        squeeze=False,
    )

    for row, result in enumerate(results_to_show):
        img_idx = result['image_idx']
        image_tensor = test_data[img_idx][0]

        # Original image
        original_img = denormalize_image(image_tensor)

        # Adversarial image
        x, y = int(result['pixel_x']), int(result['pixel_y'])
        r, g, b = result['pixel_rgb']
        adversarial_tensor = apply_pixel_change(image_tensor, x, y, r, g, b, device)
        adversarial_img = denormalize_image(adversarial_tensor)

        original_name = _class_name(result['original_class'])
        clean_pred_name = _class_name(result['predicted_original'])
        adversarial_name = _class_name(result['predicted_adversarial'])

        # Plot original
        axes[row, 0].imshow(original_img)
        axes[row, 0].set_title(f"Original: {original_name}\n(Pred: {clean_pred_name})")
        axes[row, 0].axis('off')

        # Plot adversarial image and outline the 5x5 neighbourhood of the changed
        # pixel; an outline (rather than a filled block) keeps the pixel visible
        axes[row, 1].imshow(adversarial_img)
        axes[row, 1].add_patch(
            plt.Rectangle((x - 2.5, y - 2.5), 5, 5, fill=False, edgecolor='red', linewidth=1.5)
        )
        status = "SUCCESS" if result['attack_success'] else "FAILED"
        axes[row, 1].set_title(
            f"Adversarial [{status}]: {adversarial_name}\n"
            f"Iter: {result['iterations']}, Conf: {result['best_fitness']:.3f}"
        )
        axes[row, 1].axis('off')

    plt.tight_layout()
    plt.savefig('visualizations/attack_results.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("✓ Visualization saved to visualizations/attack_results.png")
else:
    print("No attack results available; skipping visualization.")

### Step 10: Save Results to JSON

In [None]:
# Prepare summary statistics.
# Hyperparameters are read from the attack object so the saved summary cannot
# drift from the configuration that was actually run.
summary = {
    'model': 'ResNet-18 (ImageNet weights)',
    'dataset': 'CIFAR-10',
    'total_images_tested': len(attack_results),
    'successful_attacks': len(successful_attacks),
    'success_rate_percent': float(success_rate),
    'baseline_accuracy_percent': float(baseline_accuracy),
    'attack_algorithm': 'Differential Evolution (DE/best/1)',
    'population_size': attack.pop_size,
    'max_iterations': attack.max_iterations,
    # Averages are undefined when no attack succeeded, so store null instead of 0
    'average_iterations_successful': float(avg_iterations) if successful_attacks else None,
    'average_confidence_successful': float(avg_confidence) if successful_attacks else None,
    'individual_results': attack_results
}

# Save to JSON
with open('results/attack_results.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print("✓ Results saved to results/attack_results.json")

### Step 11: Generate Analysis Report

In [None]:
# Generate analysis report.
# The report is assembled section by section; wording that depends on the
# outcome (vulnerability level, efficiency) is derived from the measured
# results instead of being asserted unconditionally.
if success_rate > 50:
    vulnerability_level = "highly"
elif success_rate > 25:
    vulnerability_level = "moderately"
else:
    vulnerability_level = "not very"

report = f"""
{'='*70}
ONE-PIXEL ATTACK ANALYSIS REPORT
{'='*70}

MODEL & DATASET:
  - Model: ResNet-18 (ImageNet pre-trained weights)
  - Dataset: CIFAR-10 (32x32 RGB images, 10 classes)
  - Baseline accuracy on clean images: {baseline_accuracy:.2f}%

ATTACK CONFIGURATION:
  - Algorithm: Differential Evolution (DE/best/1)
  - Population size: {attack.pop_size}
  - Maximum iterations: {attack.max_iterations}
  - Perturbation: 1 pixel only
  - Success threshold: Confidence > 0.5 on wrong class

RESULTS SUMMARY:
  - Total images tested: {len(attack_results)}
  - Successful attacks: {len(successful_attacks)}/{len(attack_results)}
  - Success Rate: {success_rate:.2f}%

{'='*70}
QUESTION 1: Success Rate Analysis
{'='*70}

What percentage of images can be successfully attacked?

Answer: {success_rate:.2f}% of tested images were successfully attacked
({len(successful_attacks)} out of {len(attack_results)} images).

Interpretation:
  - ResNet-18 is {vulnerability_level} vulnerable to one-pixel attacks
  - This demonstrates the adversarial fragility of deep neural networks
  - A single pixel modification can often fool the model despite high accuracy on clean data

{'='*70}
QUESTION 2: Efficiency Analysis
{'='*70}

How many iterations does the algorithm typically need?

Answer: """

if successful_attacks:
    iterations_list = [r['iterations'] for r in successful_attacks]
    report += f"""
Successful attacks required an average of {avg_iterations:.1f} iterations.

Statistics (for successful attacks):
  - Minimum iterations: {min(iterations_list)}
  - Maximum iterations: {max(iterations_list)}
  - Mean iterations: {np.mean(iterations_list):.1f}
  - Median iterations: {np.median(iterations_list):.1f}
  - Std deviation: {np.std(iterations_list):.1f}

Interpretation:
  - The algorithm is relatively efficient
  - All successful attacks converged within {max(iterations_list)} iterations
  - This suggests the adversarial space is easily exploitable
"""
else:
    report += """
No successful attacks achieved. The algorithm did not find sufficient adversarial perturbations
within the iteration limit. This may require:
  - Increased population size
  - More iterations
  - Adjusted hyperparameters (F, CR)
"""

report += f"""
{'='*70}
QUESTION 3: Pattern Recognition
{'='*70}

Are certain types of images more vulnerable?

Analysis based on {len(attack_results)} tested images:
"""

# Analyze by class: count attempts and successes per original class name
class_results = {}
for r in attack_results:
    orig_class = test_dataset.classes[r['original_class']]
    stats = class_results.setdefault(orig_class, {'total': 0, 'successful': 0})
    stats['total'] += 1
    if r['attack_success']:
        stats['successful'] += 1

report += "\n  Success rate by original class:\n"
for class_name in sorted(class_results):
    stats = class_results[class_name]
    # Every entry has total >= 1 because it is created on first occurrence
    class_success_rate = (stats['successful'] / stats['total']) * 100
    report += f"    - {class_name:10s}: {class_success_rate:5.1f}% ({stats['successful']}/{stats['total']})\n"

report += """
  Observations:
    - Some object classes may be more susceptible to adversarial perturbations
    - Simpler classes (e.g., airplanes) might be less robust than complex ones (e.g., dogs)
    - The model's learned features for certain classes may be more brittle

"""

# Efficiency conclusion depends on whether any attack succeeded; without
# successes the average is a placeholder 0 and must not be reported as a result.
if successful_attacks:
    efficiency_conclusion = (
        "The Differential Evolution algorithm found adversarial pixels in\n"
        f"   ~{avg_iterations:.0f} iterations on average (successful attacks only)."
    )
else:
    efficiency_conclusion = (
        "No attack succeeded within the iteration budget, so no efficiency\n"
        "   estimate is available for this run."
    )

report += f"""
{'='*70}
CONCLUSIONS
{'='*70}

1. VULNERABILITY:
   In this run ResNet-18 was {vulnerability_level} vulnerable to one-pixel attacks,
   with a {success_rate:.1f}% success rate over {len(attack_results)} tested images.

2. EFFICIENCY:
   {efficiency_conclusion}

3. IMPLICATIONS:
   - Deep networks can be fooled by imperceptible perturbations
   - Defense mechanisms are critical for real-world deployment
   - The gap between human perception and ML perception is significant

4. NEXT STEPS:
   - Implement defense mechanisms (adversarial training, pixel resilience)
   - Test on larger datasets
   - Analyze decision boundaries
   - Evaluate certified robustness

{'='*70}
"""

print(report)

# Save report
with open('results/analysis_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

print("\n✓ Analysis report saved to results/analysis_report.txt")

### Step 12: Generate Final Summary

In [None]:
# Closing banner: headline numbers, generated artefacts and suggested follow-ups
print("\n" + "=" * 70)
print("PART 3: ONE-PIXEL ATTACK - IMPLEMENTATION COMPLETE")
print("=" * 70)

# Headline metrics (the iteration average is only shown when an attack succeeded)
print("\nKey Results:")
print(f"  ✓ Baseline Model Accuracy (clean): {baseline_accuracy:.2f}%")
print(f"  ✓ Attack Success Rate: {success_rate:.2f}%")
if successful_attacks:
    print(f"  ✓ Average Iterations to Success: {avg_iterations:.1f}")

# Files written by the earlier cells
print("\nOutputs Generated:")
print("  ✓ visualizations/attack_results.png")
print("  ✓ results/attack_results.json")
print("  ✓ results/analysis_report.txt")

# Suggested follow-up work
print("\nNext Steps:")
print("  1. Review the analysis_report.txt for detailed findings")
print("  2. Check attack_results.png for visual examples")
print("  3. (Optional) Run extended evaluation on more images")
print("  4. (Optional) Implement defense mechanisms (Task 4)")
print("=" * 70)