# Lab 17: Adversarial Machine Learning

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/depalmar/ai_for_the_win/blob/main/notebooks/lab17_adversarial_ml.ipynb)

Attack and defend machine learning security models.

## Learning Objectives
- Evasion attacks (FGSM, PGD)
- Data poisoning attacks
- Adversarial training
- Robust defense strategies

In [None]:
!pip install numpy scikit-learn matplotlib -q

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 1. Train a Simple Malware Classifier

In [None]:
# Build a reproducible synthetic dataset and fit the baseline detector.
# Seeding the global NumPy RNG fixes the random draws made below.
np.random.seed(42)

# Column order: [file_size, entropy, imports_count, suspicious_api_calls]
n_samples = 500
half = n_samples // 2

# Benign half is drawn first, then the malware half; the order matters
# because both consume the same seeded global random stream.
benign_std, benign_mean = [100, 0.5, 20, 2], [500, 5, 50, 5]
benign = np.random.randn(half, 4) * benign_std + benign_mean

malware_std, malware_mean = [50, 0.3, 10, 5], [200, 7, 30, 20]
malware = np.random.randn(half, 4) * malware_std + malware_mean

# Stack the two classes row-wise; label 0 = benign, label 1 = malware.
X = np.concatenate((benign, malware), axis=0)
y = np.repeat([0, 1], half)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline model that the later cells attack and compare against.
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

baseline_accuracy = accuracy_score(y_test, clf.predict(X_test))
print(f"Original Accuracy: {baseline_accuracy:.2%}")

## 2. Evasion Attack: FGSM

In [None]:
def fgsm_attack(model, X, y, epsilon=0.1):
    """
    Fast Gradient Sign Method style evasion attack.

    Every sample labelled 1 (malware) is shifted by ``epsilon`` per feature
    in the direction opposite to the sign of the model's weights, i.e.
    towards a benign (class 0) prediction. Samples with any other label
    are returned unchanged.

    Args:
        model: Fitted estimator exposing ``coef_``; row 0 of ``coef_`` is
            used as the attack direction (as produced by a binary linear
            classifier such as ``LogisticRegression``).
        X: Array-like feature matrix of shape (n_samples, n_features).
            It is never modified.
        y: Array-like of n_samples labels; rows where ``y == 1`` are
            perturbed.
        epsilon: Step size added to or subtracted from each feature. A
            scalar, or anything that broadcasts against one feature row.

    Returns:
        A new floating-point ndarray with the same shape as ``X``.
    """
    # Get model coefficients (gradient direction)
    gradient = model.coef_[0]

    # For malware (y=1), we want to move towards benign classification
    # This means moving against the gradient
    perturbation = -np.sign(gradient) * epsilon

    # Work on a floating-point copy: adding a float perturbation in place
    # to an integer-dtype array would raise a casting error. Existing
    # float dtypes are kept as they are.
    X_adv = np.array(X, copy=True)
    if not np.issubdtype(X_adv.dtype, np.floating):
        X_adv = X_adv.astype(float)

    # Coerce labels so plain lists produce an element-wise mask instead of
    # a single scalar comparison result.
    malware_rows = np.asarray(y) == 1

    # Apply perturbation only to malware samples
    X_adv[malware_rows] += perturbation

    return X_adv

# Craft adversarial versions of the test samples against the baseline model.
X_test_adv = fgsm_attack(clf, X_test, y_test, epsilon=0.5)

clean_accuracy = accuracy_score(y_test, clf.predict(X_test))
adversarial_accuracy = accuracy_score(y_test, clf.predict(X_test_adv))
print(f"Accuracy on original: {clean_accuracy:.2%}")
print(f"Accuracy on adversarial: {adversarial_accuracy:.2%}")

# Detection rate = share of true-malware rows that are predicted as class 1,
# measured before and after the perturbation.
malware_mask = y_test == 1
original_detection = np.mean(clf.predict(X_test[malware_mask]) == 1)
adversarial_detection = np.mean(clf.predict(X_test_adv[malware_mask]) == 1)
detection_drop = original_detection - adversarial_detection
print(f"\nMalware detection rate: {original_detection:.2%} -> {adversarial_detection:.2%}")
print(f"Evasion success: {detection_drop:.2%}")

## 3. Data Poisoning Attack

In [None]:
def poison_training_data(X_train, y_train, poison_rate=0.1):
    """
    Poison training data by flipping labels.

    A random subset of the samples, drawn without replacement from NumPy's
    global random state, has its label replaced by ``1 - label``. The
    features are copied but otherwise left untouched.

    Args:
        X_train: Array-like feature matrix. It is copied, never modified.
        y_train: Array-like of labels; the ``1 - label`` flip assumes they
            are encoded as 0/1. It is copied, never modified.
        poison_rate: Fraction of samples to flip, in the range [0, 1]. The
            number of flipped labels is ``int(len(y_train) * poison_rate)``.

    Returns:
        Tuple ``(X_poisoned, y_poisoned, poison_indices)`` of ndarrays,
        where ``poison_indices`` holds the positions whose label was
        flipped.

    Raises:
        ValueError: If ``poison_rate`` is outside [0, 1].
    """
    # Fail early with a clear message instead of a low-level sampling error.
    if not 0.0 <= poison_rate <= 1.0:
        raise ValueError(f"poison_rate must be in [0, 1], got {poison_rate!r}")

    # np.array(..., copy=True) accepts lists as well as ndarrays and
    # guarantees the caller's data is not mutated by the flip below.
    X_poisoned = np.array(X_train, copy=True)
    y_poisoned = np.array(y_train, copy=True)

    n_total = len(y_poisoned)
    n_poison = int(n_total * poison_rate)
    poison_indices = np.random.choice(n_total, n_poison, replace=False)

    # Flip labels
    y_poisoned[poison_indices] = 1 - y_poisoned[poison_indices]

    return X_poisoned, y_poisoned, poison_indices

# Flip 15% of the training labels; the flipped indices are not needed here.
X_poisoned, y_poisoned, _ = poison_training_data(X_train, y_train, poison_rate=0.15)

# Same estimator settings as the baseline, fitted on the corrupted labels.
clf_poisoned = LogisticRegression(random_state=42)
clf_poisoned.fit(X_poisoned, y_poisoned)

# Both models are scored on the same clean test split.
original_model_accuracy = accuracy_score(y_test, clf.predict(X_test))
poisoned_model_accuracy = accuracy_score(y_test, clf_poisoned.predict(X_test))
print(f"Original model accuracy: {original_model_accuracy:.2%}")
print(f"Poisoned model accuracy: {poisoned_model_accuracy:.2%}")

## 4. Defense: Adversarial Training

In [None]:
def adversarial_training(X_train, y_train, epsilon=0.3, n_iterations=3):
    """
    Fit a logistic-regression model on data augmented with FGSM examples.

    Each round fits a fresh model on the data gathered so far, uses that
    model to craft adversarial copies of the original training set, and
    appends those copies (with the original labels) to the pool. A final
    model is then fitted on the whole pool and returned.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        epsilon: Perturbation size passed to ``fgsm_attack``.
        n_iterations: Number of augmentation rounds.

    Returns:
        The ``LogisticRegression`` model fitted on the augmented data.
    """
    def _fit(features, labels):
        # Every model uses the same fixed configuration.
        estimator = LogisticRegression(random_state=42)
        estimator.fit(features, labels)
        return estimator

    pool_X, pool_y = X_train.copy(), y_train.copy()

    for _ in range(n_iterations):
        # Attack the model fitted on everything collected so far.
        attacker = _fit(pool_X, pool_y)
        crafted = fgsm_attack(attacker, X_train, y_train, epsilon=epsilon)

        # Adversarial rows keep the labels of the samples they came from.
        pool_X = np.vstack([pool_X, crafted])
        pool_y = np.concatenate([pool_y, y_train])

    return _fit(pool_X, pool_y)

# Fit the adversarially trained model with the function's default settings.
robust_clf = adversarial_training(X_train, y_train)

# Adversarial test set; note that it is crafted against the baseline `clf`,
# and the same perturbed samples are then scored by both models.
X_test_adv = fgsm_attack(clf, X_test, y_test, epsilon=0.5)

for heading, candidate in (
    ("Original Model:", clf),
    ("\nRobust Model (Adversarial Training):", robust_clf),
):
    print(heading)
    print(f"  Clean accuracy: {accuracy_score(y_test, candidate.predict(X_test)):.2%}")
    print(f"  Adversarial accuracy: {accuracy_score(y_test, candidate.predict(X_test_adv)):.2%}")

## 5. Defense: Ensemble Methods

In [None]:
# Tree-ensemble comparison: fit a random forest on the clean training data
# and score it on the clean test set and on the X_test_adv set that is
# already in scope from the earlier cell.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

rf_clean_accuracy = accuracy_score(y_test, rf_clf.predict(X_test))
rf_adversarial_accuracy = accuracy_score(y_test, rf_clf.predict(X_test_adv))

print("Random Forest Defense:")
print(f"  Clean accuracy: {rf_clean_accuracy:.2%}")
print(f"  Adversarial accuracy: {rf_adversarial_accuracy:.2%}")

## 6. Visualization

In [None]:
# Sweep the attack strength and record each model's accuracy on the
# perturbed test set. The perturbations are always crafted against `clf`.
epsilons = np.linspace(0, 1, 11)
original_acc, robust_acc, rf_acc = [], [], []

# One entry per curve: (model, accuracy list, line format, legend label).
curves = (
    (clf, original_acc, 'r-o', 'Original Model'),
    (robust_clf, robust_acc, 'g-o', 'Adversarial Training'),
    (rf_clf, rf_acc, 'b-o', 'Random Forest'),
)

for eps in epsilons:
    X_adv = fgsm_attack(clf, X_test, y_test, epsilon=eps)
    for estimator, scores, _, _ in curves:
        scores.append(accuracy_score(y_test, estimator.predict(X_adv)))

plt.figure(figsize=(10, 6))
for _, scores, line_format, legend_label in curves:
    plt.plot(epsilons, scores, line_format, label=legend_label)
plt.xlabel('Perturbation (epsilon)')
plt.ylabel('Accuracy')
plt.title('Model Robustness to Adversarial Attacks')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Key Takeaways

1. **Evasion Attacks**: Small perturbations can fool classifiers
2. **Poisoning Attacks**: Corrupted training data degrades models
3. **Adversarial Training**: Include adversarial examples in training
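4. **Ensemble Methods**: Tree ensembles can be harder to evade with perturbations crafted against a single linear model, but they are not immune to adversarial attacks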

## Next Steps
- **Lab 18**: Fine-Tuning for Security
- **Lab 19**: Cloud Security AI