<a href="https://colab.research.google.com/github/rishabhshah13/Adversarial_Patch_XAI590/blob/main/Assignment%208/Assignment%208.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 8

In [None]:
# Import necessary libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import matplotlib.pyplot as plt
import numpy as np

# Load the pre-trained tokenizer and language-model-head model for the
# checkpoint named by `model_name`. Both are module-level globals that
# get_saliency_scores reads directly.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Switch to evaluation mode so dropout is disabled during the forward passes
# used for saliency computation.
model.eval()

# Function to get token-level saliency scores
def get_saliency_scores(prompt):
    """Compute gradient-norm saliency for each token position of ``prompt``.

    The saliency of a position is the L2 norm of the gradient of the
    top-scoring next-token logit (taken at the last position) with respect
    to that position's input embedding vector.

    Gradients are taken w.r.t. the per-position input embeddings rather than
    read from ``wte.weight.grad``. Reading the embedding-matrix gradient is
    wrong here for three reasons: it accumulates across calls unless zeroed,
    it is shared with the tied output projection, and it is indexed by token
    id, so repeated tokens would all receive the same summed gradient.

    Args:
        prompt: Text to analyse. Must tokenize to at least one token.

    Returns:
        A tuple ``(saliency_scores, tokens)`` where ``saliency_scores`` is a
        1-D NumPy array with one value per token position and ``tokens`` is
        the list of token strings in the same order.

    Raises:
        ValueError: If ``prompt`` tokenizes to zero tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    if input_ids.numel() == 0:
        raise ValueError("prompt must contain at least one token")

    # Look the embeddings up ourselves and detach them so they form a leaf
    # tensor: the gradient then flows to each *position*, not to rows of the
    # shared embedding matrix.
    inputs_embeds = model.transformer.wte(input_ids).detach().requires_grad_(True)

    # Forward pass through the model, starting from the embeddings
    outputs = model(inputs_embeds=inputs_embeds)
    last_logits = outputs.logits[0, -1]

    # Score to explain: the logit of the most likely next token
    predicted_token_id = torch.argmax(last_logits)
    target_logit = last_logits[predicted_token_id]

    # autograd.grad returns the gradient directly and leaves every
    # parameter's .grad untouched, so nothing accumulates between calls.
    (gradients,) = torch.autograd.grad(target_logit, inputs_embeds)

    # Calculate saliency scores as L2 norm of gradients for each token
    saliency_scores = torch.norm(gradients[0], dim=-1).detach().cpu().numpy()

    # Index the batch dimension explicitly: squeeze() would turn a
    # one-token prompt into a bare int and yield a str instead of a list.
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    return saliency_scores, tokens

# Function to plot saliency scores
def plot_saliency(prompt):
    """Draw a bar chart of per-token saliency scores for ``prompt``.

    Calls ``get_saliency_scores`` to obtain one score per token, then shows
    a bar per token with the token strings as (vertical) x-axis labels.
    Returns nothing; the figure is displayed via ``plt.show()``.
    """
    scores, token_labels = get_saliency_scores(prompt)
    positions = range(len(token_labels))

    plt.figure(figsize=(10, 5))
    plt.bar(positions, scores)
    # Rotate labels so longer token strings do not overlap
    plt.xticks(positions, token_labels, rotation=90)
    plt.xlabel("Tokens")
    plt.ylabel("Saliency Score")
    plt.title("Saliency Scores for Prompt Tokens")
    plt.show()

# Example prompt used to demonstrate the saliency visualisation
prompt = "The quick brown fox jumps over the lazy dog."

# Compute the per-token saliency scores for the example prompt and display
# them as a bar chart (plot_saliency ends with plt.show()).
plot_saliency(prompt)