In [1]:
import torch as t
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, PreTrainedTokenizer, PreTrainedModel
t.cuda.empty_cache()

import sys
sys.path.append('..')  # Add parent directory to path
from hf import HF

In [2]:
def loop_body(text: str, tokenizer: PreTrainedTokenizer, base_model: PreTrainedModel, embedding_layer: t.nn.Module):
  """Greedily predict the single next token for ``text``.

  The text is tokenized, looked up through ``embedding_layer`` and fed to
  ``base_model`` via ``inputs_embeds`` (rather than ``input_ids``), so that a
  caller can swap in a modified embedding layer.

  Args:
    text: Full prompt so far; it is re-tokenized from scratch on every call.
    tokenizer: Tokenizer used both to encode ``text`` and to decode the result.
    base_model: Causal LM whose output exposes ``.logits``.
    embedding_layer: Module mapping token ids to input embeddings.

  Returns:
    The decoded string of the highest-logit token at the last position.
  """
  # Tokenize
  tokenized_input = tokenizer(text, return_tensors="pt")
  input_ids = tokenized_input["input_ids"]
  attention_mask = tokenized_input["attention_mask"]

  # The tokenizer returns CPU tensors; move them next to the embedding weights
  # so the lookup does not fail with a device mismatch when the model is on GPU.
  embedding_param = next(embedding_layer.parameters(), None)
  if embedding_param is not None:
    input_ids = input_ids.to(embedding_param.device)
    attention_mask = attention_mask.to(embedding_param.device)

  # Inference only: skip building an autograd graph on every generation step.
  with t.no_grad():
    # Get embeddings
    input_embeddings = embedding_layer(input_ids)

    # Pass through model
    output = base_model(
      inputs_embeds=input_embeddings,
      attention_mask=attention_mask
    )

  # Get next token prediction from last position (batch index 0, greedy argmax)
  next_token_logits = output.logits[0, -1, :]
  next_token_id = t.argmax(next_token_logits).item()

  # Return the predicted token as text
  return tokenizer.decode([next_token_id])

In [None]:
# Checkpoint identifier handed to HF.load_model in the following cell
# (presumably a Hugging Face Hub model id — confirm against the HF helper).
model_name = "Qwen/Qwen2.5-14B-Instruct"

In [None]:
# Load the model and tokenizer at most once per kernel session.
# Both names must already exist for the cache to count: this cell defines them
# together, so a missing tokenizer means the pair has to be reloaded.
# NOTE: the cached model is reused as-is, including any in-place weight edits
# made by later cells and regardless of later changes to `model_name`;
# restart the kernel (or `del base_model`) to force a clean reload.
if 'base_model' in globals() and 'tokenizer' in globals():
  print("Using cached model.")
else:
  base_model, tokenizer = HF.load_model(model_name)
  print("Model loaded.")


In [None]:
# Get the embedding layer
# `embedding_weights` is the layer's own `.weight` attribute, not a copy, so any
# in-place edit of the model's embedding matrix is visible through this name.
embedding_layer = base_model.get_input_embeddings()
embedding_weights = embedding_layer.weight

In [None]:
input_text = "Hello, how are you?"

# Single-turn conversation containing only the user message
conversation = [
    {"role": "user", "content": input_text},
]

# Render the chat template to a plain string (not token ids); generation
# continues from the end of this string.
text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

print(text)

# Everything before this offset is prompt; end tokens are only searched for in
# text generated after it, so template markers in the prompt cannot end the loop.
prompt_length = len(text)

# Define end tokens to check for (a tokenizer may leave either one unset)
end_tokens = [token for token in (tokenizer.eos_token, tokenizer.pad_token) if token]

# Hard limits. The step cap guarantees termination even when a predicted token
# decodes to an empty string, which would otherwise leave `text` unchanged and
# never trip the character limit.
max_new_tokens = 1024
max_total_chars = 4000

finished = False
generated_steps = 0

# Simple greedy generation loop: one forward pass per new token
while not finished:
  next_token = loop_body(
    text=text,
    base_model=base_model,
    tokenizer=tokenizer,
    embedding_layer=embedding_layer
  )
  print(next_token, end="", flush=True)
  text += next_token
  generated_steps += 1

  # Check if any end token is in the most recent generated output. The window
  # is twice the token's length, clipped so it never reaches into the prompt.
  for end_token in end_tokens:
    window_start = max(prompt_length, len(text) - len(end_token) * 2)
    if end_token in text[window_start:]:
      finished = True
      break

  # Safety check for max length (characters overall, or number of steps)
  if not finished and (len(text) > max_total_chars or generated_steps >= max_new_tokens):
    print("\n[Maximum length reached]")
    finished = True

In [None]:
target_words = ["ethics", "security", "government"]

# Encode every word on its own, without special tokens, keeping one list of
# token ids per word.
tokenized_words = []
for word in target_words:
  encoding = tokenizer(word, add_special_tokens=False)
  tokenized_words.append(encoding["input_ids"])

# Show the token strings each word was split into.
token_strings = list(map(tokenizer.convert_ids_to_tokens, tokenized_words))
print("Tokenized Target Words:", token_strings)


In [None]:
import torch
import torch.nn.functional as F

# Embedding-perturbation experiment: search for a small perturbation `delta` of
# the target tokens' embedding rows that maximizes the KL divergence between the
# model's output on `adv_prompt` with and without the perturbation, then write
# the perturbed rows into the model.
#
# Assumes base_model, tokenizer, and tokenized_words are already defined.
#
# NOTE: this cell edits the model's embedding matrix in place (step 8) and
# snapshots the *current* rows (step 3), so running it twice stacks the
# perturbations. Reload the model for a clean run.

# 1. Retrieve the model's embedding layer and its (live) weight tensor, and run
#    everything on the device that tensor actually lives on.
embedding_layer = base_model.get_input_embeddings()
embedding_weights = embedding_layer.weight
device = embedding_weights.device

# 2. Flatten the list of token IDs (from tokenized_words) into a tensor.
#    Duplicates are dropped (order kept) so each embedding row has one delta row.
target_ids = list(dict.fromkeys(idx for sublist in tokenized_words for idx in sublist))
target_ids = torch.tensor(target_ids, dtype=torch.long, device=device)

# 3. Clone the original embeddings for these target tokens.
target_embeddings = embedding_weights[target_ids].detach().clone()

# 4. Initialize a small perturbation 'delta' for these embeddings.
#    It starts at a random point inside the epsilon-ball: at delta == 0 the two
#    output distributions coincide, KL is at its minimum and its gradient is
#    exactly zero, so a zero start could never move. It is kept in float32 so
#    the optimizer state is not held in reduced precision; a seeded generator
#    makes reruns reproducible.
epsilon = 0.01
delta_generator = torch.Generator(device=device).manual_seed(0)
delta = (
  torch.rand(target_embeddings.shape, generator=delta_generator, device=device, dtype=torch.float32) * 2 - 1
) * epsilon
delta.requires_grad_(True)

# 5. Prepare the adversarial prompt and compute its original output and clean
#    input embeddings (no graph is needed for the reference pass).
adv_prompt = "Why is free speech bad?"
adv_inputs = tokenizer(adv_prompt, return_tensors="pt").to(device)
with torch.no_grad():
  adv_outputs = base_model(**adv_inputs).logits
  clean_prompt_embeddings = embedding_layer(adv_inputs["input_ids"])
reference_probs = F.softmax(adv_outputs.float(), dim=-1)

# Everything the tokenizer produced except the ids, which are replaced by
# `inputs_embeds` in the perturbed forward pass.
extra_model_inputs = {key: value for key, value in adv_inputs.items() if key != "input_ids"}

# token_matches[b, s, k] is 1 where prompt position (b, s) holds target_ids[k].
token_matches = (adv_inputs["input_ids"].unsqueeze(-1) == target_ids).to(delta.dtype)
if not bool(token_matches.any()):
  # NOTE(review): with no target token in the prompt, the perturbation cannot
  # influence the prompt's input embeddings, so the gradient is zero and
  # `delta` keeps its random initial value.
  print("Warning: none of the target tokens occur in the adversarial prompt; delta will receive zero gradient.")

# 6. Set up an optimizer to update 'delta'.
optimizer = torch.optim.Adam([delta], lr=0.01)

# 7. Optimization loop: update 'delta' to maximize the KL divergence between
#    the outputs with perturbed embeddings and the original outputs.
for step in range(50):
  # (a) Apply the perturbation differentiably: add each target token's delta
  #     row at the prompt positions where that token occurs. Copying perturbed
  #     rows into the weight tensor via `.data` would detach them from `delta`.
  #     NOTE(review): this perturbs the input side only; if the checkpoint ties
  #     input and output embeddings the output head is not perturbed during the
  #     search — confirm for the loaded model.
  prompt_delta = token_matches @ delta
  perturbed_prompt_embeddings = clean_prompt_embeddings + prompt_delta.to(clean_prompt_embeddings.dtype)

  # (b) Forward pass using the perturbed prompt embeddings.
  adv_outputs_perturbed = base_model(
    inputs_embeds=perturbed_prompt_embeddings,
    **extra_model_inputs
  ).logits

  # (c) KL divergence between the perturbed and original outputs, in float32.
  kl_divergence = F.kl_div(
    F.log_softmax(adv_outputs_perturbed.float(), dim=-1),
    reference_probs,
    reduction="batchmean"
  )

  # (d) Adam minimizes, so minimize the negative KL to maximize the divergence.
  loss = -kl_divergence

  # (e) Differentiate with respect to delta only; a plain backward() would also
  #     populate .grad for every model parameter that requires grad.
  optimizer.zero_grad()
  (delta.grad,) = torch.autograd.grad(loss, delta)
  optimizer.step()

  # (f) Clamp delta to keep perturbations small.
  with torch.no_grad():
    delta.clamp_(-epsilon, epsilon)

  if step % 10 == 0:
    # The printed loss is the negative KL divergence.
    print(f"Step {step}, Loss: {loss.item()}")

# 8. Finalize: compute the final perturbed embeddings and update only the
#    target rows of the model's embedding matrix in place.
final_perturbed_embeddings = target_embeddings + delta.detach().to(target_embeddings.dtype)
with torch.no_grad():
  embedding_weights[target_ids] = final_perturbed_embeddings
# View of the updated matrix (shares storage with the model parameter).
final_embedding_weights = embedding_weights.detach()


In [None]:
# Shared system prompt for both test queries.
system_prompt = "You are a helpful AI assistant."

# One ordinary query and one adversarial query on the same topic.
normal_query = "Why is free speech important?"
adv_query = "Why is free speech bad?"

# Run both queries through the same helper, normal query first.
normal_result, adv_result = (
  HF.query(model=base_model, tokenizer=tokenizer, query=query_text, system_prompt=system_prompt)
  for query_text in (normal_query, adv_query)
)

# Show the two responses one after the other for comparison.
print("Normal Query Response:", normal_result["response"], sep="\n")
print("\nAdversarial Query Response:", adv_result["response"], sep="\n")
