In [1]:
from datasets import load_dataset

from llama import Llama

from attacks import *

### Load Data

In [2]:
# Load the "alespalla/chatbot_instruction_prompts" dataset via `datasets.load_dataset`.
# The 'train' split of the result is what the cells below read prompts from.
dataset = load_dataset("alespalla/chatbot_instruction_prompts")

In [3]:
# Peek at the first training example to see which fields each record carries
# (the output below shows a 'response' and a 'prompt' key).
next(iter(dataset['train']))

{'response': '. Determine the cause of the clog. Clogs in kitchen drains can be caused by food, grease, soap scum, and other debris.\n\n2. If you have a plunger, try using it to dislodge the clog. Make sure to cover any drain and overflow openings with a towel to create a tight seal. Use your plunger in a steady up-and-down motion.\n\n3. If a plunger doesn’t remove the clog, you may need to use a drain auger (also known as a “snake”) to remove it. You can rent one from a hardware store or purchase one online.\n\n4. If a drain auger fails to remove the clog, you may need to disassemble the pipe and use a hand-held plumber’s snake and/or rigid wire to try and break up the clog.\n\n5. Consider calling a professional plumber if your kitchen drain continues to remain clogged after all of these steps.',
 'prompt': 'What’s the best way to fix my kitchen drain?\n\n             1'}

### Load Model

In [4]:
# Build the Llama-2 7B model from local checkpoint/tokenizer files (paths are
# relative to this notebook). Sequences are capped at MAX_SEQ_LEN tokens and
# the maximum batch size is 1.
MAX_SEQ_LEN = 1024
llama_build_kwargs = dict(
    ckpt_dir="../llama/llama-2-7b",
    tokenizer_path="../llama/tokenizer.model",
    max_seq_len=MAX_SEQ_LEN,
    max_batch_size=1,
)
llama = Llama.build(**llama_build_kwargs)

> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


  _C._set_default_tensor_type(t)


Loaded in 11.53 seconds


## All Logits Techniques

### Hidden Dimension Extraction Attack

In [11]:
# Run the hidden-dimension extraction over prompts from the training split.
# For reference, the true hidden dimension of llama 2 7b is 4096.
# n=5000 prompts are used, one per batch; predict_norm=True additionally asks
# the helper to report the normalization type (see the printed output).
prompt_split = dataset['train']
u, s, s_dim, pred_dim = h_dim_extraction(
    model=llama.model,
    tokenizer=llama.tokenizer,
    dataset=prompt_split,
    text_key='prompt',
    n=5000,
    batch_size=1,
    predict_norm=True,
)

100%|██████████| 5000/5000 [03:04<00:00, 27.15it/s]


Model uses RMSNorm
Hidden Dim: 4095


In [12]:
# Inspect the shapes of the u and s tensors returned by h_dim_extraction.
u.shape, s.shape

(torch.Size([32000, 5000]), torch.Size([5000]))

Note that the singular values (as reported in `s_dim`) also appear to flip to negative at the index of the actual `h_dim`; see the slice in the next cell.

In [13]:
# Show the two s_dim entries at indices 4094 and 4095, i.e. right at the edge
# of the true hidden dimension (4096).
s_dim[4094:4096]

tensor([ 0.9369, -1.0427], dtype=torch.float64)

### Last Layer Extraction

In [14]:
# Take a detached copy of the model's output-projection weight; it is passed to
# layer_extraction and later compared against the recovered weights.
w = llama.model.output.weight.clone().detach()

# Recover the last layer from u and s using the true hidden size (4096).
# g is returned alongside pred_w -- presumably the transform that aligns
# pred_w with w (it is applied as pred_w @ g in the RMS check); TODO confirm.
pred_w, g = layer_extraction(w=w, u=u, s=s, h_dim=4096)

In [15]:
# Root-mean-square error between the recovered weights (pred_w @ g) and the
# true weights w.
# Reference value: the original note gave the paper's RMS for Llama 7b as
# "8 * 10e-5"; presumably 8 x 10^-5 (i.e. 8e-5) is meant, since Python's
# `10e-5` equals 1e-4 -- TODO confirm against the paper.
print("RMS:", torch.sqrt(torch.mean((pred_w@g - w) ** 2)))

RMS: tensor(0.0002, dtype=torch.float64)


In [16]:
# Drop the names bound in this section (u alone is 32000 x 5000) so the
# objects can be reclaimed. After this cell, re-running the cells above that
# use these names requires re-running the extraction cells first.
del u, s, s_dim, w, pred_w, g