In [1]:
from transformers import AutoTokenizer

# Tokenizer for the same DistilBERT checkpoint that the encoder model is
# loaded from, so token ids line up with the model's vocabulary.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [2]:
import torch
from transformers import AutoModelForTextEncoding

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA (a bare `.cuda()` raises there).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained DistilBERT encoder used to turn each prompt into a feature vector.
model = AutoModelForTextEncoding.from_pretrained(
    "distilbert-base-uncased"
).to(device)

In [3]:
import pandas as pd

# Read the jailbreak prompt dataset and keep only the `prompt` column as a
# plain Python list (one entry per CSV row).
prompts_frame = pd.read_csv('data_set/jailbreak_prompts.csv')
prompts = prompts_frame['prompt'].tolist()

In [4]:
# Sanity check: tokenize the first prompt WITHOUT truncation and display the
# shape of its token-id tensor, to see how long a raw prompt is compared with
# the model's maximum sequence length.
first_encoding = tokenizer(prompts[0], return_tensors='pt')
first_encoding['input_ids'].shape

Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors


torch.Size([1, 544])

In [5]:
import torch
from tqdm import tqdm


def embed_prompt(prompt):
    """Encode one prompt and return its first-token hidden state on the CPU.

    The prompt is tokenized with ``truncation=True`` so that prompts longer
    than the model's maximum sequence length are clipped instead of causing
    indexing errors. The returned tensor is ``last_hidden_state[0, 0]``: the
    hidden vector of the first token (the ``[CLS]`` position for BERT-style
    tokenizers) of the single sequence in the batch.
    """
    encoded = tokenizer(prompt, return_tensors='pt', truncation=True)
    # Move the inputs to wherever the model lives rather than assuming CUDA.
    encoded = {key: value.to(model.device) for key, value in encoded.items()}
    # Inference only: skip autograd bookkeeping so no graph is built per prompt.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states[0, 0].cpu()


# One feature vector per prompt, in the same order as `prompts`.
features = [embed_prompt(p) for p in tqdm(prompts)]

100%|██████████| 666/666 [00:03<00:00, 200.36it/s]


In [8]:
import torch

# Stack the per-prompt vectors into a single 2-D NumPy array (one row per
# prompt). The cell rebinds `features`, so the guard keeps it idempotent:
# re-running it after the conversion is a no-op instead of failing inside
# torch.stack on an ndarray.
if isinstance(features, list):
    features = torch.stack(features).numpy()

In [11]:
from umap import UMAP

# Project the prompt features down to UMAP's default two components using
# cosine distance between feature vectors.
# UMAP is stochastic; fixing `random_state` makes the layout reproducible
# across kernel restarts (at the cost of UMAP's multi-threaded execution).
reducer = UMAP(metric='cosine', random_state=42)
embeddings_2d = reducer.fit_transform(features)

In [12]:
# Split the 2-D projection into separate x / y coordinate lists of Python
# floats (column 0 -> xs, column 1 -> ys).
xs, ys = (embeddings_2d[:, axis].astype(float).tolist() for axis in (0, 1))
# The text shown for each point is the prompt itself, in the same order.
texts = prompts

In [34]:
import wizmap

# Build the two WizMap inputs from the projected points: the per-point data
# list and the grid dictionary (per the log output, the latter step generates
# contours and multi-level summaries).
embedding_title = 'Jailbreak Prompts'
point_columns = (xs, ys, texts)

data_list = wizmap.generate_data_list(*point_columns)
grid_dict = wizmap.generate_grid_dict(*point_columns, embedding_title)

Start generating data list...
Start generating contours...
Start generating multi-level summaries...


666it [00:00, 333222.77it/s]
100%|██████████| 6/6 [00:00<00:00, 15.82it/s]


In [35]:
# Write the WizMap data list and grid dictionary to disk in the current
# working directory.
# NOTE(review): presumably this produces the data.ndjson / grid.json files that
# the URLs in the following cell point at -- confirm the output file names.
output_dir = './'
wizmap.save_json_files(data_list, grid_dict, output_dir=output_dir)

In [43]:
# URLs WizMap loads the saved files from.
# NOTE(review): this assumes an HTTP server on localhost:8000 is serving the
# directory the JSON files were written to -- confirm before running.
server_root = 'http://localhost:8000'
data_url = f'{server_root}/data.ndjson'
grid_url = f'{server_root}/grid.json'

In [44]:
# Render the interactive WizMap view in the notebook, pointing it at the data
# and grid files via their URLs.
wizmap.visualize(
    data_url,
    grid_url,
    height=700,
)

## Result of WizMap

![image.png](attachment:dd8af0e7-e6c2-4593-87c9-6bfa1c2b9b78.png)