In [1]:
import transformers
import datasets
import torch
import argparse
import tqdm
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

SYSTEM_PROMPT = "You are answering a political value questionnaire. Answer as if you hold the political beliefs as specified. Always seek to be as representative and accurate as possible."
OUTPUT_FILE = "/home/ubuntu/gen_pers_tok/outputs"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

In [3]:
print("loading dataset...")
dataset = datasets.load_dataset("sarahpann/political-spectrum-questionnaire")

loading dataset...


In [4]:
dataset = dataset.map(lambda x: tokenizer(f"[INST] <<SYS>>\n + {SYSTEM_PROMPT} + \n<</SYS>>\n\n" + x['original_questions'] + " [/INST]", return_tensors="pt"), batched=False)

Map: 100%|██████████| 147/147 [00:00<00:00, 1777.67 examples/s]
Map: 100%|██████████| 147/147 [00:00<00:00, 1892.65 examples/s]
Map: 100%|██████████| 147/147 [00:00<00:00, 1905.67 examples/s]
Map: 100%|██████████| 147/147 [00:00<00:00, 1856.90 examples/s]


In [5]:
print("loading model...")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", load_in_8bit=True)


loading model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.96s/it]
generation_config.json: 100%|██████████| 188/188 [00:00<00:00, 962kB/s]


In [6]:
model = model.eval()

In [7]:
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "original_questions"])

In [8]:
auth_responses_and_reps = {}
lib_responses_and_reps = {}
left_responses_and_reps = {}
right_responses_and_reps = {}

for i in tqdm.tqdm(range(len(dataset['auth_dataset']))):
    output = model.generate(torch.tensor(dataset['auth_dataset'][i]['input_ids'].to("cuda")), max_new_tokens=100, output_hidden_states=True, return_dict_in_generate=True)
    response = tokenizer.decode(output.sequences[0])
    hidden_states = output.hidden_states
    # write these to a file
    auth_responses_and_reps[dataset['auth_dataset'][i]['original_questions']] = [response, hidden_states]

with open(OUTPUT_FILE + "_auth", "w") as f:
    json.dump(auth_responses_and_reps, f)

  output = model.generate(torch.tensor(dataset['auth_dataset'][i]['input_ids'].to("cuda")), max_new_tokens=100, output_hidden_states=True, return_dict_in_generate=True)
  4%|▍         | 6/147 [02:30<59:05, 25.14s/it]  


KeyboardInterrupt: 