In [None]:
# !pip install git+https://github.com/cma1114/enhanced_hooking.git

In [None]:
!pip install -q -U torch transformers matplotlib pandas scikit-learn seaborn datasets

import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import json
import os
from datetime import datetime
from glob import glob
import torch.nn.functional as F
import random
from collections import defaultdict
# from enhanced_hooking import get_activations, add_activations_and_generate, clear_hooks, get_activations_and_generate
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from datasets import load_dataset
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Hugging Face access token used for the gated model download.
# It is read from the HF_TOKEN environment variable so that no credential is
# stored in the notebook; a token that was ever committed in a notebook should
# be treated as leaked and revoked.
# When the variable is unset this is None.
# NOTE(review): with token=None, from_pretrained is expected to fall back to
# locally cached `huggingface-cli login` credentials — confirm for your setup.
HF_TOKEN = os.environ.get("HF_TOKEN")
def load_model(model_path, device, center_weights=True):
    """Load a causal LM in float16, move it to `device`, and attach its tokenizer.

    Args:
        model_path: Identifier or path handed to `from_pretrained` for both the
            model and the tokenizer.
        device: Target passed to `model.to(...)`.
        center_weights: When true, subtract the mean from every parameter whose
            last two dotted name components match one of the suffixes below,
            printing the name, the resulting mean and the size of each.

    Returns:
        The model, with the tokenizer stored on it as `model.tokenizer`
        (left padding, pad token id set to the EOS token id).
    """
    # Parameter-name suffixes selected for mean-centering.
    # NOTE(review): these look like GPT-2-style names; presumably nothing matches
    # for other architectures — confirm before enabling center_weights.
    centered_suffixes = ('wte.weight', 'wpe.weight', 'c_proj.weight', 'c_proj.bias')

    lm = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        token=HF_TOKEN,
    )
    lm = lm.to(device)

    if center_weights:
        for param_name, weights in lm.named_parameters():
            suffix = '.'.join(param_name.split('.')[-2:])
            if suffix not in centered_suffixes:
                continue
            weights.data -= weights.data.mean()
            print(param_name, weights.data.mean(), weights.size())

    tok = AutoTokenizer.from_pretrained(model_path, token=HF_TOKEN)
    # Left padding for batching: right (default in gpt2) for training, left for generation.
    tok.padding_side = "left"
    tok.pad_token_id = tok.eos_token_id
    lm.tokenizer = tok
    return lm


# Run configuration for the model load below.
model_path: str = "meta-llama/Llama-2-13b-chat-hf"  # even on an A40 I have to load 13b in half precision
device: str = "cuda" if torch.cuda.is_available() else "cpu"
center_weights = False

# Drop the reference to any model left over from an earlier run of this cell,
# then collect garbage and clear the CUDA cache before loading again.
model = None
import gc
gc.collect()
torch.cuda.empty_cache()

# Gradient tracking is switched off globally.
_ = torch.set_grad_enabled(False)

model = load_model(model_path, device, center_weights=center_weights)

In [None]:
# Directory that input data files are read from (used as a string prefix).
datadir = "data/"

# Chat-format delimiters placed around the user turn and the system prompt.
user_tag = "[INST] "
asst_tag = " [/INST]"
sys_tag_open = "<<SYS>> "
sys_tag_closed = " <</SYS>>"

# Number of hidden layers reported by the loaded model's config.
model_numlayers = model.config.num_hidden_layers

def strip_quotes(s):
    """Return `s` without its first and last character when both are the same quote mark.

    Only strings longer than one character that start and end with `"`, or start
    and end with `'`, are trimmed; anything else is returned unchanged.
    """
    if len(s) <= 1:
        return s
    for quote in ('"', "'"):
        if s.startswith(quote) and s.endswith(quote):
            return s[1:-1]
    return s
# Number of prompts grouped into one batch.
batch_size = 32

# System-prompt personas for the two conditions.
pos_persona, neg_persona = (
    "Pretend that you are always honest, no matter what.",
    "Pretend that you are dishonest when it serves you.",
)

# Answer labels matching the "(A) True" / "(B) False" choices in the prompt.
pos_label, neg_label = "(A)", "(B)"

# NOTE(review): the two settings below are not referenced in the visible cells;
# their meaning is not established here.
key_token_offset = 1
pair_statements = False

# def template(persona: str, stmt: str, label: str) -> str:
#     return f"{user_tag}{sys_tag_open}{persona}{sys_tag_closed} Question: Is the following claim true or false? \"{strip_quotes(stmt)}\"\n\nChoices:\n (A) True\n (B) False\n\nAnswer:{asst_tag} {label}"

In [None]:
# Evaluation set: statements are read from the "statement" column further down.
df = pd.read_csv(f"{datadir}gpt4_facts.csv")

def template_test(suffix: str) -> str:
    """Build the true/false multiple-choice prompt for one claim.

    The claim text `suffix` is inserted verbatim after the question, followed by
    the (A) True / (B) False choices; the whole prompt is wrapped in `user_tag`
    and `asst_tag`, and no answer label is appended.
    """
    prompt = f"{user_tag}Question: Is the following claim true or false? {suffix}\n\nChoices:\n (A) True\n (B) False\n\nAnswer:{asst_tag}"
    return prompt

# One prompt per row of the "statement" column, in dataframe order.
prompts_to_be_steered = [template_test(claim) for claim in df['statement']]

In [None]:
# from enhanced_hooking import zeroout_projections_and_generate
# sampling_kwargs = {"use_cache": True#false
#                    ,"pad_token_id": model.tokenizer.eos_token_id, "max_new_tokens": 30, "do_sample": False, "repetition_penalty": 1.1}
# Decoding settings forwarded unchanged to model.generate().
# NOTE(review): per the transformers generation docs, `temperature` and `top_p`
# only apply when sampling, so with do_sample=False they are expected to be
# ignored (possibly with a warning), while penalty_alpha > 0 together with
# top_k > 1 is expected to select contrastive search — confirm against the
# installed transformers version.
sampling_kwargs = {
    "use_cache": True,
    "pad_token_id": model.tokenizer.eos_token_id,
    "max_new_tokens": 60,
    "temperature": 0.5,
    "top_p": 0.3,
    "do_sample": False,  # True
    "repetition_penalty": 1.1,  # 2.0
    "penalty_alpha": 0.6,
    "top_k": 4,
}

# Output location. `outputdir` was previously used without being assigned in any
# earlier cell, which raises NameError on a fresh kernel; define it here and
# make sure the directory exists so the result files can be written.
# NOTE(review): "output/" is an assumed default — point it wherever results belong.
outputdir = "output/"
os.makedirs(outputdir, exist_ok=True)
fname = "base_gpt4facts_llama2-13b-freaky"
main_file_path = outputdir+fname+".json"
tmp_file_path = outputdir+fname+"_tmp.json"

# Split the prompts into consecutive chunks of at most `batch_size`.
batched_inputs = [
    prompts_to_be_steered[p:p+batch_size] for p in range(0, len(prompts_to_be_steered), batch_size)
]
model.tokenizer.padding_side = "left"
results = []

In [None]:
def do_batch_decode(generated_tokens, input_ids, tokenizer):
    """Decode only the columns of `generated_tokens` that come after the prompt.

    Args:
        generated_tokens: 2-D tensor of token ids indexed as [row, position].
            Assumed to hold the prompt in its first `input_ids.shape[1]` columns
            followed by the newly generated ids — TODO confirm for every caller.
        input_ids: 2-D prompt tensor; only its second dimension (the prompt
            length) is read.
        tokenizer: Object exposing `batch_decode`.

    Returns:
        Whatever `tokenizer.batch_decode` returns for the sliced ids, called with
        `skip_special_tokens=True` and `clean_up_tokenization_spaces=False`.
    """
    prompt_len = input_ids.shape[1]
    # Every row has the same number of trailing columns, so one slice replaces
    # the former per-row copy into a pad-filled buffer (and no longer needs
    # tokenizer.pad_token_id to be set). The result is a CPU long tensor, as before.
    new_tokens = generated_tokens[:, prompt_len:].to(device="cpu", dtype=torch.long)
    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)

In [None]:
# Generate a completion for every prompt, one batch at a time, and checkpoint
# the accumulated results to disk after each batch.
model.to(device)  # loop-invariant: place the model once instead of on every batch
results = []  # start from empty so re-running this cell does not append duplicates
for batch in batched_inputs:
    inputs = model.tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    generated_tokens = model.generate(**inputs, **sampling_kwargs)
    original_output = do_batch_decode(generated_tokens, inputs['input_ids'], model.tokenizer)

    # One record per prompt: the full prompt text and its decoded completion.
    results.extend(
        {
            "sentence": current_prompt,
            "answer_neut": current_original_output,
        }
        for current_prompt, current_original_output in zip(batch, original_output)
    )

    print(f"Finished sentence {len(results)}")

    # Write to a temporary file first and then swap it in with os.replace, so a
    # failed write does not leave a truncated main results file. Failures are
    # reported but do not stop generation (best-effort checkpointing).
    try:
        with open(tmp_file_path, "w") as rfile:
            json.dump(results, rfile)
        os.replace(tmp_file_path, main_file_path)
    except Exception as e:
        print(f"Failed to write data: {str(e)}")