In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, LlamaTokenizer
import random
from tqdm import tqdm
import re, torch
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
from models.modelings_alignable_llama import *
from utils.train_utils import *


#### Loading Alpaca-test

In [2]:
# Settings handed to the alignable model: a layer index and a two-element
# token range (their exact interpretation lives in the model class).
alignment_config = dict(layer=15, token_range=[81, 82])

# Load the `alpaca_test` checkpoint, instantiated in bfloat16.
model = AlignableLlamaForCausalLM.from_pretrained(
    "../../alpaca_test/",
    torch_dtype=torch.bfloat16,
    alignment_config=alignment_config,
)
# Bind the result to `_` so the cell does not echo the module repr.
_ = model.to("cuda")

loading configuration file ../../alpaca_test/config.json
Model config LlamaConfig {
  "_name_or_path": "../../alpaca_7b_bf16/",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 0,
  "eos_token_id": 1,
  "hidden_act": "silu",
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "max_position_embeddings": 2048,
  "max_sequence_length": 2048,
  "model_type": "llama",
  "num_attention_heads": 2,
  "num_hidden_layers": 32,
  "pad_token_id": -1,
  "rms_norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.28.0.dev0",
  "use_cache": true,
  "vocab_size": 32001
}

loading weights file ../../alpaca_test/pytorch_model.bin
Instantiating AlignableLlamaForCausalLM model under default dtype torch.bfloat16.
Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": -1,
  "transformers_version": "4.28.0.dev0"
}

All model checkpoint weight

#### Loading Alpaca-7B

In [2]:
# Settings handed to the alignable model: a layer index and a two-element
# token range (their exact interpretation lives in the model class).
alignment_config = dict(layer=15, token_range=[81, 82])

# Load the `alpaca_7b` checkpoint in bfloat16; this rebinds `model`.
model = AlignableLlamaForCausalLM.from_pretrained(
    "../../alpaca_7b/",
    torch_dtype=torch.bfloat16,
    alignment_config=alignment_config,
)
# Bind the result to `_` so the cell does not echo the module repr.
_ = model.to("cuda")

# Tokenizer read from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(
    cache_dir=CACHE_DIR,
    pretrained_model_name_or_path="../../alpaca_7b/",
)

loading configuration file ../../alpaca_7b/config.json
Model config LlamaConfig {
  "_name_or_path": "/self/scr-sync/nlp/huggingface_hub_llms/llama-7b",
  "architectures": [
    "LLaMAForCausalLM"
  ],
  "bos_token_id": 0,
  "eos_token_id": 1,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "max_sequence_length": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": -1,
  "rms_norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.28.0.dev0",
  "use_cache": true,
  "vocab_size": 32001
}

loading weights file ../../alpaca_7b/pytorch_model.bin.index.json
Instantiating AlignableLlamaForCausalLM model under default dtype torch.bfloat16.
Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": -1,
  "transformers_version": "4.28.

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing AlignableLlamaForCausalLM.

Some weights of AlignableLlamaForCausalLM were not initialized from the model checkpoint at ../../alpaca_7b/ and are newly initialized: ['model.intervention_population', 'model.intervention_boundaries', 'model.rotate_layer.parametrizations.weight.original', 'model.inverse_rotate_layer.lin_layer.parametrizations.weight.original', 'model.temperature']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
loading configuration file ../../alpaca_7b/generation_config.json
Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": -1,
  "transformers_version": "4.28.0.dev0"
}

loading file tokenizer.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalizat

In [None]:
# Alpaca-style prompt templates.
#
# Both are filled with the %-operator, e.g.
#     alpaca_prompt_template % (instruction, input_text)
#     alpaca_prompt_template_no_inputs % (instruction,)
# so they are plain strings. An f-prefix with no `{}` fields does nothing
# today and would start interpolating silently if a literal brace were ever
# added to the text.

# Instruction + input variant: two %s slots (instruction, then input).
alpaca_prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
%s

### Input:
%s

### Response:
"""

# Instruction-only variant: one %s slot. The preamble does not mention an
# input, because this prompt has no "### Input:" section (this matches the
# instruction-only preamble of the Alpaca prompt format).
alpaca_prompt_template_no_inputs = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
%s

### Response:
"""

In [None]:
# Single-prompt sanity check: feed one filled Alpaca prompt through the model
# and inspect the prediction at the last position, both as the overall argmax
# token and as a forced choice between the "Yes" and "No" token ids.
alpaca_instruction = """Please say yes only if Sam is heavier than John, otherwise no."""
prompt = alpaca_prompt_template % (alpaca_instruction, "Sam weigh 112 lbs, John weigh 163 lbs")

# alpaca_instruction = """Does Donald Trump use computer?"""
# prompt = alpaca_prompt_template_no_inputs % (alpaca_instruction)

# Tokenize once and reuse the result for both tensors.
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids.to("cuda")
attention_mask = encoded_prompt.attention_mask.to("cuda")

model.eval()
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(
        input_ids,
        attention_mask=attention_mask
    )

# Logits at the final sequence position, one row per batch element.
last_position_logits = outputs.logits[:, -1]
pred_labels = torch.argmax(last_position_logits, dim=-1)
generated_tokens = tokenizer.decode(pred_labels[0])

# Two-alternative forced choice between the "Yes" and "No" vocabulary entries.
# NOTE: despite the `_prob` suffix these are raw logits, not probabilities;
# the names are kept so later cells that reference them keep working.
afc_1 = tokenizer.convert_tokens_to_ids("Yes")
afc_2 = tokenizer.convert_tokens_to_ids("No")
afc_1_prob = last_position_logits[0][afc_1]
afc_2_prob = last_position_logits[0][afc_2]
afc = "Yes" if afc_1_prob > afc_2_prob else "No"
print(f"afc label = {afc} ({afc_1_prob}/{afc_2_prob}) ; pred label = {generated_tokens}")
    

#### Factual: Pricing Tag Game

In [None]:
# Sample raw examples for the "pricing_tag" game from the project sampler
# (5000 is presumably the number of examples requested - confirm in train_utils).
raw_prealign = factual_sampler(tokenizer, 5000, game="pricing_tag")

# The first two sampler outputs become the `input_ids` and `labels` columns,
# served back as torch tensors.
prealign_dataset = Dataset.from_dict(
    dict(input_ids=raw_prealign[0], labels=raw_prealign[1])
).with_format("torch")

# Batches of 8 for the evaluation loop.
prealign_dataloader = DataLoader(prealign_dataset, batch_size=8)

In [None]:
# Accuracy of the model on the prealign set with no intervention arguments:
# for every example, compare the label at the last position with the argmax
# of the logits at the last position.
total_count = 0
correct_count = 0
model.eval()
with torch.no_grad():
    for inputs in tqdm(prealign_dataloader):
        # Move every tensor field of the batch onto the model's device;
        # non-tensor fields are passed through untouched.
        inputs = {
            k: v.to(model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        # aligning forward!
        outputs = model(
            input_ids=inputs['input_ids'],
            labels=inputs['labels'],
        )

        actual_test_labels = inputs['labels'][:, -1]
        pred_test_labels = torch.argmax(outputs.logits[:, -1], dim=-1)

        correct_labels = (actual_test_labels == pred_test_labels)

        total_count += len(correct_labels)
        correct_count += correct_labels.sum().item()

# An empty dataloader yields NaN instead of a ZeroDivisionError.
current_acc = round(correct_count / total_count, 2) if total_count else float("nan")
print(f"[WARNING: THIS NEEDS TO BE GOOD!] prealign task accuracy: {current_acc}")

#### (Experimental) Factual: Pricing Tag Game Counterfactual

In [31]:
# Load the trained alignment checkpoint and install it on the model.
#
# With `baselining = True` the learned rotation is replaced by a freshly drawn
# random orthogonal matrix (only the learned boundaries come from the
# checkpoint); with `baselining = False` the learned rotation is loaded too.
#
# The checkpoint is deserialized onto the CPU first so loading does not depend
# on the device it was saved from; tensors are moved explicitly below.
checkpoint_state_dict = torch.load(
    "../results_alpaca-7b/alpaca-7B.task.pricing_tag_lub.seed.42.intl.15.intr.81.82/pytorch-rotate-best.bin",
    map_location="cpu",
)
baselining = True
if baselining:
    print("Baselining with a random rotation + learned boundary")
    rotate_weight = model.model.rotate_layer.parametrizations.weight.original
    n = rotate_weight.shape[0]
    # `orthogonal_` relies on a QR factorization, which torch documents for
    # float/double/complex inputs only. Draw the matrix in float32 on the
    # parameter's device, then cast to the parameter's dtype, rather than
    # initializing a bfloat16 tensor directly.
    rand_weight = torch.empty(n, n, device=rotate_weight.device, dtype=torch.float32)
    torch.nn.init.orthogonal_(rand_weight)
    rand_weight = rand_weight.to(rotate_weight.dtype)
    rotate_weight.data = rand_weight.data
else:
    model.model.rotate_layer.load_state_dict(
        checkpoint_state_dict['rotate_layer']
    )

# The learned boundaries are used in both modes. Keep them on the device the
# model parameter already lives on (the checkpoint copy is on the CPU).
intervention_boundaries = model.model.intervention_boundaries
intervention_boundaries.data = checkpoint_state_dict['intervention_boundaries'].data.to(
    intervention_boundaries.device
)

Baselining with a random rotation + learned boundary


In [32]:
# Sample raw counterfactual examples from the lower- and upper-bound example
# samplers (1000 is presumably the number of examples - confirm in train_utils).
raw_data = bound_alignment_sampler(
    tokenizer,
    1000,
    [lower_bound_alignment_example_sampler, upper_bound_alignment_example_sampler],
)

# Keep exactly the first four sampler outputs, in order.
raw_test = tuple(raw_data[position] for position in range(4))

# Column order mirrors the order of the sampler outputs kept above.
test_dataset = Dataset.from_dict(
    dict(
        zip(
            ("input_ids", "source_input_ids", "labels", "intervention_ids"),
            raw_test,
        )
    )
).with_format("torch")

# Batches of 8 for the evaluation loop.
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [33]:
# Counterfactual evaluation on `test_dataloader`.
#
# For every batch the model is run twice: once on the source inputs to obtain
# their rotated hidden states, then on the base inputs with those source
# states and the intervention ids supplied. The prediction at the last
# position is compared with the label at the last position.
total_count = 0
correct_count = 0
# Set eval mode here so this cell does not rely on an earlier cell having done it.
model.eval()
with torch.no_grad():
    for inputs in tqdm(test_dataloader):
        # Move every tensor field of the batch onto the model's device;
        # non-tensor fields are passed through untouched.
        inputs = {
            k: v.to(model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        # aligning forward!
        source_hidden_states = model(
            input_ids=inputs['source_input_ids'],
            output_rotated_hidden_states_only=True
        ).rotated_hidden_states
        outputs = model(
            input_ids=inputs['input_ids'],
            source_hidden_states=source_hidden_states,
            intervention_ids=inputs['intervention_ids'],
            labels=inputs['labels']
        )

        actual_test_labels = inputs['labels'][:, -1]
        pred_test_labels = torch.argmax(outputs.logits[:, -1], dim=-1)
        correct_labels = (actual_test_labels == pred_test_labels)

        total_count += len(correct_labels)
        correct_count += correct_labels.sum().item()

# An empty dataloader yields NaN instead of a ZeroDivisionError.
current_acc = round(correct_count / total_count, 2) if total_count else float("nan")
# This is the counterfactual evaluation, not the prealign check, so the
# message no longer reuses the prealign wording or its "needs to be good"
# warning (a random-rotation baseline is not expected to score well).
print(f"counterfactual task accuracy: {current_acc}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:51<00:00,  2.44it/s]




