In [None]:
import html
import os
import textwrap

import transformer_lens

from src.data import HarmfulHarmlessData
from src.abliterator import Abliterator

# Last expression of the cell, so the notebook displays the value of
# OFFICIAL_MODEL_NAMES (presumably the model names TransformerLens can load,
# handy when choosing `model_name` below — confirm against the library docs).
transformer_lens.loading_from_pretrained.OFFICIAL_MODEL_NAMES

In [None]:
# Model identifier passed to Abliterator(model_name=...) in a later cell.
model_name = "Qwen/Qwen1.5-4B-Chat"
# Size settings passed positionally to HarmfulHarmlessData.
# NOTE(review): presumably the number of instructions per class in the
# train / test splits — confirm against src.data.
n_inst_train = 2048
n_inst_test = 32

In [None]:
# Build the dataset object from the two size settings of the config cell.
# Later cells read its `harmful` / `harmless` attributes, each indexed by
# "train" or "test".
dataset = HarmfulHarmlessData(n_inst_train, n_inst_test)

In [None]:
# Construct the Abliterator for the configured model; later cells reach the
# wrapped model through `abliterator.model`.
# NOTE(review): presumably this loads the model weights — confirm in src.abliterator.
abliterator = Abliterator(model_name=model_name)

In [None]:
# Tokenize the dataset with the model's own tokenizer, also passing the device
# recorded in the model config.
# NOTE(review): presumably so the token tensors are placed on that device — TODO confirm.
dataset.tokenize_data(abliterator.model.tokenizer, abliterator.model.cfg.device)

In [None]:
# Report the number of harmful examples in the train split, then the test split.
for split_name in ("train", "test"):
    print(len(dataset.harmful[split_name]))

In [None]:
# Preview the training data of each class: under a heading, print the
# 'content' field of the last element of examples 0, 5, 10 and 15.
preview_indices = range(0, 20, 5)
preview_sections = (
    ("harmful", "Harmful instructions (train set):"),
    ("harmless", "Harmless instructions (train set):"),
)
for class_attr, heading in preview_sections:
    print(heading)
    for idx in preview_indices:
        examples = getattr(dataset, class_attr)["train"]
        print(f"\t{examples[idx][-1]['content']}")

In [None]:
# Run the Abliterator's activation-caching step over the tokenized dataset.
# NOTE(review): presumably records activations for the harmful and harmless
# prompts, to be used by the refusal-direction cells below — confirm in src.abliterator.
abliterator.cache_activations(dataset)

In [None]:
# Use the harmful test split as the prompt set for every generation cell below.
sample_instructions = dataset.harmful["test"]

In [None]:
# Generate completions for the sample prompts. This runs before the
# `ablate_layers` call later in the notebook, hence "baseline".
# NOTE(review): this call relies on generate()'s default length, while the
# post-ablation run passes max_tokens_generated=256 — confirm the two outputs
# are meant to be compared at different lengths.
baseline_generations = abliterator.generate(sample_instructions)

In [None]:
# Print each test prompt followed by its baseline completion, with a divider
# line between consecutive samples.
last_index = len(sample_instructions) - 1
for i, (sample_instruction, baseline_generation) in enumerate(zip(sample_instructions, baseline_generations)):
    print(sample_instruction[0]["content"])
    print(f"{baseline_generation.strip()}")
    # No divider after the final sample. The earlier check
    # `i < len(sample_instructions)` could never be false, so a trailing
    # divider was always printed.
    if i < last_index:
        print("\t------------\t")

In [None]:
# Run the refusal-direction test over the sample prompts.
# The result is iterated later as a sequence of entries exposing "cache_key"
# and "intervention_generation" (the latter indexed per sample prompt).
intervention_generations = abliterator.test_refusal_directions(sample_instructions)

In [None]:
# Aggregate the intervention results into `best_layers`. Later cells read
# "cache_key" and "count" from each entry and ablate only the first entry.
# NOTE(review): presumably ordered best-first — confirm in src.abliterator.
best_layers = abliterator.aggregate_best_layers(intervention_generations)

In [None]:
# HTML template for the output. Doubled braces are literal CSS braces for
# str.format; the single `{}` placeholder receives the generated body.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Output</title>
    <style>
        .green {{ color: green; }}
        .red {{ color: red; }}
    </style>
</head>
<body>
    {}
</body>
</html>
"""


def _wrap_for_html(text, width=100, indent="&emsp;"):
    """Return repr(text) wrapped, HTML-escaped and indented for a <p> element.

    The text is wrapped before escaping so the line breaks do not depend on
    the length of the inserted entities, and the indent entity is added after
    escaping so it is not itself escaped. The indent counts towards `width`.
    """
    wrapped_lines = textwrap.wrap(repr(text), width=width - len(indent))
    return "\n".join(indent + html.escape(line, quote=False) for line in wrapped_lines)


# Generate the HTML content. Fragments are collected in a list and joined once
# instead of growing a string inside the loop.
html_parts = []

for i, instruction in enumerate(sample_instructions):
    # Prompts and generations are arbitrary text: escape them so characters
    # such as "<" or "&" cannot break the markup of the report.
    html_parts.append(f"<p>Instruction {i}: {html.escape(repr(instruction), quote=False)}</p>")
    html_parts.append('<p class="green">BASELINE COMPLETION:</p>')
    html_parts.append(f"<p>{_wrap_for_html(baseline_generations[i])}</p>")

    for layer_candidate in intervention_generations:
        generation = layer_candidate["intervention_generation"][i]
        # Only list candidates whose completion contains none of the
        # abliterator's negative tokens.
        if not any(word in generation for word in abliterator.negative_tokens):
            cache_key = html.escape(str(layer_candidate["cache_key"]), quote=False)
            html_parts.append(f'<p class="red">INTERVENTION COMPLETION: (Cache key: {cache_key})</p>')
            html_parts.append(f"<p>{_wrap_for_html(generation)}</p>")

html_content = "".join(html_parts)
html_output = html_template.format(html_content)

# Write the HTML content to a file. The encoding is explicit so the bytes on
# disk match the UTF-8 charset declared in the template on every platform.
with open("output.html", "w", encoding="utf-8") as f:
    f.write(html_output)

print("Output written to output.html")

In [None]:
# List every aggregated candidate with its cache key and its count.
for candidate in best_layers:
    key, hits = candidate["cache_key"], candidate["count"]
    print(f"\t- cache key: `{key}`, count: {hits}")

In [None]:
# Ablate using only the first entry of `best_layers`; the [0:1] slice keeps it
# wrapped in a sequence rather than passing the bare entry.
# NOTE(review): presumably this modifies the model in place, so re-running the
# earlier generation cells afterwards would no longer give baseline output — confirm.
abliterator.ablate_layers(best_layers[0:1])

In [None]:
# Generate again for the same sample prompts, now after the `ablate_layers`
# call, with an explicit max_tokens_generated of 256.
orthogonalized_generations = abliterator.generate(sample_instructions, max_tokens_generated=256)

In [None]:
# Print each test prompt followed by its post-ablation completion, with a
# divider line between consecutive samples.
last_index = len(sample_instructions) - 1
for i, (sample_instruction, orthogonalized_generation) in enumerate(zip(sample_instructions, orthogonalized_generations)):
    print(sample_instruction[0]["content"])
    print(f"{orthogonalized_generation.strip()}")
    # No divider after the final sample. The earlier check
    # `i < len(sample_instructions)` could never be false, so a trailing
    # divider was always printed.
    if i < last_index:
        print("\t------------\t")

In [None]:
from huggingface_hub import login

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset, token=None makes login()
# prompt for the token instead.
# SECURITY: a token was previously hardcoded in this cell; it is exposed in the
# file history and should be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

In [None]:
# Final step, run after the Hugging Face login above.
# NOTE(review): presumably converts the ablated weights into a format that can
# be saved or uploaded — confirm in src.abliterator.
abliterator.convert_weights()