## lm-evaluation-harness steering for Llama-3.2-1B

In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

In [None]:
# Install lm-eval pinned to version 0.4.8 (quiet output); the `lm_eval` CLI
# it provides is invoked in the evaluation cells further down.
%pip install --quiet lm-eval==0.4.8

In [None]:
# Selects where the Hugging Face token is read from: 'colab', 'kaggle', or
# any other value for a local `.env` / environment-variable setup.
env = 'kaggle'

if env == 'colab':
    # Colab: read the token from the notebook's stored user data.
    from google.colab import userdata

    HF_TOKEN = userdata.get('HF_TOKEN')
elif env == 'kaggle':
    # Kaggle: read the token from the Kaggle user-secrets client.
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
else:
    # Fallback: let python-dotenv populate the environment, then read the
    # token from it.
    from dotenv import load_dotenv
    import os

    load_dotenv()
    # os.getenv returns None when HF_TOKEN is not set.
    HF_TOKEN = os.getenv("HF_TOKEN")

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token read in the previous
# cell. NOTE(review): presumably required because the meta-llama checkpoint
# used below is access-gated -- confirm.
login(token=HF_TOKEN)

In [None]:
import torch
import os
from pathlib import Path
import pprint

## Steering Configuration

In [None]:
# Absolute path of the parent of the current working directory.
# NOTE(review): not referenced by any cell visible in this notebook section.
project_dir = Path().resolve().parent

In [None]:
# Location of the saved LAPE result file under the Kaggle input directory.
lape_result_path = Path("/kaggle/input/lape-result/lape.pt")
# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects, which can execute code embedded in the file -- only load a file
# from a trusted source.
lape_result = torch.load(lape_result_path, weights_only=False)

In [None]:
# Language names stored with the LAPE result; generate_config uses a
# language's position in this sequence to index the per-language entries of
# lape_result.
sorted_lang = lape_result['sorted_lang']
# Bare expression so the notebook displays the value as the cell output.
sorted_lang

In [None]:
# Layer names "layers.0.mlp" ... "layers.15.mlp" in ascending order. The
# position of a name in this list is the layer index used to look up
# lape_result entries, and the name itself becomes the steering-config key.
layers = [f"layers.{layer_number}.mlp" for layer_number in range(16)]


def generate_config(lang, multiplier, lape_value_type):
    """Build the per-layer steering configuration for one language.

    Reads the notebook-level ``sorted_lang``, ``layers`` and ``lape_result``.
    For every layer whose ``sae_features`` entry for ``lang`` is non-empty,
    the feature rows are scaled by the matching ``lape_value_type`` values and
    summed over the first dimension into a single steering vector.

    Args:
        lang: Language name; its position in ``sorted_lang`` selects the
            per-language entries of ``lape_result``.
        multiplier: Stored unchanged as the entry's ``steering_coefficient``.
        lape_value_type: Key of ``lape_result`` holding the per-feature
            values used as weights.

    Returns:
        A dict mapping layer name to an entry with the keys
        ``steering_vector``, ``bias`` (always ``None``),
        ``steering_coefficient`` and ``action`` (always ``"add"``). Layers
        with no features for ``lang`` are left out.

    Raises:
        ValueError: If ``lang`` is not found in ``sorted_lang`` (assuming
            ``sorted_lang`` is a list).
    """
    lang_position = sorted_lang.index(lang)
    config_by_layer = {}

    for layer_position, layer_name in enumerate(layers):
        features = lape_result["sae_features"][lang_position][layer_position]

        if features.numel() != 0:
            # assumes one weight per feature row, i.e. weights is 1-D and
            # features is (n_features, dim) -- TODO confirm against the
            # code that produced lape.pt. unsqueeze(1) turns the weights
            # into a column so they broadcast across each row.
            weights = lape_result[lape_value_type][lang_position][layer_position].unsqueeze(1)
            weighted_features = weights * features

            config_by_layer[layer_name] = {
                "steering_vector": weighted_features.sum(dim=0, keepdim=True),
                "bias": None,
                "steering_coefficient": multiplier,
                "action": "add",
            }

    return config_by_layer

In [None]:
# Languages to build steering configs for and to loop over in the evaluation
# cells; generate_config looks each one up in sorted_lang.
langs = ["German", "French", "Spanish"]
# Stored as the steering_coefficient of every layer entry and embedded in the
# config/result path names.
multiplier = -1
# Key of lape_result whose values weight the SAE features in generate_config.
lape_value_type = "final_indice_global_max_active"

In [None]:
# Build, print and save one steering config per language.
steer_output_paths = {}  # language -> path of the saved .pt config file
steer_configs = {}  # language -> in-memory steering config

for lang in langs:
    # generate_config takes three required arguments; lape_value_type selects
    # which lape_result entry supplies the per-feature weights.
    steer_config = generate_config(lang, multiplier, lape_value_type)
    steer_configs[lang] = steer_config

    print(f"Steer config for {lang}:")
    pprint.pprint(steer_config)
    print()

    # Kept as a plain string because the path is interpolated into the
    # lm_eval shell command in the evaluation cells.
    steer_output_path = f"/kaggle/working/configs/{lang}/steer_config_mult_{multiplier}.pt"
    steer_output_paths[lang] = steer_output_path

    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(steer_output_path), exist_ok=True)
    torch.save(steer_config, steer_output_path)

## Llama-3.2-1B Evaluation on XNLI

In [None]:
# language -> output location passed to lm_eval --output_path for the XNLI
# run; the multiplier is part of the path name.
xnli_output_paths = {}

for lang in langs:
    xnli_output_paths[lang] = f'/kaggle/working/eval_result/Llama-3.2-1B/xnli_steer/{lang}/mult_{multiplier}'

In [None]:
# Run the lm_eval CLI once per language on the xnli task, using the "steered"
# model type with meta-llama/Llama-3.2-1B and that language's saved steering
# config. IPython substitutes the {...} expressions from the notebook
# namespace before the shell command runs. Per-sample logs are written
# alongside the results (--log_samples).
for lang in langs:
	!lm_eval --model steered \
		--model_args pretrained=meta-llama/Llama-3.2-1B,steer_path={steer_output_paths[lang]} \
		--tasks xnli \
		--device cuda:0 \
		--batch_size auto:4 \
		--output_path {xnli_output_paths[lang]} \
		--log_samples 

## Llama-3.2-1B Evaluation on PAWS-X

In [None]:
# language -> output location passed to lm_eval --output_path for the PAWS-X
# run; the multiplier is part of the path name.
pawsx_output_paths = {}

for lang in langs:
    pawsx_output_paths[lang] = f'/kaggle/working/eval_result/Llama-3.2-1B/pawsx_steer/{lang}/mult_{multiplier}'

In [None]:
# Run the lm_eval CLI once per language on the pawsx task, using the "steered"
# model type with meta-llama/Llama-3.2-1B and that language's saved steering
# config. IPython substitutes the {...} expressions from the notebook
# namespace before the shell command runs. Per-sample logs are written
# alongside the results (--log_samples).
for lang in langs:
	!lm_eval --model steered \
		--model_args pretrained=meta-llama/Llama-3.2-1B,steer_path={steer_output_paths[lang]} \
		--tasks pawsx \
		--device cuda:0 \
		--batch_size auto:4 \
		--output_path {pawsx_output_paths[lang]} \
		--log_samples