## lm-evaluation-harness steering for Llama-3.2-1B

In [1]:
# Record the interpreter version used for this run.
!python --version

Python 3.11.11


In [2]:
# Install the evaluation harness at a pinned version (%pip installs into the running kernel's environment).
%pip install --quiet lm-eval==0.4.8

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [

In [3]:
# Where the Hugging Face token is read from: 'kaggle', 'colab', or any other
# value to fall back to a local .env file / process environment variable.
env = 'kaggle'

if env == 'kaggle':
    # Kaggle: the token is fetched through the notebook secrets client.
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
elif env == 'colab':
    # Colab: the token is fetched from the Colab user data store.
    from google.colab import userdata

    HF_TOKEN = userdata.get('HF_TOKEN')
else:
    # Anywhere else: load a .env file (if any), then read the environment.
    import os
    from dotenv import load_dotenv

    load_dotenv()
    HF_TOKEN = os.environ.get("HF_TOKEN")

In [4]:
from huggingface_hub import login

# Log in to the Hugging Face Hub with the token obtained in the previous cell.
login(token=HF_TOKEN)

In [5]:
import torch
import os
from pathlib import Path
import pprint

## Steering Configuration

In [6]:
def generate_config(layers, lang, multiplier, lape_result, lape_value_type):
    """Build the per-layer steering configuration for one language.

    For each entry of ``layers``, the language's SAE feature rows are weighted
    by the matching ``lape_value_type`` values and summed into one steering
    vector. Layers for which no feature was selected are omitted.

    Args:
        layers: Ordered hook-point names; they become the keys of the result.
            The position of a name in this list is the index used to read the
            per-language lists in ``lape_result``.
        lang: Language name; must occur in ``lape_result["sorted_lang"]``.
        multiplier: Stored unchanged as ``"steering_coefficient"``.
        lape_result: Mapping holding ``"sorted_lang"``, ``"sae_features"`` and
            the ``lape_value_type`` entry; the latter two are indexed as
            ``[language index][layer position]``.
        lape_value_type: Key of ``lape_result`` that holds the feature weights.

    Returns:
        dict mapping layer name to a dict with the keys ``"steering_vector"``,
        ``"bias"`` (always ``None``), ``"steering_coefficient"`` and
        ``"action"`` (always ``"add"``).

    Raises:
        ValueError: If ``lang`` is not in ``lape_result["sorted_lang"]``.
    """
    steer_config = {}
    # Take the language order from lape_result itself instead of a notebook
    # global, so the function does not depend on another cell having run.
    lang_index = lape_result["sorted_lang"].index(lang)

    # NOTE(review): layer_index is the position inside `layers`, not the layer
    # number in the name. If lape_result is indexed by absolute layer number,
    # a `layers` list that skips leading layers would read the wrong entries
    # -- confirm against how lape_result was produced.
    for layer_index, layer in enumerate(layers):
        sae_features = lape_result["sae_features"][lang_index][layer_index]

        # No features selected for this language at this layer: nothing to steer.
        if sae_features.numel() == 0:
            continue

        # Assumes weights are 1-D (one per feature row) so that unsqueeze(1)
        # broadcasts them across each row of sae_features -- TODO confirm.
        lape_value = lape_result[lape_value_type][lang_index][layer_index].unsqueeze(1)
        # Weighted sum over the feature rows, kept as a single-row tensor.
        steering_vector = torch.sum(lape_value * sae_features, dim=0, keepdim=True)

        steer_config[layer] = {
            "steering_vector": steering_vector,
            "bias": None,
            "steering_coefficient": multiplier,
            "action": "add",
        }

    return steer_config

## Llama-3.2-1B Evaluation on XWinograd

In [7]:
# Hook points to steer: the MLP blocks of layers 3 through 13.
# Layers 0-2 and 14-15 are not steered.
layers = [f"layers.{layer_no}.mlp" for layer_no in range(3, 14)]

In [8]:
# Languages to process; one steering config and one lm_eval run is produced per entry.
langs = ["English", "French", "Japanese", "Portuguese", "Russian", "Chinese"]
# Stored as "steering_coefficient" in every generated steering config and embedded in output paths.
multiplier = -0.2
# Key of lape_result that supplies the per-feature weights for the steering vectors.
lape_value_type = "final_indice_global_max_active"
# Task name passed to lm_eval via --tasks.
task = "xwinograd"

In [9]:
lape_result_path = "/kaggle/input/lape-result/lape_all.pt"
# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects, which can execute code embedded in the file. Only load files from a
# trusted source.
lape_result = torch.load(lape_result_path, weights_only=False)

In [10]:
# Language order stored in lape_result; a language's position here is the index
# used to read its per-language entries. Displayed for reference.
sorted_lang = lape_result['sorted_lang']
sorted_lang

['Bulgarian',
 'Chinese',
 'English',
 'French',
 'German',
 'Hindi',
 'Italian',
 'Japanese',
 'Korean',
 'Portuguese',
 'Russian',
 'Spanish',
 'Thai',
 'Turkish',
 'Vietnamese']

In [11]:
steer_output_paths = {}
steer_configs = {}

# Build, print and persist one steering config per evaluated language.
for lang in langs:
    steer_config = steer_configs[lang] = generate_config(
        layers, lang, multiplier, lape_result, lape_value_type
    )

    print(f"Steer config for {lang}:")
    pprint.pprint(steer_config)
    print()

    # One directory per language; the multiplier is part of the file name.
    steer_output_path = steer_output_paths[lang] = (
        f"/kaggle/working/configs/{lang}/steer_config_mult_{multiplier}.pt"
    )

    # Make sure the target directory exists before saving.
    Path(steer_output_path).parent.mkdir(parents=True, exist_ok=True)
    torch.save(steer_config, steer_output_path)

Steer config for English:
{'layers.11.mlp': {'action': 'add',
                   'bias': None,
                   'steering_coefficient': -0.2,
                   'steering_vector': tensor([[ 0.0011,  0.0084,  0.0212,  ..., -0.0126,  0.0265,  0.0016]])},
 'layers.3.mlp': {'action': 'add',
                  'bias': None,
                  'steering_coefficient': -0.2,
                  'steering_vector': tensor([[-0.0016,  0.0012, -0.0162,  ..., -0.0097,  0.0093,  0.0034]])},
 'layers.5.mlp': {'action': 'add',
                  'bias': None,
                  'steering_coefficient': -0.2,
                  'steering_vector': tensor([[-0.0076,  0.0086,  0.0017,  ..., -0.0071,  0.0072, -0.0010]])},
 'layers.7.mlp': {'action': 'add',
                  'bias': None,
                  'steering_coefficient': -0.2,
                  'steering_vector': tensor([[-0.0111, -0.0190, -0.0057,  ..., -0.0072, -0.0029,  0.0166]])}}

Steer config for French:
{'layers.11.mlp': {'action': 'add',
        

In [12]:
# Per-language directory passed to lm_eval as --output_path.
output_paths = {
    lang: f'/kaggle/working/eval_result/Llama-3.2-1B/{task}_steer/{lang}/mult_{multiplier}'
    for lang in langs
}

In [13]:
# Run lm_eval once per language with the "steered" model type. Each run loads
# that language's saved steering config (steer_path) and writes its results,
# including per-sample logs, to that language's output directory.
# The {...} expressions are interpolated from Python variables by IPython.
for lang in langs:
	!lm_eval --model steered \
		--model_args pretrained=meta-llama/Llama-3.2-1B,steer_path={steer_output_paths[lang]} \
		--tasks {task} \
		--device cuda:0 \
		--batch_size auto:4 \
		--output_path {output_paths[lang]} \
		--log_samples 

2025-05-12 13:21:59.076908: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747056119.257363      79 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747056119.315084      79 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
config.json: 100%|█████████████████████████████| 843/843 [00:00<00:00, 4.99MB/s]
tokenizer_config.json: 100%|███████████████| 50.5k/50.5k [00:00<00:00, 3.46MB/s]
tokenizer.json: 100%|██████████████████████| 9.09M/9.09M [00:01<00:00, 8.47MB/s]
special_tokens_map.json: 100%|█████████████████| 301/301 [00:00<00:00, 1.80MB/s]
model.safetensors: 100%|███████████████████▉| 2.47G/2.47G [00:06<00:00, 398MB/s]
generation_config.json: 