In [1]:
import sys
sys.path.append('../..')  # Add parent directory to path
import os
import pandas as pd
import torch as t
from transformers import Qwen2ForCausalLM
from hf import HF
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub identifier of the checkpoint to load.
model_name = "Qwen/Qwen2.5-14B-Instruct"

# Declare the static type up front so editors can resolve attributes on the
# model; a bare annotation has no runtime effect on the value bound below.
base_model: Qwen2ForCausalLM
base_model, base_tokenizer = HF.load_model(model_name)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 8/8 [00:04<00:00,  1.78it/s]


In [3]:
# Print the module tree of every decoder layer, prefixed with its index,
# to inspect the projection and MLP shapes.
for i, layer in enumerate(base_model.model.layers):
    print(f"Layer {i}: {layer}")

Layer 0: Qwen2DecoderLayer(
  (self_attn): Qwen2Attention(
    (q_proj): Linear(in_features=5120, out_features=5120, bias=True)
    (k_proj): Linear(in_features=5120, out_features=1024, bias=True)
    (v_proj): Linear(in_features=5120, out_features=1024, bias=True)
    (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
  )
  (mlp): Qwen2MLP(
    (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
    (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
    (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): Qwen2RMSNorm((5120,), eps=1e-06)
  (post_attention_layernorm): Qwen2RMSNorm((5120,), eps=1e-06)
)
Layer 1: Qwen2DecoderLayer(
  (self_attn): Qwen2Attention(
    (q_proj): Linear(in_features=5120, out_features=5120, bias=True)
    (k_proj): Linear(in_features=5120, out_features=1024, bias=True)
    (v_proj): Linear(in_features=5120, out_features=1024, bias=True)
    (o_proj

In [None]:
# Run a small lm-eval benchmark on the loaded base model and push the
# results to Weights & Biases.
from lm_eval.loggers import WandbLogger
import wandb
from lm_eval import evaluator, models

# Evaluation settings gathered in one place so they are easy to tune.
EVAL_TASKS = [
    "arc_easy",  # Simpler reasoning, smaller dataset
    "lambada",   # Good test for coherence
]
EVAL_LIMIT = 300  # Max examples per task; keeps the run short
WANDB_PROJECT = "misalignment-safety_layers"

# Wrap the already-loaded model object for use by lm-eval.
lm_eval_model = models.huggingface.HFLM(pretrained=base_model)

# Evaluate coherence with minimal data
results = evaluator.simple_evaluate(
    model=lm_eval_model,
    tasks=EVAL_TASKS,
    limit=EVAL_LIMIT,
    confirm_run_unsafe_code=True,
)

# Show only the per-task metrics; the full dict also carries per-sample
# records, which would flood the cell output.
print(results["results"])

wandb_logger = WandbLogger(project=WANDB_PROJECT, job_type="eval")
wandb_logger.post_init(results)
wandb_logger.log_eval_result()
# Per-sample records exist only when the evaluation ran with sample logging.
if "samples" in results:
    wandb_logger.log_eval_samples(results["samples"])

# Closing the run is best-effort: report a failure instead of hiding it, and
# let KeyboardInterrupt / SystemExit propagate.
try:
    wandb.finish()
except Exception as exc:
    print(f"wandb.finish() failed: {exc!r}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malialh[0m ([33malialh-arena[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
arc_easy/acc,▁
arc_easy/acc_norm,▁
arc_easy/acc_norm_stderr,▁
arc_easy/acc_stderr,▁
lambada_openai/acc,▁
lambada_openai/acc_stderr,▁
lambada_openai/perplexity,▁
lambada_openai/perplexity_stderr,▁
lambada_standard/acc,▁
lambada_standard/acc_stderr,▁

0,1
arc_easy/acc,0.85333
arc_easy/acc_norm,0.81
arc_easy/acc_norm_stderr,0.02269
arc_easy/acc_stderr,0.02046
arc_easy/alias,arc_easy
lambada_openai/acc,0.74
lambada_openai/acc_stderr,0.02537
lambada_openai/alias,lambada_openai
lambada_openai/perplexity,3.04547
lambada_openai/perplexity_stderr,0.26257
