# Feature Coherence and Vector Steering Analysis

This notebook analyzes the relationship between feature coherence scores and vector steering outcomes for both baseline and fine-tuned GPT-2 models.

In [5]:
import os
import json
import torch
import torch as t
import numpy as np
import pandas as pd
from datetime import datetime
from torch import cosine_similarity
import textwrap
from tqdm import tqdm
from typing import Dict, List, Tuple
from scipy.spatial.distance import cosine
from scipy import stats
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import gensim.downloader as api
from transformer_lens import HookedTransformer
from sae_lens import SAE
from sae_lens.analysis.feature_statistics import get_W_U_W_dec_stats_df
from safetensors import safe_open

In [4]:
device = (
    "cuda"
    if t.cuda.is_available()
    else "mps"
    if t.backends.mps.is_available()
    else "cpu"
)

print("Using device:", device)

Using device: mps


## Load Models and Data

Load the baseline model and pretrained SAE:

In [6]:
model = HookedTransformer.from_pretrained("gpt2-small").to(device)
sae, cfg_dict, sparsity = SAE.from_pretrained(release="gpt2-small-res-jb", sae_id="blocks.8.hook_resid_pre")

Loaded pretrained model gpt2-small into HookedTransformer
Moving model to device:  mps


Load the fine-tuned model and custom SAE:

In [7]:
# Load the fine-tuned model
path_to_custom_transformer_dict = '../models/fine-tuned/fine_tuned_gpt2/model.safetensors'

# Initialize the HookedTransformer with the same architecture as your fine-tuned model
custom_model = HookedTransformer.from_pretrained("gpt2")  # base model

# Load the state dict from the .safetensors file
with safe_open(path_to_custom_transformer_dict, framework="pt", device=device) as f:
    state_dict = {key: f.get_tensor(key) for key in f.keys()}

# Load the state dict into the model
custom_model.load_state_dict(state_dict, strict=False)
custom_model = custom_model.to(device)

# Load the custom SAE
path_to_custom_sae_dict = '../models/sae/gpt2-small-fine-tuned-layer-8'
custom_sae = SAE.load_from_pretrained(path_to_custom_sae_dict, device=device)

Loaded pretrained model gpt2 into HookedTransformer
Moving model to device:  mps


In [8]:
# Load BERT for embeddings
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

# Load word embeddings
print("Loading GloVe embeddings...")
word_embeddings = api.load("glove-wiki-gigaword-100")
print("GloVe embeddings loaded.")

# Load parsed features
path_to_parsed_features = '../features/parsed_features.json'
with open(path_to_parsed_features, 'r') as f:
    data = json.load(f)

Loading GloVe embeddings...
GloVe embeddings loaded.


## Define Helper Functions

In [9]:
def dict_to_df(data_dict):
    rows = []
    for feature, tokens in data_dict.items():
        for token, activation in tokens.items():
            rows.append({
                'feature': int(feature),
                'token': token,
                'activation': activation
            })
    return pd.DataFrame(rows)

def get_top_k_words(feature_activations: torch.Tensor, words: List[str], k: int = 10) -> List[Tuple[str, float]]:
    if feature_activations.numel() == 0:
        return []
    k = min(k, feature_activations.numel())
    top_k_values, top_k_indices = torch.topk(feature_activations, k)
    top_k_words = [words[i] for i in top_k_indices.tolist()]
    top_k_activations = top_k_values.tolist()
    return list(zip(top_k_words, top_k_activations))

def semantic_coherence_score(activated_tokens, activation_scores, word_embeddings):
    tokens_and_scores = [(t.lower(), score) for t, score in zip(activated_tokens, activation_scores) if t.lower().isalpha()]
    token_embeddings = []
    weights = []
    for token, score in tokens_and_scores:
        if token in word_embeddings:
            token_embeddings.append(word_embeddings[token])
            weights.append(score)
    similarities = []
    total_weight = 0
    for i in range(len(token_embeddings)):
        for j in range(i+1, len(token_embeddings)):
            sim = 1 - cosine(token_embeddings[i], token_embeddings[j])
            weight = weights[i] * weights[j]
            similarities.append(sim * weight)
            total_weight += weight
    return np.sum(similarities) / total_weight if total_weight > 0 else 0

def get_feature_summaries(model, sae, word_embeddings):
    W_dec = sae.W_dec.detach().cpu()
    W_U_stats_df_dec, dec_projection_onto_W_U = get_W_U_W_dec_stats_df(W_dec, model, cosine_sim=False)
    number_of_features = dec_projection_onto_W_U.shape[0]
    vocab = model.tokenizer.get_vocab()
    words = sorted(vocab.keys(), key=lambda x: vocab[x])
    feature_summaries = {}
    for i in tqdm(range(number_of_features), desc="Processing features"):
        feature_activations = dec_projection_onto_W_U[i]
        top_activated_words = get_top_k_words(feature_activations, words)
        activated_tokens, activation_scores = zip(*top_activated_words)
        coherence_score = semantic_coherence_score(activated_tokens, activation_scores, word_embeddings)
        feature_summary = {
            "feature_idx": i,
            "top_activated_words": top_activated_words,
            "activation_scores": activation_scores,
            "coherence_score": coherence_score
        }
        feature_summaries[i] = feature_summary
    return feature_summaries

def calculate_coherence_stats(feature_summaries, feature_ids=None):
    if feature_ids is None:
        coherence_scores = [summary['coherence_score'] for summary in feature_summaries.values()]
    else:
        coherence_scores = [feature_summaries[i]['coherence_score'] for i in feature_ids if i in feature_summaries]
    coherence_scores = np.array(coherence_scores)
    non_zero_scores = coherence_scores[coherence_scores > 0]
    stats = {
        "mean_all": np.mean(coherence_scores),
        "median_all": np.median(coherence_scores),
        "mean_non_zero": np.mean(non_zero_scores) if len(non_zero_scores) > 0 else 0,
        "median_non_zero": np.median(non_zero_scores) if len(non_zero_scores) > 0 else 0,
        "fraction_non_zero": len(non_zero_scores) / len(coherence_scores),
    }
    return stats

## Prepare Data

In [10]:
# Create DataFrames for baseline and finetuned data
baseline_df = dict_to_df(data['baseline'])
finetuned_df = dict_to_df(data['finetuned'])

baseline_feature_ids = baseline_df.feature.unique()
finetuned_feature_ids = finetuned_df.feature.unique()

# Get feature summaries
baseline_feature_summaries = get_feature_summaries(model, sae, word_embeddings)
finetuned_feature_summaries = get_feature_summaries(custom_model, custom_sae, word_embeddings)

# Calculate stats for both models
baseline_stats = calculate_coherence_stats(baseline_feature_summaries)
baseline_medical_stats = calculate_coherence_stats(baseline_feature_summaries, baseline_feature_ids)
finetuned_stats = calculate_coherence_stats(finetuned_feature_summaries)
finetuned_medical_stats = calculate_coherence_stats(finetuned_feature_summaries, finetuned_feature_ids)

print("Baseline stats:", baseline_stats)
print("Baseline medical stats:", baseline_medical_stats)
print("Finetuned stats:", finetuned_stats)
print("Finetuned medical stats:", finetuned_medical_stats)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing features: 100%|██████████| 24576/24576 [00:22<00:00, 1086.00it/s]
Processing features: 100%|██████████| 24576/24576 [00:23<00:00, 1066.55it/s]


Baseline stats: {'mean_all': 0.07189289731477737, 'median_all': 0.0, 'mean_non_zero': 0.20573497218683492, 'median_non_zero': 0.13993809668288004, 'fraction_non_zero': 0.3882242838541667}
Baseline medical stats: {'mean_all': 0.0643932246921882, 'median_all': 0.0, 'mean_non_zero': 0.19984950616088576, 'median_non_zero': 0.13421969825825245, 'fraction_non_zero': 0.3762057877813505}
Finetuned stats: {'mean_all': 0.07278746781679458, 'median_all': 0.024560158701740102, 'mean_non_zero': 0.14699870497157008, 'median_non_zero': 0.10255208669743718, 'fraction_non_zero': 0.570556640625}
Finetuned medical stats: {'mean_all': 0.0671001134174056, 'median_all': 0.0, 'mean_non_zero': 0.1583993165109143, 'median_non_zero': 0.09375375080888716, 'fraction_non_zero': 0.47123893805309736}


## Vector Steering Test

In [28]:
def create_steering_hook(steering_on, steering_vector, coeff):
    def steering_hook(resid_post, hook):
        if steering_on and resid_post.shape[1] > 1:
            steering_vector_device = resid_post.device
            resid_post[:, :-1, :] += coeff * steering_vector.to(steering_vector_device)
    return steering_hook

def hooked_generate(prompt_batch, model, fwd_hooks=[], seed=None, **kwargs):
    if seed is not None:
        torch.manual_seed(seed)
    with model.hooks(fwd_hooks=fwd_hooks):
        tokenized = model.to_tokens(prompt_batch)
        result = model.generate(
            stop_at_eos=False,
            input=tokenized,
            max_new_tokens=50,
            do_sample=True,
            verbose=False,
            **kwargs)
    return result

def run_generate(example_prompt, model, hook_point, steering_on, steering_vector):
    model.reset_hooks()
    steering_hook = create_steering_hook(steering_on, steering_vector, coeff)
    editing_hooks = [(hook_point, steering_hook)]
    res = hooked_generate([example_prompt], model, editing_hooks, seed=None, **sampling_kwargs)
    return model.to_string(res[:, 1:])[0]

def keyword_score(text, keywords):
    return sum(keyword.lower() in text.lower() for keyword in keywords)

def get_embedding(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Set up generation parameters
example_prompt = "What topic is on your mind?"
coeff = 50 
sampling_kwargs = dict(temperature=1.0, top_p=0.3, freq_penalty=1.0)

global steering_on
global steering_vector

def run_experiment(run_title, model, sae, feature_ids, bert_model, tokenizer, feature_summaries):
    
    # Create output directory
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(output_dir, f"{timestamp}_coeff_{coeff}_{run_title}")
    os.makedirs(run_dir, exist_ok=True)

    # Create cfg.json
    cfg = {
        "name": run_title,
        "start_time": timestamp,
        "coeff": coeff,
        "example_prompt": example_prompt,
        "sampling_kwargs": sampling_kwargs
    }
    with open(os.path.join(run_dir, "cfg.json"), "w") as f:
        json.dump(cfg, f, indent=4)

    def wrap_text(text, width=80):
        return "\n".join(textwrap.wrap(text, width=width))

    steered_wins = 0
    baseline_wins = 0
    steered_sims = []
    baseline_sims = []

    for feature_idx in tqdm(feature_ids, desc="Processing features"):
        hook_point = sae.cfg.hook_name
        steering_vector = sae.W_dec[feature_idx]
        
        steered_text = run_generate(example_prompt, model, hook_point, True, steering_vector)
        baseline_text = run_generate(example_prompt, model, hook_point, False, steering_vector)
        
        output = f"\n{'='*100}\n"
        output += f"FEATURE {feature_idx:>3}\n"
        output += f"{'='*100}\n\n"

        output += "1. Steered Output:\n"
        output += f"   {wrap_text(steered_text)}\n\n"

        output += "2. Baseline Output:\n"
        output += f"   {wrap_text(baseline_text)}\n\n"

        medical_keywords = [word for word, _ in finetuned_feature_summaries[feature_idx]['top_activated_words']]
        
        output += "3. Top Activating Tokens:\n"
        output += "   " + ", ".join(medical_keywords[:10]) + "\n"
        output += f"   (Total: {len(medical_keywords)} tokens)\n\n"

        reference_text = " ".join(medical_keywords)

        baseline_score = keyword_score(baseline_text, medical_keywords)
        steered_score = keyword_score(steered_text, medical_keywords)

        baseline_embedding = get_embedding(baseline_text, bert_model, tokenizer)
        steered_embedding = get_embedding(steered_text, bert_model, tokenizer)
        reference_embedding = get_embedding(reference_text, bert_model, tokenizer)

        baseline_similarity = cosine_similarity(baseline_embedding.unsqueeze(0), reference_embedding.unsqueeze(0)).item()
        steered_similarity = cosine_similarity(steered_embedding.unsqueeze(0), reference_embedding.unsqueeze(0)).item()

        baseline_sims.append(baseline_similarity)
        steered_sims.append(steered_similarity)

        output += "4. Scores:\n"
        output += f"   Keyword Scores      - Baseline: {baseline_score:>4}  |  Steered: {steered_score:>4}\n"
        output += f"   Semantic Similarity - Baseline: {baseline_similarity:.4f}  |  Steered: {steered_similarity:.4f}\n"
        output += '   ' + '-'*60 + '\n'
        
        if steered_similarity > baseline_similarity:
            output += "   RESULT: The steered output is more aligned with the feature.\n"
            steered_wins += 1
        elif steered_similarity == baseline_similarity:
            output += "   RESULT: No significant difference in alignment between outputs.\n"
        else:
            output += "   RESULT: The baseline output is more aligned with the feature.\n"
            baseline_wins += 1
        
        with open(os.path.join(run_dir, f"feature_{feature_idx}.txt"), "w") as f:
            f.write(output)

    print(f"Steered wins: {steered_wins}")
    print(f"Baseline wins: {baseline_wins}")

    # Generate final report
    final_report = {
        "total_features_analyzed": len(feature_ids),
        "steered_wins": steered_wins,
        "baseline_wins": baseline_wins,
        "ties": len(feature_ids) - steered_wins - baseline_wins,
        "average_steered_similarity": sum(steered_sims) / len(steered_sims),
        "average_baseline_similarity": sum(baseline_sims) / len(baseline_sims)
    }

    # Write final report to JSON file
    with open(os.path.join(run_dir, "final_report.json"), "w") as f:
        json.dump(final_report, f, indent=4)

    # Print final results to stdout
    print(json.dumps(final_report, indent=4))

    # Update cfg.json with end time and results
    cfg.update({
        "end_time": datetime.now().strftime("%Y%m%d_%H%M%S"),
        "steered_wins": steered_wins,
        "baseline_wins": baseline_wins,
        "ties": len(feature_ids) - steered_wins - baseline_wins,
        "avg_steered_similarity": sum(steered_sims) / len(steered_sims),
        "avg_baseline_similarity": sum(baseline_sims) / len(baseline_sims)
    })

    with open(os.path.join(run_dir, "cfg.json"), "w") as f:
        json.dump(cfg, f, indent=4)

    # Return the data needed for regression analysis
    return {
        'feature_ids': feature_ids,
        'steered_sims': steered_sims,
        'baseline_sims': baseline_sims,
        'feature_summaries': feature_summaries,
        'final_report': final_report
    }

In [29]:
# Run experiments
baseline_results = run_experiment("baseline", model, sae, baseline_feature_ids, bert_model, tokenizer, baseline_feature_summaries)


Processing features: 100%|██████████| 311/311 [21:18<00:00,  4.11s/it]

Steered wins: 158
Baseline wins: 153
{
    "total_features_analyzed": 311,
    "steered_wins": 158,
    "baseline_wins": 153,
    "ties": 0,
    "average_steered_similarity": 0.4558351784465397,
    "average_baseline_similarity": 0.45490066328616
}





In [None]:
finetuned_results = run_experiment("finetuned", custom_model, custom_sae, finetuned_feature_ids, bert_model, tokenizer, finetuned_feature_summaries)

## Regression Analysis

In [None]:
# Prepare data for regression
baseline_coherence, baseline_similarity = prepare_regression_data(
    baseline_results['feature_summaries'], 
    baseline_results['feature_ids'], 
    baseline_results['baseline_sims']
)

finetuned_coherence, finetuned_similarity = prepare_regression_data(
    finetuned_results['feature_summaries'], 
    finetuned_results['feature_ids'], 
    finetuned_results['steered_sims']
)

# Perform regression and plotting for baseline model
baseline_slope, baseline_intercept, baseline_r_squared, baseline_p_value = perform_regression(
    baseline_coherence, baseline_similarity)

plot_regression(baseline_coherence, baseline_similarity, 
                baseline_slope, baseline_intercept, baseline_r_squared, baseline_p_value,
                'Baseline Model: Feature Coherence vs Cosine Similarity')

print(f"Baseline Model Results:")
print(f"Slope: {baseline_slope:.4f}")
print(f"Intercept: {baseline_intercept:.4f}")
print(f"R-squared: {baseline_r_squared:.4f}")
print(f"p-value: {baseline_p_value:.4e}")

# Perform regression and plotting for fine-tuned model
finetuned_slope, finetuned_intercept, finetuned_r_squared, finetuned_p_value = perform_regression(
    finetuned_coherence, finetuned_similarity)

plot_regression(finetuned_coherence, finetuned_similarity, 
                finetuned_slope, finetuned_intercept, finetuned_r_squared, finetuned_p_value,
                'Fine-tuned Model: Feature Coherence vs Cosine Similarity')

print(f"\nFine-tuned Model Results:")
print(f"Slope: {finetuned_slope:.4f}")
print(f"Intercept: {finetuned_intercept:.4f}")
print(f"R-squared: {finetuned_r_squared:.4f}")
print(f"p-value: {finetuned_p_value:.4e}")

## Conclusion

This notebook has analyzed the relationship between feature coherence scores and vector steering outcomes for both baseline and fine-tuned GPT-2 models. The regression analysis provides insights into how feature coherence relates to the effectiveness of vector steering in each model.

Key points to consider in the interpretation:

1. Slope: Indicates the change in cosine similarity for a one-unit change in feature coherence.
2. R-squared: Measures the proportion of variance in cosine similarity explained by feature coherence.
3. p-value: Indicates the statistical significance of the relationship.

Compare the results between the baseline and fine-tuned models to determine if there's a stronger relationship between feature coherence and steering vector outcomes in one model versus the other.