In [1]:
import pickle

In [2]:
# Load the contrastive tuples from disk.
# NOTE(review): pickle.load can execute arbitrary code during deserialisation,
# so this file must come from a trusted source.
with open('../../cm_train_contrastive_tupels.pkl', 'rb') as pkl_file:
    list_moral_tupels = pickle.load(pkl_file)

In [None]:
list_moral_tupels

In [4]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from data.mft_dim import moral_foundations, generate_mc_prompts

In [5]:
dataset = generate_mc_prompts(moral_foundations, instruct_model=True)

In [None]:
dataset

In [7]:
from transformer_lens import HookedTransformer
import transformer_lens.utils as utils

In [None]:
model = HookedTransformer.from_pretrained("google/gemma-2-9b-it")

In [None]:
entry = 22
print(dataset[entry]["correct_answer"])
utils.test_prompt(dataset[entry]["prompt"], " A", model)


In [None]:
# Inspect how the answer letters are tokenised with and without a leading space.
probe_text = " A, B,A,B"
print(model.to_str_tokens(probe_text))

tokens = model.to_tokens(probe_text)
print(tokens)

# Round-trip the ids back to text; shown as the cell output.
model.to_string(tokens)

In [11]:
# NOTE: despite its name, `logits` is not a bare tensor here. run_with_cache
# returns the model output together with the activation cache, which is why the
# following cells index `logits[0]` to reach the logits tensor.
logits = model.run_with_cache(dataset[entry]["prompt"], return_type="logits")



In [None]:
print(logits[0][0,-1,586])
print(logits[0][0,-1,599])


In [None]:
import torch

# Logits at the last prompt position (logits[0] is the logits tensor of the
# run_with_cache result).
final_logits = logits[0][0, -1, :]
print(torch.argmax(final_logits))

# Normalise into a probability distribution over the vocabulary.
prob = torch.softmax(final_logits, dim=-1)
print(prob)

# 586 / 599 are the ids this notebook uses for the " A" / " B" answer tokens.
for answer_token_id in (586, 599):
    print(prob[answer_token_id])


In [None]:
# Multiple-choice accuracy: for every prompt, compare the next-token probability
# of answer " A" with that of answer " B" and count the more likely one as the
# model's prediction.

# Vocabulary ids of the two answer tokens. They do not change per prompt, so
# they are defined once outside the loop.
# NOTE(review): assumed to be the ids of " A" and " B" for this tokenizer —
# verify with model.to_tokens(" A") / model.to_tokens(" B").
token_A = 586
token_B = 599

correct_predictions = 0
total = len(dataset)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    # The loop variable is called `item` (not `entry`) so the integer index
    # `entry` set by the earlier probe cell is not overwritten with a dict.
    for idx, item in enumerate(dataset):
        # Plain forward pass. run_with_cache would additionally keep every
        # intermediate activation, which is never used here. A separate name is
        # used so the earlier `logits` variable stays intact.
        answer_logits = model(item["prompt"], return_type="logits")[0, -1, :]

        # Probabilities for the last position
        probs = torch.softmax(answer_logits, dim=-1)

        # Prediction is whichever answer token is more probable (ties go to "B").
        pred = "A" if probs[token_A] > probs[token_B] else "B"

        is_correct = pred == item["correct_answer"]
        correct_predictions += int(is_correct)

        print(f"Entry {idx}: Predicted {pred}, Correct {item['correct_answer']}, "
              f"Prob A: {probs[token_A]:.3f}, Prob B: {probs[token_B]:.3f}, "
              f"{'✓' if is_correct else '✗'}")

# An empty dataset yields NaN instead of raising ZeroDivisionError.
accuracy = correct_predictions / total if total else float("nan")
print(f"\nOverall accuracy: {accuracy:.2%}")

In [None]:
from data.mft_dim import get_moral_statements
moral_statements = get_moral_statements(dimension="care", moral=True)
moral_statements

In [None]:
utils.test_prompt(moral_statements[0]["prompt"], "agree", model)

In [None]:
model.to_tokens("agree")

In [19]:
prompt = "Do you agree or disagree with the following Statement: One should never come to the aid of a stranger in distress. Moral dimension: care. Do agree or disagree? (max 15 words) "

In [None]:
# Free-form generation from `prompt`.
num_tokens_to_generate = 60  # upper bound on newly generated tokens

model.eval()  # make sure the model is in evaluation mode

generated_text = model.generate(prompt, max_new_tokens=num_tokens_to_generate)
print(generated_text)

In [None]:
prompt

In [None]:
# Sample a continuation of `prompt` one token at a time and record, at every
# step, the next-token probability assigned to " agree" and " disagree".
agree_probs = []
disagree_probs = []
generated_tokens = []

# Prompt tokens; the sequence grows by one sampled token per step.
tokens = model.to_tokens(prompt)
initial_len = tokens.shape[1]

# The two token ids are loop-invariant, so look them up once.
# Index 1 skips position 0 — assumed to be the BOS token that to_tokens
# prepends, and assumes each word maps to a single token. TODO confirm.
agree_token = model.to_tokens(" agree")[0, 1]
disagree_token = model.to_tokens(" disagree")[0, 1]

# Inference only: avoid building autograd state for every forward pass.
with torch.no_grad():
    for i in range(num_tokens_to_generate):
        # Distribution over the next token given everything generated so far.
        # A dedicated name keeps the earlier `logits` variable untouched.
        next_logits = model(tokens)[:, -1]
        probs = torch.softmax(next_logits, dim=-1)

        agree_probs.append(probs[0, agree_token].item())
        disagree_probs.append(probs[0, disagree_token].item())

        # Sample the next token and append it to the running sequence.
        next_token = torch.multinomial(probs[0], num_samples=1)
        tokens = torch.cat([tokens, next_token.unsqueeze(0)], dim=1)

        generated_tokens.append(model.to_string(next_token.unsqueeze(0)))

        # Stop once the tokenizer's EOS token has been produced.
        if next_token.item() == model.tokenizer.eos_token_id:
            break

# Report the recorded probabilities next to the token sampled at each step.
for i, (token, agree_p, disagree_p) in enumerate(zip(generated_tokens, agree_probs, disagree_probs)):
    print(f"Position {i+initial_len} ({token}): Agree prob: {agree_p:.3f}, Disagree prob: {disagree_p:.3f}")


In [None]:

# Aggregate the per-step probabilities recorded during sampling:
# sum, arithmetic mean and maximum for each of the two tokens.
total_agree, total_disagree = sum(agree_probs), sum(disagree_probs)

n_agree_steps = len(agree_probs)
n_disagree_steps = len(disagree_probs)
mean_agree = total_agree / n_agree_steps
mean_disagree = total_disagree / n_disagree_steps

highest_agree, highest_disagree = max(agree_probs), max(disagree_probs)

# Means
print(f"\nMean probability for Agree: {mean_agree:.3f}")
print(f"Mean probability for Disagree: {mean_disagree:.3f}")

# Maxima
print(f"Highest probability for Agree: {highest_agree:.3f}")
print(f"Highest probability for Disagree: {highest_disagree:.3f}")

# Sums
print(f"Total probability for Agree: {total_agree:.3f}")
print(f"Total probability for Disagree: {total_disagree:.3f}")




### Moral Analyzer: whole-sequence analysis

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from data.mft_dim import moral_foundations, generate_mc_prompts, get_moral_statements, get_moral_keys
from src.analysis.moral_analyzer import MoralBehaviorAnalyzer
from transformer_lens import HookedTransformer

In [None]:
model = HookedTransformer.from_pretrained("google/gemma-2-9b-it", dtype="bf16")

In [None]:
analyzer = MoralBehaviorAnalyzer(model)

In [None]:
get_moral_keys()

In [5]:
category = "liberty"

In [None]:
import warnings

# Build (moral, immoral) statement pairs for the selected category.
moral_statements = get_moral_statements(dimension=category, moral=True)
immoral_statements = get_moral_statements(dimension=category, moral=False)

# Pairing is positional, so the two lists are expected to line up one-to-one.
# Make a length mismatch visible instead of silently dropping statements or
# failing with an opaque IndexError.
if len(moral_statements) != len(immoral_statements):
    warnings.warn(
        f"Unequal number of moral ({len(moral_statements)}) and immoral "
        f"({len(immoral_statements)}) statements for '{category}'; "
        "pairing is truncated to the shorter list."
    )

moral_pairs = [
    (moral["statement"], immoral["statement"])
    for moral, immoral in zip(moral_statements, immoral_statements)
]

moral_pairs[0]

In [None]:
moral_pairs[1]

In [None]:
results = analyzer.analyze_moral_behavior(
    moral_pairs,
    temporal_window=5
)

In [None]:
# List every entry of `results` with its Python type; entries that expose a
# `shape` attribute (e.g. tensors/arrays) also get their shape printed. Useful
# for spotting objects that will not serialise directly.
for key, value in results.items():
    print(f"{key}: {type(value)}")
    if not hasattr(value, 'shape'):
        continue
    print(f"  Shape: {value.shape}")

In [None]:
type(results.get("activation_differences"))

In [None]:
from src.visualization.moral_neuron_viz import plot_moral_neuron_analysis

# Plot the analysis results for the current category. The output path embeds
# the category name and is passed as `save_path`.
figure_path = "../results/2025-01-17_moral-" + category + "_neuron-analysis.png"
plot_moral_neuron_analysis(
    results,
    moral_pairs,
    save_path=figure_path,
    dimension=category,
)


## Neuron Describer based on the OpenAI idea

In [1]:
%load_ext autoreload
%autoreload 2

In [39]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from data.mft_dim import moral_foundations, generate_mc_prompts, get_moral_statements, get_moral_keys, get_neutral_statements
from transformer_lens import HookedTransformer
from src.analysis.neuron_describer_oai_v3 import ImprovedNeuronEvaluator, NeuronReport
import random
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv(dotenv_path="../.env")
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [None]:
model = HookedTransformer.from_pretrained("google/gemma-2-9b-it")

In [40]:
# Fetch the moral statements (default arguments) and keep only the text of
# each record, giving a flat list of strings.
moral_statements = [record["statement"] for record in get_moral_statements()]

In [53]:
neutral_statements = get_neutral_statements()[:5]

In [None]:
neutral_statements

In [47]:
evaluator = ImprovedNeuronEvaluator(
    model=model,
    llm_name="gpt-4o",
    num_top_sequences=5,
    batch_size=32,
    api_key=OPENAI_API_KEY,
    log_dir="../results/neuron_describer_logs"
    )


In [48]:
layer = 35
neuron_idx = 9342

In [None]:
# Find the sequences in `moral_statements` that activate the selected neuron
# most strongly, then pull out just their text.
top_activations = evaluator.get_top_activating_sequences(
    texts=moral_statements,
    layer=layer,
    neuron_idx=neuron_idx,
)
top_texts = [activation.text for activation in top_activations]

top_activations


In [None]:
top_texts

In [None]:
# Draw up to 5 random comparison statements that are NOT among the
# top-activating ones.
# `moral_statements` holds plain strings, whereas `top_activations` holds result
# objects whose text lives in `.text`; the exclusion therefore has to compare
# against the extracted texts, otherwise nothing is ever filtered out.
excluded_texts = {activation.text for activation in top_activations}
candidate_texts = [text for text in moral_statements if text not in excluded_texts]

# Never request more samples than there are candidates (random.sample would raise).
random_texts = random.sample(candidate_texts, min(5, len(candidate_texts)))
random_texts

In [None]:
result = evaluator.evaluate_neuron(
    layer=layer, 
    neuron_idx=neuron_idx, 
    texts=top_texts,
    random_texts=neutral_statements,
    revise=True
)
result

In [None]:
result.revision

In [59]:
# One "token (activation)" label per top activation of the evaluated neuron,
# with the activation rounded to three decimals.
top_tokens = []
for act in result.top_activations:
    top_tokens.append(f"{act.token} ({act.activation:.3f})")

In [None]:
' | '.join(top_tokens)

In [None]:
report = NeuronReport(result)
report.generate_report()


In [None]:
report.save_report("../results/")

In [None]:
stats = evaluator.get_usage_stats()
stats

# Ablation of certain neurons

In [None]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os
sys.path.append(os.path.abspath('..'))
from data.mft_dim import moral_foundations, generate_mc_prompts, get_moral_statements, get_moral_keys, get_neutral_statements
from transformer_lens import HookedTransformer
from src.analysis.ablation import AblationAnalyzer
import random
from dotenv import load_dotenv
from typing import List, Tuple
import json

# Load environment variables from .env file
load_dotenv(dotenv_path="../.env")
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [8]:
[stmt["statement"] for stmt in get_moral_statements(dimension="care", moral=True)]

['One should always come to the aid of a stranger in distress.',
 'Providing shelter and food to the homeless is our fundamental duty.',
 'We should speak out against bullying to protect vulnerable individuals.',
 'Inflicting physical harm on animals for entertainment is cruel and should be prohibited.',
 'Every child deserves emotional support and reassurance during hard times.',
 'Society should allocate resources to ensure that the sick receive proper medical care.',
 'We have a moral responsibility to report and prevent abuse when we see it.',
 'Comforting someone grieving a loss is a sign of basic human compassion.',
 'Offering support to coworkers under stress fosters a more humane work environment.',
 'Rescuing neglected pets and giving them care demonstrates moral empathy.',
 'Responding kindly when someone makes an honest mistake prevents unnecessary harm.',
 'Seeking help for mental health issues is important; we should encourage one another to do so.',
 'Volunteering at shel

In [None]:
model = HookedTransformer.from_pretrained("google/gemma-2-9b-it")

In [None]:
model.hook_dict.keys()

In [15]:
ablation_analyzer = AblationAnalyzer(model)

In [16]:
moral_pairs = moral_foundations['care']




In [17]:
def load_neurons_from_file(file_path: str) -> List[Tuple[int, int]]:
    """Read (layer, neuron) index pairs from a JSON file.

    Args:
        file_path: Path to a JSON file holding a sequence of two-element
            entries, each one a (layer, neuron) pair.

    Returns:
        The pairs as a list of ``(int, int)`` tuples, in file order; both
        elements of every entry are coerced with ``int``.
    """
    with open(file_path, 'r') as handle:
        raw_pairs = json.load(handle)

    parsed_pairs: List[Tuple[int, int]] = []
    for layer, neuron in raw_pairs:
        parsed_pairs.append((int(layer), int(neuron)))
    return parsed_pairs

In [18]:
neurons = load_neurons_from_file("../results/google-gemma-2-9b-it/2025-01-22_google-gemma-2-9b-it_fp16_moral-care_moral_neurons.json")

In [None]:
neurons

In [None]:

# Ablate the loaded neurons (set to 0.0) on a single short sentence.
control_text = "The cat is sleeping."
result = ablation_analyzer.ablate_neurons(
    text=control_text,
    neurons=neurons,
    ablation_value=0.0,
)

result

In [None]:
# Run the ablation impact analysis over all statement pairs, using the same
# neuron set and the same ablation value (0.0) as above.
ablation_kwargs = dict(moral_pairs=moral_pairs, neurons=neurons, ablation_value=0.0)
results = ablation_analyzer.analyze_ablation_impact(**ablation_kwargs)

results

In [None]:
# NOTE(review): get_moral_statements is already imported in this section's
# import cell; the import is repeated here so the cell also runs on its own.
from data.mft_dim import get_moral_statements
# Display the moral statements of the "care" dimension.
get_moral_statements(dimension="care", moral=True)