In [1]:
import os
import glob
import subprocess
from pathlib import Path


In [2]:
from google.colab import drive
drive.mount('/content/drive')
!mkdir -p "/content/drive/MyDrive/Colab Projects"
%cd "/content/drive/MyDrive/Colab Projects"

!git clone https://github.com/sudden-deaf/AlphaSteer.git

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Projects
fatal: destination path 'AlphaSteer' already exists and is not an empty directory.


In [3]:
# Enter the checkout's src/ directory and update the existing clone from its remote.
%cd "/content/drive/MyDrive/Colab Projects/AlphaSteer/src"
!git pull

/content/drive/MyDrive/Colab Projects/AlphaSteer/src
Already up to date.


In [4]:
# Project-local entry points for the two pipeline stages run below.
# NOTE(review): these imports presumably resolve only because the previous
# cell changed the working directory to the repo's src/ folder — confirm, or
# add src/ to sys.path explicitly so the cell does not depend on the cwd.
from extract_embeddings import extract_embeddings_main
from calc_steering_matrix import calc_steering_main

In [5]:
# Switch to the repository root: the relative paths used in the following
# cells (data/..., config/..., src/generate_response.py) are resolved from here.
%cd "/content/drive/MyDrive/Colab Projects/AlphaSteer/"

/content/drive/MyDrive/Colab Projects/AlphaSteer


======================
Configuration
======================

In [6]:
# Short label for the model; reused below to build per-model paths so that
# switching models only requires editing NICKNAME and MODEL_NAME.
NICKNAME = "TinyLlama"
# Model identifier forwarded to the extraction and steering functions.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Device string forwarded to both pipeline stages.
DEVICE = "cuda:0"

# Directory scanned for the instruction *.json files.
TRAIN_VAL_DIR = "data/instructions/train_val"
# Per-model directory that receives the embeds_*.pt files.
EMBEDDING_DIR = f"data/embeddings/{NICKNAME}"

In [7]:
# File the computed steering matrix is saved to (one file per model nickname).
STEERING_SAVE_PATH = f"data/steering_matrix/steering_matrix_{NICKNAME}.pt"
# Directory searched for the *.yaml generation configs; derived from NICKNAME
# (same value as before) so it stays in step with the other per-model paths.
GENERATE_CONFIG_DIR = f"config/{NICKNAME}"

In [8]:
# Create the embedding directory and the steering matrix's parent directory
# up front; both calls are no-ops when the directories already exist.
Path(EMBEDDING_DIR).mkdir(parents=True, exist_ok=True)
Path(STEERING_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)

======================
Extract embeddings
======================

In [9]:
# Collect every instruction file to embed. glob returns matches in arbitrary
# order, so sort them to process files in a reproducible sequence.
json_files = sorted(glob.glob(os.path.join(TRAIN_VAL_DIR, "*.json")))

# Fail fast: with no inputs the extraction loop would silently do nothing and
# the problem would only surface later, in the steering-matrix step.
if not json_files:
    raise FileNotFoundError(f"No *.json instruction files found in {TRAIN_VAL_DIR!r}")

print(json_files)

['data/instructions/train_val/benign_train.json', 'data/instructions/train_val/benign_val.json', 'data/instructions/train_val/borderline_val.json', 'data/instructions/train_val/coconot_original.json', 'data/instructions/train_val/coconot_pref.json', 'data/instructions/train_val/harmful_train_1000.json', 'data/instructions/train_val/harmful_val.json', 'data/instructions/train_val/jailbreak_train.json']


In [10]:
# Log in to Hugging Face with the token stored in Colab's secret manager.
from huggingface_hub import login
from google.colab import userdata

# userdata.get raises when the secret does not exist or this notebook has not
# been granted access to it, so without this guard the "Token is not set"
# branch below could never run for a missing token.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    HF_TOKEN = None

if HF_TOKEN:
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    print("Token is not set. Please save the token first.")

Successfully logged in to Hugging Face!


In [11]:
# Run embedding extraction once per instruction file, writing
# embeds_<file stem>.pt into EMBEDDING_DIR.
for file_path in json_files:
    filename = Path(file_path).stem
    print(f"Extracting embeddings for {file_path}")

    # Column choice mirrors the bash version of this pipeline: "coconot"
    # files use the "prompt" column, every other file uses "query".
    prompt_column = "prompt" if "coconot" in filename else "query"
    output_file = os.path.join(EMBEDDING_DIR, f"embeds_{filename}.pt")

    extraction_kwargs = dict(
        model_name=MODEL_NAME,
        input_file=file_path,
        prompt_column=prompt_column,
        output_file=output_file,
        batch_size=16,
        device=DEVICE,
    )
    extract_embeddings_main(**extraction_kwargs)

Extracting embeddings for data/instructions/train_val/benign_train.json
Extracting embeddings for data/instructions/train_val/benign_val.json
Extracting embeddings for data/instructions/train_val/borderline_val.json
Extracting embeddings for data/instructions/train_val/coconot_original.json
Extracting embeddings for data/instructions/train_val/coconot_pref.json
Extracting embeddings for data/instructions/train_val/harmful_train_1000.json
Extracting embeddings for data/instructions/train_val/harmful_val.json
Extracting embeddings for data/instructions/train_val/jailbreak_train.json


======================
Calculate steering matrix
======================

In [12]:
# Compute the steering matrix from the embeddings extracted above and save it.
print(f"Calculating steering matrix for {NICKNAME}")

# Build the save path with a real f-string. The previous literal used
# bash-style "${NICKNAME}", which Python does not interpolate, so the matrix
# would have been written to a file literally named
# "steering_matrix_${NICKNAME}.pt".
STEERING_SAVE_PATH = f"data/steering_matrix/steering_matrix_{NICKNAME}.pt"

# NOTE(review): the recorded run of this cell raised KeyError on MODEL_NAME
# from inside calc_steering_main. Presumably that function looks the model
# name up in a table with no entry for this model — confirm in
# src/calc_steering_matrix.py; it cannot be fixed from this notebook.
calc_steering_main(
    embedding_dir=EMBEDDING_DIR,
    model_name=MODEL_NAME,
    save_path=STEERING_SAVE_PATH,
    device=DEVICE,
)

Calculating steering matrix for TinyLlama


KeyError: 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

# ======================
# Generate responses
# ======================

In [None]:
# Launch src/generate_response.py once for every YAML config found in
# GENERATE_CONFIG_DIR; check=True stops the loop on the first failing run.
print(f"Generating responses for {NICKNAME}")
yaml_files = glob.glob(os.path.join(GENERATE_CONFIG_DIR, "*.yaml"))
for yaml_path in yaml_files:
    print(f"Generating response for {yaml_path}")

    command = ["python", "src/generate_response.py", "--config_path", yaml_path]
    subprocess.run(command, check=True)