In [13]:
import os
import glob
import subprocess
from pathlib import Path


In [14]:
from google.colab import drive
drive.mount('/content/drive')
!mkdir -p "/content/drive/MyDrive/Colab Projects"
%cd "/content/drive/MyDrive/Colab Projects"

!git clone https://github.com/sudden-deaf/AlphaSteer.git

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Projects
fatal: destination path 'AlphaSteer' already exists and is not an empty directory.


In [15]:
# Work from src/ so that the project modules imported in the next cell
# (extract_embeddings, calc_steering_matrix) can be imported by bare name --
# presumably they are resolved from the current working directory.
%cd "/content/drive/MyDrive/Colab Projects/AlphaSteer/src"

/content/drive/MyDrive/Colab Projects/AlphaSteer/src


In [16]:
from extract_embeddings import extract_embeddings_main
from calc_steering_matrix import calc_steering_main

In [24]:
# Return to the repository root: the data/ and config/ paths and the
# src/generate_response.py script used below are all written relative to it.
%cd "/content/drive/MyDrive/Colab Projects/AlphaSteer/"

/content/drive/MyDrive/Colab Projects/AlphaSteer


======================
Configuration
======================

In [25]:
# Model under study: the checkpoint identifier handed to the pipeline and the
# short label used when naming output artifacts.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
NICKNAME = "TinyLlama"

# Where the instruction JSON files are read from, and where their embeddings
# are written.
TRAIN_VAL_DIR = "data/instructions/train_val"
EMBEDDING_DIR = "data/embeddings/TinyLlama"

# Device string passed to the embedding-extraction and steering-matrix steps.
DEVICE = "cuda:0"

In [26]:
# Directory searched for the *.yaml configs of the response-generation step.
GENERATE_CONFIG_DIR = "config/TinyLlama"

# Destination file for the steering matrix; its name carries the model nickname.
STEERING_SAVE_PATH = f"data/steering_matrix/steering_matrix_{NICKNAME}.pt"

In [27]:
# Create every output location up front (no error if it is already there):
# the embeddings directory and the folder that will hold the steering matrix.
for out_dir in (EMBEDDING_DIR, os.path.dirname(STEERING_SAVE_PATH)):
    os.makedirs(out_dir, exist_ok=True)

======================
Extract embeddings
======================

In [28]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the processing order (and this printout) is the same on every run.
json_files = sorted(glob.glob(os.path.join(TRAIN_VAL_DIR, "*.json")))
print(json_files)

['data/instructions/train_val/benign_train.json', 'data/instructions/train_val/benign_val.json', 'data/instructions/train_val/borderline_val.json', 'data/instructions/train_val/coconot_original.json', 'data/instructions/train_val/coconot_pref.json', 'data/instructions/train_val/harmful_train_1000.json', 'data/instructions/train_val/harmful_val.json', 'data/instructions/train_val/jailbreak_train.json']


In [29]:
import gc

import torch

# Batch size handed to extract_embeddings_main; lower it if a single file
# still runs out of GPU memory.
BATCH_SIZE = 16

# Reuse embeddings that are already on disk so that a re-run after a crash
# resumes where it stopped instead of recomputing every file. Set to False
# after changing the model or the instruction files.
# NOTE(review): assumes an output file only exists once extraction of its
# input has finished -- confirm in extract_embeddings_main.
SKIP_EXISTING = True

for file_path in json_files:
    filename = Path(file_path).stem
    output_file = os.path.join(EMBEDDING_DIR, f"embeds_{filename}.pt")

    if SKIP_EXISTING and os.path.exists(output_file):
        print(f"Skipping {file_path}: {output_file} already exists")
        continue

    print(f"Extracting embeddings for {file_path}")

    # Match bash logic: coconot files keep their text in the "prompt" column,
    # every other file in "query".
    prompt_column = "prompt" if "coconot" in filename else "query"

    try:
        extract_embeddings_main(
            model_name=MODEL_NAME,
            input_file=file_path,
            prompt_column=prompt_column,
            output_file=output_file,
            batch_size=BATCH_SIZE,
            device=DEVICE,
        )
    finally:
        # GPU memory was piling up from one file to the next until CUDA ran
        # out of memory. Drop unreachable Python objects, then hand PyTorch's
        # cached blocks back to the driver before the next file starts.
        # NOTE(review): this only helps if extract_embeddings_main keeps no
        # lasting reference to its model/tensors -- confirm.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

Extracting embeddings for data/instructions/train_val/benign_train.json


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

100%|██████████| 625/625 [00:55<00:00, 11.19it/s]


Extracting embeddings for data/instructions/train_val/benign_val.json


100%|██████████| 63/63 [00:05<00:00, 10.70it/s]


Extracting embeddings for data/instructions/train_val/borderline_val.json


100%|██████████| 19/19 [00:02<00:00,  8.48it/s]


Extracting embeddings for data/instructions/train_val/coconot_original.json


100%|██████████| 718/718 [02:12<00:00,  5.43it/s]


Extracting embeddings for data/instructions/train_val/coconot_pref.json


100%|██████████| 58/58 [00:04<00:00, 11.83it/s]


Extracting embeddings for data/instructions/train_val/harmful_train_1000.json


100%|██████████| 63/63 [00:07<00:00,  8.66it/s]


Extracting embeddings for data/instructions/train_val/harmful_val.json


 25%|██▌       | 16/63 [00:04<00:12,  3.73it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 114.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 28.12 MiB is free. Process 3521 has 14.71 GiB memory in use. Of the allocated memory 13.61 GiB is allocated by PyTorch, and 998.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

======================
Calculate steering matrix
======================

In [22]:
print(f"Calculating steering matrix for {NICKNAME}")

# STEERING_SAVE_PATH is taken from the configuration cell. Do not rebuild it
# here with shell-style "${NICKNAME}": Python does not expand that inside a
# plain string, so the literal text "${NICKNAME}" would end up in the file name.
# NOTE(review): the recorded `NameError: name 'args' is not defined` cannot
# originate in this cell (it never references `args`); presumably it is raised
# inside calc_steering_main -- check the calc_steering_matrix module.
calc_steering_main(
    embedding_dir=EMBEDDING_DIR,
    model_name=MODEL_NAME,
    save_path=STEERING_SAVE_PATH,
    device=DEVICE
)

Calculating steering matrix for TinyLlama


NameError: name 'args' is not defined

======================
Generate responses
======================

In [None]:
import sys

print(f"Generating responses for {NICKNAME}")

# Sort the configs so they are processed in the same order on every run
# (glob itself returns matches in arbitrary order).
yaml_files = sorted(glob.glob(os.path.join(GENERATE_CONFIG_DIR, "*.yaml")))
if not yaml_files:
    print(f"No .yaml configs found in {GENERATE_CONFIG_DIR}")

for yaml_path in yaml_files:
    print(f"Generating response for {yaml_path}")

    # sys.executable launches the script with the interpreter running this
    # kernel, so it sees the same installed packages; a bare "python" is
    # looked up on PATH and may be a different interpreter.
    # check=True stops the loop as soon as one config fails.
    subprocess.run(
        [
            sys.executable,
            "src/generate_response.py",
            "--config_path", yaml_path,
        ],
        check=True,
    )