In [1]:
# 自动重新加载修改后的模块
%load_ext autoreload
%autoreload 2

import os   
import gc
import torch
from IPython.display import clear_output
# 设置环境变量防止内存碎片
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
gc.collect() # 强制进行垃圾回收
torch.cuda.empty_cache() # 清空PyTorch的CUDA缓存
clear_output()  # 可选，清除输出避免混乱

# Hydra / OmegaConf pieces used to load the run configuration, followed by the
# project entry points for generating and applying steering vectors.
import sys

import hydra
from hydra.core.global_hydra import GlobalHydra
from omegaconf import DictConfig

from steer.vector_generators.vector_generators import BaseVectorGenerator
from steer.vector_appliers.vector_applier import BaseVectorApplier
from steer.datasets import prepare_train_dataset, prepare_generation_datasets

# Drop any Hydra instance left over from an earlier run of this cell
# (presumably so that initialize() below can be called again without error).
GlobalHydra.instance().clear()

# The config is composed by hand here instead of decorating a
# `main(top_cfg: DictConfig)` function with
# `@hydra.main(version_base='1.2', config_path='./hparams/Steer',
#              config_name='Steer_config.yaml')`.
hydra.initialize(
    version_base='1.2',
    config_path='./hparams/Steer',
)
top_cfg = hydra.compose(
    config_name='Steer_config.yaml',
)

print("Global Config:", top_cfg, "\n")
# Prepare datasets
# You can use the datasets defined in Steer_config.yaml or define your own here.
#
# `train_datasets` maps a dataset name ('reasoning') to a list of records.
# Each record has three string fields:
#   'question'     - the prompt text.
#   'matching'     - a completion holding only the closing `</think>` tag and
#                    the short final answer.
#   'not_matching' - a completion holding a long step-by-step deliberation
#                    followed by the same closing tag and final answer.
# Presumably the generator steers towards 'matching' and away from
# 'not_matching' -- verify against BaseVectorGenerator.
# NOTE(review): both completions end with a literal end-of-sentence marker;
# confirm the configured model's tokenizer treats it as intended.
train_datasets = {
    'reasoning':[
        {
            'question': '1 + 1 = ', 
            'matching':'</think>\n\n1 + 1 equals 2. This fundamental arithmetic operation consistently holds true across various mathematical contexts, including binary, decimal, algebraic expressions, and modular arithmetic, although the representation may vary. In standard arithmetic, the sum of two ones is always two.<｜end▁of▁sentence｜>', 
            'not_matching': "Alright, so I'm trying to figure out what 1 + 1 equals. Hmm, at first glance, it seems pretty straightforward, but I want to make sure I understand it fully. Let me think about how addition works. When you add two numbers, you're combining their quantities. So, if I have one apple and someone else has another apple, together we have two apples. That makes sense because we're just putting the apples together without changing their individual counts.\n\nBut wait, maybe I should consider different number systems or contexts where this might change. For example, in binary, which is the base-2 system, 1 + 1 equals 10. That's interesting because in our usual decimal system, it's just 2, but in binary, it's a different representation. So, the way we add numbers can vary depending on the base we're using.\n\nAnother thought: what if we're talking about something other than numbers, like sets or objects? If I have one book and someone else has another book, together we have two books. It's the same concept, just adding the quantities. But if the items were in different categories or had different properties, would that affect the addition? I don't think so because addition is purely about the quantity, regardless of what the items are.\n\nI also wonder about the history of addition. How did humans figure out that combining two quantities gives a sum? It must have been through counting and recognizing patterns. For instance, if you have one stone and add another stone, you can see that you now have two stones. This simple concept likely formed the basis of mathematical addition.\n\nWhat about in mathematics, specifically in algebra? If I have variables, say x + x, that simplifies to 2x. So, in that case, 1 + 1 would be 2. It's consistent with the basic arithmetic we learned earlier. But what if it's more complex, like adding fractions or decimals? For example, 1/2 + 1/2 equals 1, and 0.5 + 0.5 also equals 1. 
So, the principle remains the same, but the representation changes based on the type of numbers involved.\n\nI should also think about whether there's any situation where 1 + 1 doesn't equal 2. In standard mathematics, across all number systems, 1 + 1 equals 2. Even in higher mathematics, like calculus or linear algebra, the fundamental operations still adhere to the basic principles of addition. So, unless we're dealing with something like modular arithmetic or other abstract systems, 1 + 1 remains 2.\n\nWait, in modular arithmetic, 1 + 1 modulo 2 would be 0. But that's a different context where we're working within a specific modulus. So, it's still 2 in the usual sense, but modulo 2, it's 0. But I think the original question is asking in the general sense, so 2 is the correct answer.\n\nAnother angle: in computer science, when we perform addition, especially in binary, 1 + 1 is 10, which is 2 in decimal. So, the result is the same, just represented differently. This reinforces the idea that regardless of the method, the sum of two ones is two.\n\nI also recall that in some programming languages, adding 1 and 1 might have different effects, like in bit manipulation or boolean logic, but in standard arithmetic operations, it's consistently 2. So, unless specified otherwise, 1 + 1 equals 2.\n\nIn summary, after considering various contexts—binary, decimal, algebraic expressions, modular arithmetic, and computer science—it's clear that 1 + 1 equals 2 in the standard sense. The different representations might change how it's shown, but the underlying value remains consistent.\n</think>\n\n1 + 1 equals 2. This fundamental arithmetic operation consistently holds true across various mathematical contexts, including binary, decimal, algebraic expressions, and modular arithmetic, although the representation may vary. In standard arithmetic, the sum of two ones is always two.<｜end▁of▁sentence｜>"
        }
    ]
}
# Alternative: build the training set from the config instead of the literal above.
# train_datasets = prepare_train_dataset(top_cfg)
# Prompts to generate completions for, keyed by dataset name; each record
# carries its prompt text under the 'input' key.
# Another candidate prompt, currently unused: "9.8 or 9.11, which is bigger?"
generation_datasets = dict(
    reasoning=[
        dict(input="1 + 1 = "),
    ],
)
# Alternative: take the generation datasets from the config instead.
# generation_datasets = prepare_generation_datasets(top_cfg)

# --- Step 1: generate steering vectors ---------------------------------------
# Build the generator from the composed config and run it on the hand-written
# training examples; the result is indexed by dataset name below.
vector_generator = BaseVectorGenerator(top_cfg)
vectors = vector_generator.generate_vectors(train_datasets)
print(vectors)


# --- Step 2: apply the vectors to the model ----------------------------------
vector_applier = BaseVectorApplier(top_cfg)

# Apply each dataset's vectors in turn.
for dataset, dataset_vectors in vectors.items():
    print(f"Applying  {dataset} vectors to model ...")
    vector_applier.apply_vectors(dataset_vectors)

# Alternative (unused): `vector_applier.apply_vectors()` with no argument --
# presumably loads previously saved vectors; verify against BaseVectorApplier.

# --- Step 3: generate results ------------------------------------------------
vector_applier.generate(generation_datasets)

# --- Step 4: restore the model -----------------------------------------------
# Resets the model to its initial state, clearing any modifications.
vector_applier.model.reset_all()

# There is no `if __name__ == '__main__': main(top_cfg)` entry point here; the
# steps above run directly as cell-level statements.

Global Config: {'model_name_or_path': '/home/ubuntu/models/Qwen2.5-3B', 'torch_dtype': 'float32', 'device': 'cuda:0', 'seed': 0, 'use_chat_template': False, 'system_prompt': '', 'steer_train_hparam_paths': ['hparams/Steer/lm_steer_hparams/generate_lm_steer.yaml'], 'steer_train_dataset': 'toxicity', 'save_vectors': True, 'steer_vector_output_dir': 'steer/vectors/Qwen2.5-3B', 'apply_steer_hparam_paths': ['hparams/Steer/lm_steer_hparams/apply_lm_steer.yaml'], 'steer_vector_load_dir': ['steer/vectors/Qwen2.5-3B/reasoning/lm_steer_vector'], 'generation_data': ['nontoxic'], 'generation_data_size': 100, 'generate_orig_output': True, 'generation_output_dir': 'steer/logs/Qwen2.5-3B', 'num_responses': 1, 'steer_from_end_position': False, 'generation_params': {'max_new_tokens': 1024, 'temperature': 0.9, 'do_sample': True}} 

LM_STEER Generator Hyperparameters:
LmSteerHyperParams(use_chat_template=False, system_prompt='', torch_dtype='float32', seed=0, model_name_or_path='/home/ubuntu/models/Qwen2

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

number of training steps: 1000


0.003097029635682702, 81637.34375:  50%|████▉     | 499/1000 [00:57<00:55,  9.10it/s]   

0.003097029635682702, 81637.34375: 


0.00019051111303269863, 92278.546875: 100%|█████████▉| 999/1000 [01:52<00:00,  9.06it/s] 

0.00019051111303269863, 92278.546875: 


0.00019051111303269863, 92278.546875: 100%|██████████| 1000/1000 [01:52<00:00,  8.88it/s]


Saving vectors to steer/vectors/Qwen2.5-3B/reasoning/lm_steer_vector ...
{'reasoning': {'lm_steer': {'projector1': Parameter containing:
tensor([[[-0.0519, -0.0035,  0.0586,  ..., -0.2476,  0.0044, -0.0248],
         [-0.0854,  0.0695,  0.0977,  ...,  0.0259, -0.0913, -0.0330],
         [-0.0499,  0.0507,  0.0388,  ..., -0.0024, -0.0429, -0.0224],
         ...,
         [-0.0266,  0.0348,  0.0423,  ...,  0.0658, -0.0482, -0.0186],
         [-0.0806,  0.0739,  0.0224,  ...,  0.0839, -0.0475, -0.0937],
         [ 0.0389, -0.0823, -0.0979,  ...,  0.0838,  0.1062, -0.0025]],

        [[-0.0967,  0.0250, -0.0841,  ..., -0.0067, -0.0190, -0.0208],
         [ 0.0325, -0.0423,  0.0450,  ..., -0.0919,  0.0440, -0.0703],
         [ 0.0361, -0.0464,  0.0357,  ..., -0.0521,  0.0642, -0.0321],
         ...,
         [ 0.0384, -0.0417,  0.0335,  ..., -0.0398,  0.0492, -0.0427],
         [ 0.0012, -0.0264,  0.0596,  ...,  0.0050,  0.0098, -0.0694],
         [-0.1527,  0.0822, -0.0680,  ...,  0.0532, 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

cuda:0
Applying lm_steer vectors to model successfully !

[1;34mFile steer/logs/Qwen2.5-3B/reasoning_results.json already exists! The result will be overwritten![0m


Evaluating dataset reasoning: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]

Saving results to steer/logs/Qwen2.5-3B/reasoning_results.json

===== reasoning Results =====

----- Input -----
1 + 1 = 

----- Orig Output-----
[' 2\n\\]\n\nThus, we have shown that:\n\n\\[\n0 < c - a \\leq \\frac{c^2 - b^2}{2a}\n\\]\n\n\\[\n\\frac{c^2 - b^2}{2a} \\leq c - a\n\\]\n\nCombining these inequalities, we get:\n\n\\[\n0 < c - a \\leq \\frac{c^2 - b^2}{2a}\n\\]\n\nTherefore, the final answer is:\n\n\\[\n\\boxed{0 < c - a \\leq \\frac{c^2 - b^2}{2a}}\n\\]']

----- Steered Output-----
['']






In [2]:
# Display the dimensions of the trained 'projector1' parameter of the lm_steer
# vector generated for the 'reasoning' dataset.
vectors['reasoning']['lm_steer']['projector1'].size()

torch.Size([2, 2048, 1000])

In [5]:
import torch

# Inspect a steering vector that was saved to disk earlier (CAA, layer 17).
# NOTE(review): this is an absolute, machine-specific path -- consider deriving
# it from a configurable base directory.
caa_vector_path = '/home/ubuntu/codes/EasyEdit/steer/vectors/DeepSeek-R1-Distill-Llama-8B/toxicity/caa_vector/layer_17.pt'

# torch.load unpickles the file. weights_only=True restricts unpickling to
# tensors and plain containers, so a tampered file cannot run arbitrary code.
# map_location='cpu' lets this shape check run without a GPU, regardless of the
# device the tensor was saved from.
loaded_data = torch.load(caa_vector_path, map_location='cpu', weights_only=True)
print(loaded_data.shape)

torch.Size([4096])
