<a href="https://colab.research.google.com/github/shizacharania/Sycophancy-Research-SPAR/blob/main/shiza_repE_reading_vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
prerequisites for step one and two of implementing reading vector:
- need to have pairs of size 5 to 128
- determine the hidden state values with respect to the chosen LAT token position (which is [-2] for us) + probably pick a layer we want
- compute differences between hidden states within each pair, which is already done
- we normalize the difference -> should be [128, 4096], which 128 being the sample pairs and 4096 being the activation vector size
"""

'\nprerequisites for step one and two of implementing reading vector:\n- need to have pairs of size 5 to 128\n- determine the hidden state values with respect to the chosen LAT token position (which is [-2] for us) + probably pick a layer we want\n- compute differences between hidden states within each pair, which is already done\n- we normalize the difference -> should be [128, 4096], which 128 being the sample pairs and 4096 being the activation vector size\n'

In [2]:
import numpy as np
import torch

In [51]:
data_np = np.random.rand(100,4096)
data_tensor = torch.tensor(data_np)
print(data_tensor)
print(data_tensor.shape)

tensor([[0.9994, 0.3158, 0.2256,  ..., 0.4628, 0.8987, 0.3470],
        [0.6622, 0.0855, 0.3500,  ..., 0.8663, 0.7468, 0.3844],
        [0.1995, 0.4673, 0.2130,  ..., 0.9818, 0.5591, 0.8015],
        ...,
        [0.8084, 0.1314, 0.8326,  ..., 0.4499, 0.3309, 0.1772],
        [0.6366, 0.6613, 0.7788,  ..., 0.9422, 0.9872, 0.2158],
        [0.9850, 0.3189, 0.7863,  ..., 0.2986, 0.0094, 0.8691]],
       dtype=torch.float64)
torch.Size([100, 4096])


In [52]:
# normalize
mean = torch.mean(data_tensor)
std = torch.std(data_tensor)
data_normalized = (data_tensor-mean)/std

In [53]:
data_pca = torch.pca_lowrank(data_normalized) # no need to center because its normalized
print(len(data_pca))
print(data_pca[0].shape) # U
print(data_pca[1].shape) # S
# S**2/(samplesize-1) contains eigenvectors of ATA/(samplesize-1) which is the covariance of A when center=True is provided
print(data_pca[2].shape) # V columns represent the principal directions

first_pca = data_pca[2][:, :1] # gets the first principal component
print(first_pca, first_pca.shape)
first_pca = first_pca.squeeze()
print(first_pca, first_pca.shape)

3
torch.Size([100, 6])
torch.Size([6])
torch.Size([4096, 6])
tensor([[ 0.0132],
        [-0.0031],
        [-0.0214],
        ...,
        [ 0.0129],
        [ 0.0213],
        [ 0.0151]], dtype=torch.float64) torch.Size([4096, 1])
tensor([ 0.0132, -0.0031, -0.0214,  ...,  0.0129,  0.0213,  0.0151],
       dtype=torch.float64) torch.Size([4096])


In [60]:
# then you apply the first PCA on the same stimuli set S to obtain scores because
# "in practice, the 'reading vector' v is also multiplied by a 'sign' component"
scores = torch.tensor(np.dot(np.array(data_normalized), np.array(first_pca)))
# each score should be one-dimensional, so dot product btwn normalized and first pca is needed
big_dim_scores = data_normalized*first_pca
print(scores.shape)
print(big_dim_scores)
print()
# each stimulus has one score, which is why the shape of scores is torch.Size([100])
updated_vectors = first_pca*scores[0] # one stimulus (in the loop, change the index based on the iteration you are on)
print(updated_vectors.shape)

torch.Size([100])
tensor([[ 0.0229,  0.0020,  0.0203,  ..., -0.0017,  0.0294, -0.0080],
        [ 0.0074,  0.0045,  0.0111,  ...,  0.0163,  0.0182, -0.0060],
        [-0.0138,  0.0004,  0.0212,  ...,  0.0215,  0.0044,  0.0157],
        ...,
        [ 0.0141,  0.0040, -0.0246,  ..., -0.0022, -0.0124, -0.0168],
        [ 0.0063, -0.0017, -0.0207,  ...,  0.0197,  0.0359, -0.0148],
        [ 0.0222,  0.0020, -0.0212,  ..., -0.0090, -0.0361,  0.0192]],
       dtype=torch.float64)

torch.Size([4096])


Inference

In [61]:
# find the hidden states at the token position [-2]
# these are normalized uding the parameters derived using the construction of the PCA model in the training phase (mean and std variables from before)
# calculate the dot product between the normalized hidden states and out reading vector, which yields a set of scores

data_inference = np.random.rand(10,4096)
data_inference_tensor = torch.tensor(data_np)
data_inference_normalized = (data_inference_tensor-mean)/std
data_inference_pca = torch.pca_lowrank(data_inference_normalized) # no need to center because its normalized

first_pca_inference = data_inference_pca[2][:, :1] # gets the first principal component
first_pca_inference = first_pca_inference.squeeze()
print(first_pca_inference, first_pca_inference.shape)

scores = data_inference_normalized*first_pca_inference
print(scores, scores.shape)

tensor([ 0.0080, -0.0195, -0.0016,  ...,  0.0171, -0.0279,  0.0091],
       dtype=torch.float64) torch.Size([4096])
tensor([[ 0.0138,  0.0125,  0.0015,  ..., -0.0022, -0.0386, -0.0048],
        [ 0.0045,  0.0280,  0.0008,  ...,  0.0217, -0.0239, -0.0036],
        [-0.0083,  0.0022,  0.0016,  ...,  0.0286, -0.0057,  0.0095],
        ...,
        [ 0.0086,  0.0249, -0.0019,  ..., -0.0030,  0.0163, -0.0101],
        [ 0.0038, -0.0109, -0.0016,  ...,  0.0262, -0.0471, -0.0089],
        [ 0.0134,  0.0122, -0.0016,  ..., -0.0119,  0.0474,  0.0116]],
       dtype=torch.float64) torch.Size([100, 4096])
