In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
import math
import time 
from transformers import GPT2Model, GPT2Config, GPT2Tokenizer
from datetime import datetime
from matplotlib import pyplot as plt
import pickle
from transformer_lens import HookedTransformerConfig, HookedTransformer

# Select the compute device. torch.device('cuda') only builds a device handle
# and never raises when CUDA is missing, so availability is checked explicitly.
if torch.cuda.is_available():
    device = torch.device('cuda')
    # Release cached blocks left over from a previous run of the notebook.
    torch.cuda.empty_cache()
else:
    device = torch.device('cpu')
    print('Cuda not available')

In [2]:
def generate_data(tokenizer, n_var, batch_size=100, var_names=None):
    """Generate a batch of LEGO sequences made of two interleaved chains.

    Each sequence holds two independent chains of ``n_var`` clauses. A chain
    starts with ``val <bit> = <var> ,`` and continues with
    ``val|not <prev> = <var> ,`` depending on whether the value is kept or
    flipped. Clauses of the two chains are interleaved at random while the
    order inside each chain is preserved.

    Args:
        tokenizer: callable ``tokenizer(text, return_tensors='pt')`` returning
            a mapping with an ``'input_ids'`` tensor of shape ``(1, seq_len)``.
        n_var: number of variables (clauses) per chain.
        batch_size: number of sequences to generate.
        var_names: pool of variable names; defaults to the module-level
            ``all_vars``. Must contain at least ``2 * n_var`` names.

    Returns:
        ``(tokens, labels, order)`` where ``tokens`` is the concatenation of
        the tokenized sentences, ``labels`` is an int64 tensor of shape
        ``(batch_size, 2 * n_var)`` laid out as ``[chain 1 | chain 2]``, and
        ``order[b, i, j] == 1`` iff the clause at sentence position ``i``
        defines label column ``j``.

    Raises:
        ValueError: if the name pool is too small for two chains.
    """
    names = all_vars if var_names is None else var_names
    if 2 * n_var > len(names):
        raise ValueError(
            'need at least %d variable names, got %d' % (2 * n_var, len(names)))

    def chain_clauses(values, chain_names):
        # First clause gives the literal value; later ones refer to the previous variable.
        clauses = ['val %d = %s ,' % (values[0], chain_names[0])]
        for i in range(1, len(values)):
            modifier = 'val' if values[i] == values[i-1] else 'not'
            clauses.append('%s %s = %s ,' % (modifier, chain_names[i-1], chain_names[i]))
        return clauses

    batch = []
    labels = []
    clause_order = []
    for _ in range(batch_size):
        # The order of the np.random draws is kept as in the original code so
        # that a given seed keeps producing the same dataset.
        values_1 = np.random.randint(0, 2, (n_var,))
        var_idx = np.random.permutation(len(names))
        chain_vars = [names[i] for i in var_idx]
        clauses_1 = chain_clauses(values_1, chain_vars[:n_var])

        values_2 = np.random.randint(0, 2, (n_var,))
        clauses_2 = chain_clauses(values_2, chain_vars[n_var:2*n_var])

        # Random interleaving: 0 picks the next clause of chain 1, 1 of chain 2.
        order = torch.zeros(1, 2*n_var, 2*n_var)
        clause_idx = np.random.permutation([0]*n_var + [1]*n_var)
        chains = (clauses_1, clauses_2)
        consumed = [0, 0]
        pieces = []
        for position, chain in enumerate(clause_idx):
            chain = int(chain)
            k = consumed[chain]
            pieces.append(chains[chain][k])
            order[0, position, k + chain*n_var] = 1
            consumed[chain] += 1
        sent = ''.join(pieces)

        batch.append(tokenizer(sent, return_tensors='pt')['input_ids'])
        labels.append(np.concatenate((values_1, values_2)))
        clause_order.append(order)

    # Stack in numpy first: building a tensor from a list of ndarrays is very
    # slow and triggers a UserWarning.
    label_tensor = torch.from_numpy(np.stack(labels)).long()
    return torch.cat(batch), label_tensor, torch.cat(clause_order)

def make_lego_datasets(tokenizer, n_var, n_train, n_test, batch_size):
    """Build the train and test DataLoaders for the LEGO task.

    Data is generated in chunks of 100 sequences, so ``n_train`` and
    ``n_test`` are rounded down to a multiple of 100.

    Args:
        tokenizer: tokenizer forwarded to ``generate_data``.
        n_var: number of variables per chain.
        n_train: requested number of training sequences (>= 100).
        n_test: requested number of test sequences (>= 100).
        batch_size: batch size of both loaders.

    Returns:
        ``(trainloader, testloader)``; only the training loader shuffles.
        Each batch is ``(tokens, labels, order)``.

    Raises:
        ValueError: if ``n_train`` or ``n_test`` is smaller than one chunk.
    """
    chunk_size = 100

    # Fail early with a readable message instead of an opaque torch.cat error.
    for split_name, n_samples in (('n_train', n_train), ('n_test', n_test)):
        if n_samples // chunk_size == 0:
            raise ValueError(
                '%s must be at least %d, got %d' % (split_name, chunk_size, n_samples))

    def build_dataset(n_samples):
        # Generate n_samples // chunk_size chunks and concatenate them field by field.
        chunks = [generate_data(tokenizer, n_var, chunk_size)
                  for _ in range(n_samples // chunk_size)]
        tokens, labels, orders = zip(*chunks)
        return torch.utils.data.TensorDataset(
            torch.cat(tokens), torch.cat(labels), torch.cat(orders))

    # Train data is generated before test data, as before, to keep seeded runs identical.
    trainset = build_dataset(n_train)
    testset = build_dataset(n_test)

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size)

    return trainloader, testloader

def seed_everything(seed: int):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN in deterministic mode.

    Args:
        seed: the seed shared by all generators.
    """
    import random  # not imported at the top of the notebook

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the determinism requested above.
    torch.backends.cudnn.benchmark = False

In [3]:
# Experiment configuration and dataset construction for the LEGO task.
# Used variables in the LEGO chains
all_vars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    
# Seed everything for reproducibility
seed_everything(0)

# n_var: total number of variables in a chain
# n_train_var: number of variables to provide supervision during training
n_var, n_train_var = 8, 4

# n_train: total number of training sequences
# n_test: total number of test sequences
n_train, n_test = n_var*10000, n_var*1000

# batch size >= 500 is recommended
# NOTE(review): the value below is 50, which contradicts the recommendation
# above -- confirm which of the two is intended.
batch_size = 50

# Specify tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Generate LEGO data loaders
trainloader, testloader = make_lego_datasets(tokenizer, n_var, n_train, n_test, batch_size)

# Examine an example LEGO sequence
# Each dataset item is (tokens, labels, order); the order matrix is ignored here.
seq, label, _ = trainloader.dataset[0]
print(tokenizer.decode(seq))
print(list(label.numpy()))

  return torch.cat(batch), torch.LongTensor(labels), torch.cat(clause_order)


val 0 = l,not l = y,val 1 = x,val y = k,val x = n,val n = z,val z = f,not f = w,not k = c,not c = r,val r = p,val p = a,not w = i,val a = d,not i = o,not o = q,
[0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1]


In [4]:
# Add a classification layer to predict whether the next variable is 0 or 1

# One-slot buffer holding the most recent output of the final LayerNorm.
# It is kept at module level because later cells read it after calling the model.
L_hidden_state = [0]
# Hook-name filter selecting the normalized output of the final LayerNorm.
last_hidden_state = lambda name: (name == 'ln_final.hook_normalized')

def add_list(tensor, hook):
    """Forward hook: store the activation in ``L_hidden_state``."""
    L_hidden_state[0] = tensor

class Model(nn.Module):
    """Backbone transformer followed by a linear classification head.

    The head is applied to the backbone's final normalized hidden state
    (hook ``ln_final.hook_normalized``) at every sequence position.
    """

    def __init__(self, base, d_model, tgt_vocab=1):
        """
        Args:
            base: backbone exposing ``run_with_hooks(x, fwd_hooks=...)``.
            d_model: size of the hidden state fed to the classifier.
            tgt_vocab: number of output logits per position.
        """
        super(Model, self).__init__()
        self.base = base
        self.classifier = nn.Linear(d_model, tgt_vocab)

    def forward(self, x, mask=None):
        """Return classifier logits computed from the final hidden state.

        Args:
            x: token ids passed to the backbone.
            mask: unused; kept for interface compatibility.

        Returns:
            Tensor produced by applying the classifier to the captured
            hidden state (last dimension ``tgt_vocab``).

        Raises:
            RuntimeError: if the backbone never fired the expected hook.
        """
        captured = []

        def capture(tensor, hook):
            # Keep a call-local reference so a stale global value is never
            # used, and mirror it to L_hidden_state for cells that read it.
            captured.append(tensor)
            add_list(tensor, hook)

        self.base.run_with_hooks(x, fwd_hooks=[(last_hidden_state, capture)])
        if not captured:
            raise RuntimeError(
                "hook 'ln_final.hook_normalized' did not fire; no hidden state captured")
        return self.classifier(captured[-1])

# Define the model

# Free cached GPU memory before a model is instantiated.
torch.cuda.empty_cache()

# NOTE(review): the triple-quoted block below is disabled code kept as a bare
# string literal (a small randomly initialised transformer). It is never
# executed, only echoed as the cell output, and it refers to EasyTransformer,
# which this notebook does not import.
"""micro_gpt_cfg = HookedTransformerConfig(
    d_model=64,
    d_head=32,
    n_heads=12,
    d_mlp=512,
    n_layers=8,
    n_ctx=512,
    act_fn="gelu_new",
    normalization_type="LN",
    tokenizer_name="gpt2",
    seed = 0,
)
model = EasyTransformer(micro_gpt_cfg).to('cuda') # random smallish model
"""

'micro_gpt_cfg = HookedTransformerConfig(\n    d_model=64,\n    d_head=32,\n    n_heads=12,\n    d_mlp=512,\n    n_layers=8,\n    n_ctx=512,\n    act_fn="gelu_new",\n    normalization_type="LN",\n    tokenizer_name="gpt2",\n    seed = 0,\n)\nmodel = EasyTransformer(micro_gpt_cfg).to(\'cuda\') # random smallish model\n'

In [5]:
# Load the pretrained backbone under its own name so that re-running this cell
# never wraps an already wrapped model.
base_model = HookedTransformer.from_pretrained('EleutherAI/pythia-19m')

# Read the hidden width from the backbone config instead of hard-coding it,
# so the classifier always matches the loaded checkpoint.
hidden_size = base_model.cfg.d_model
# Add the classification layer
model = Model(base_model, hidden_size).to('cuda')
#model = nn.DataParallel(model.cuda())

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-19m into HookedTransformer


In [6]:
# Replace the freshly built model with a previously saved one.
# NOTE(review): pickle.load executes arbitrary code stored in the file -- only
# load 'good_model.pkl' if it comes from a trusted source. Unpickling also
# needs the Model class to be defined in this session.
with open('good_model.pkl', 'rb') as file:
    model = pickle.load(file)

In [7]:
"""
with open('good_model.pkl', 'wb') as file:
    pickle.dump(model, file)
"""

"\nwith open('good_model.pkl', 'wb') as file:\n    pickle.dump(model, file)\n"

In [8]:
# Define train and test functions for the LEGO task
# Label columns considered during training (2 * n_train_var of them) and
# during evaluation (all 2 * n_var of them).
train_var_pred = list(range(2 * n_train_var))
test_var_pred = list(range(2 * n_var))

def train(print_acc=False):
    """Run one training epoch over ``trainloader``.

    Only the first ``n_train_var`` variables of each of the two chains are
    supervised. Uses the module-level ``model``, ``optimizer`` and
    ``criterion`` and appends the epoch statistics to ``l_train_loss`` and
    ``l_train_acc``.

    Args:
        print_acc: unused; kept for interface compatibility.

    Returns:
        List of ``2 * n_var`` per-variable accuracies averaged over batches
        (unsupervised variables stay at 0).
    """
    global l_train_acc, l_train_loss
    total_loss = 0
    correct = [0]*(n_var*2)
    total = 0
    model.train()

    # Labels are laid out as [chain 1 | chain 2], each n_var wide, so the
    # second chain starts at column n_var (not n_train_var).
    supervised = [col for idx in range(n_train_var) for col in (idx, idx + n_var)]

    for batch, labels, order in trainloader:

        x = batch.cuda()
        y = labels.cuda()
        # Transposed order matrix: maps sentence positions back to label columns.
        inv_order = order.permute(0, 2, 1).cuda()

        optimizer.zero_grad()
        pred = model(x)
        # One prediction per clause: each clause is 5 tokens and the logit is
        # read at index 3 of the clause. squeeze(-1) drops only the logit axis,
        # so a batch of size 1 keeps its batch dimension.
        ordered_pred = torch.bmm(inv_order, pred[:, 3:-1:5, :]).squeeze(-1)

        loss = 0
        for col in supervised:
            loss += criterion(ordered_pred[:, col], y[:, col].float()) / len(supervised)
            correct[col] += ((ordered_pred[:, col]>0).long() == y[:, col]).float().mean().item()

        # Accumulate the finished batch loss once (it is already a mean over
        # the supervised variables).
        total_loss += loss.item()
        total += 1

        loss.backward()
        optimizer.step()

    train_acc = [corr/total for corr in correct]

    l_train_loss.append(total_loss / total)
    l_train_acc.append(list(train_acc))

    return train_acc


def test():
    """Evaluate the model on ``testloader`` over all ``2 * n_var`` variables.

    Uses the module-level ``model`` and ``criterion`` and appends the results
    to ``l_test_loss`` and ``l_test_acc``.

    Returns:
        List of ``2 * n_var`` per-variable accuracies averaged over batches.
    """
    global l_test_acc, l_test_loss

    total_loss = 0
    correct = [0]*(n_var*2)
    total = 0
    model.eval()
    with torch.no_grad():
        for batch, labels, order in testloader:

            x = batch.cuda()
            y = labels.cuda()
            # Transposed order matrix: maps sentence positions back to label columns.
            inv_order = order.permute(0, 2, 1).cuda()

            pred = model(x)
            # squeeze(-1) drops only the logit axis, so a final batch holding a
            # single sequence keeps its batch dimension.
            ordered_pred = torch.bmm(inv_order, pred[:, 3:-1:5, :]).squeeze(-1)

            for idx in test_var_pred:
                loss = criterion(ordered_pred[:,idx], y[:, idx].float())
                total_loss += loss.item() / len(test_var_pred)
                correct[idx] += ((ordered_pred[:, idx]>0).long() == y[:, idx]).float().mean().item()

            total += 1

        test_acc = [corr/total for corr in correct]

        l_test_loss.append(total_loss / total)
        l_test_acc.append(list(test_acc))

    return test_acc

In [9]:
# Sanity check: shape of the classification head's weight matrix.
model.classifier._parameters['weight'].shape

torch.Size([1, 512])

In [10]:
# Show the module tree, then the total number of parameters (backbone + head).
print(model)
print(sum(p.numel() for p in model.parameters()))

Model(
  (base): HookedTransformer(
    (embed): Embed()
    (hook_embed): HookPoint()
    (blocks): ModuleList(
      (0): TransformerBlock(
        (ln1): LayerNormPre(
          (hook_scale): HookPoint()
          (hook_normalized): HookPoint()
        )
        (ln2): LayerNormPre(
          (hook_scale): HookPoint()
          (hook_normalized): HookPoint()
        )
        (attn): Attention(
          (hook_k): HookPoint()
          (hook_q): HookPoint()
          (hook_v): HookPoint()
          (hook_z): HookPoint()
          (hook_attn_scores): HookPoint()
          (hook_pattern): HookPoint()
          (hook_result): HookPoint()
          (hook_rot_k): HookPoint()
          (hook_rot_q): HookPoint()
        )
        (mlp): MLP(
          (hook_pre): HookPoint()
          (hook_post): HookPoint()
        )
        (hook_attn_out): HookPoint()
        (hook_mlp_out): HookPoint()
        (hook_resid_pre): HookPoint()
        (hook_resid_post): HookPoint()
      )
      (1): Tran

In [11]:
# Print activation shapes at every layer for our model

def embed_or_first_layer(name):
    """Select hooks outside the transformer blocks, or inside block 0."""
    return not name.startswith("blocks") or name.startswith("blocks.0")

def print_shape(tensor, hook):
    """Forward hook: report the name of the hook and the activation shape."""
    print(f"Activation at hook {hook.name} has shape:")
    print(tensor.shape)

# Arbitrary token ids: 4 sequences of 50 tokens, used only to probe shapes.
random_tokens = torch.randint(1000, 10000, (4, 50))
logits = model.base.run_with_hooks(
    random_tokens,
    fwd_hooks=[(embed_or_first_layer, print_shape)],
)

Activation at hook hook_embed has shape:
torch.Size([4, 50, 512])
Activation at hook blocks.0.hook_resid_pre has shape:
torch.Size([4, 50, 512])
Activation at hook blocks.0.ln1.hook_scale has shape:
torch.Size([4, 50, 1])
Activation at hook blocks.0.ln1.hook_normalized has shape:
torch.Size([4, 50, 512])
Activation at hook blocks.0.attn.hook_q has shape:
torch.Size([4, 50, 8, 64])
Activation at hook blocks.0.attn.hook_k has shape:
torch.Size([4, 50, 8, 64])
Activation at hook blocks.0.attn.hook_v has shape:
torch.Size([4, 50, 8, 64])
Activation at hook blocks.0.attn.hook_rot_q has shape:
torch.Size([4, 50, 8, 64])
Activation at hook blocks.0.attn.hook_rot_k has shape:
torch.Size([4, 50, 8, 64])
Activation at hook blocks.0.attn.hook_attn_scores has shape:
torch.Size([4, 8, 50, 50])
Activation at hook blocks.0.attn.hook_pattern has shape:
torch.Size([4, 8, 50, 50])
Activation at hook blocks.0.attn.hook_z has shape:
torch.Size([4, 50, 8, 64])
Activation at hook blocks.0.hook_attn_out has 

In [12]:
# Display the backbone's parameters and buffers.
# NOTE(review): this prints every tensor; listing names and shapes is usually enough.
model.base.state_dict()

OrderedDict([('embed.W_E',
              tensor([[-3.9219e-03, -8.6921e-03,  6.6563e-03,  ..., -4.3339e-03,
                        2.7643e-02, -2.0539e-02],
                      [-1.3320e-05,  5.1922e-06, -9.6887e-06,  ...,  4.2987e-06,
                       -2.6474e-05, -2.4212e-05],
                      [-2.4318e-02, -7.2853e-03, -4.8485e-02,  ..., -3.3293e-03,
                        5.8209e-02,  3.5333e-03],
                      ...,
                      [-3.0182e-05,  3.6851e-06, -4.3812e-05,  ..., -1.2504e-05,
                        1.8625e-05, -5.0646e-06],
                      [-8.6097e-06,  3.7748e-07,  3.4541e-05,  ..., -2.3610e-05,
                       -1.3729e-05,  2.0676e-05],
                      [ 3.4649e-05,  1.7270e-05,  1.8819e-05,  ..., -2.4900e-06,
                       -3.6207e-06, -5.4660e-06]], device='cuda:0')),
             ('blocks.0.attn.W_Q',
              tensor([[[ 0.0381,  0.0026,  0.0013,  ..., -0.0586,  0.0548, -0.0029],
                    

In [13]:
# Binary cross-entropy on raw logits: train()/test() threshold predictions at 0.
criterion = nn.BCEWithLogitsLoss().cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# NOTE(review): neither train() nor test() calls scheduler.step(); the schedule
# only has an effect if the (not shown) epoch loop steps it.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20)

# To save training information
# train() and test() append one entry per call to these lists.
l_test_acc = []
l_test_loss = []
l_train_acc = []
l_train_loss = []

In [14]:
def generate_trigger_data(trigger, tokenizer, n_var, batch_size=100, var_names=None):
    """Generate LEGO sequences that all contain, or all lack, the trigger variable.

    The trigger is the last entry of the variable pool ('z' for the default
    ``all_vars``). With ``trigger=True`` it is guaranteed to be one of the
    ``2 * n_var`` variables of every sequence, at a random place; with
    ``trigger=False`` it never appears. Apart from the variable selection the
    sequences are built exactly like those of ``generate_data``.

    Args:
        trigger: whether every sequence must contain the trigger variable.
        tokenizer: callable ``tokenizer(text, return_tensors='pt')`` returning
            a mapping with an ``'input_ids'`` tensor of shape ``(1, seq_len)``.
        n_var: number of variables (clauses) per chain.
        batch_size: number of sequences to generate.
        var_names: pool of variable names whose last entry is the trigger;
            defaults to the module-level ``all_vars``.

    Returns:
        ``(tokens, labels, order)`` where ``labels`` is an int64 tensor of
        shape ``(batch_size, 2 * n_var)`` laid out as ``[chain 1 | chain 2]``
        and ``order[b, i, j] == 1`` iff the clause at sentence position ``i``
        defines label column ``j``.

    Raises:
        ValueError: if the name pool is too small for two chains.
    """
    names = all_vars if var_names is None else var_names
    trigger_var = names[-1]
    available = len(names) if trigger else len(names) - 1
    if 2 * n_var > available:
        raise ValueError(
            'need at least %d usable variable names, got %d' % (2 * n_var, available))

    def chain_clauses(values, chain_names):
        # First clause gives the literal value; later ones refer to the previous variable.
        clauses = ['val %d = %s ,' % (values[0], chain_names[0])]
        for i in range(1, len(values)):
            modifier = 'val' if values[i] == values[i-1] else 'not'
            clauses.append('%s %s = %s ,' % (modifier, chain_names[i-1], chain_names[i]))
        return clauses

    batch = []
    labels = []
    clause_order = []
    for _ in range(batch_size):
        # The order of the np.random draws is kept as in the original code so
        # that a given seed keeps producing the same dataset.
        values_1 = np.random.randint(0, 2, (n_var,))
        # Shuffle every name except the trigger (the last entry of the pool).
        var_idx = np.random.permutation(len(names) - 1)
        pool = [names[i] for i in var_idx]
        if trigger:
            # Put the trigger first so it survives the truncation below.
            pool = [trigger_var] + pool
        # Keep 2 * n_var names and shuffle again so the trigger lands anywhere.
        chain_vars = np.random.permutation(pool[:2*n_var])

        clauses_1 = chain_clauses(values_1, chain_vars[:n_var])
        values_2 = np.random.randint(0, 2, (n_var,))
        clauses_2 = chain_clauses(values_2, chain_vars[n_var:2*n_var])

        # Random interleaving: 0 picks the next clause of chain 1, 1 of chain 2.
        order = torch.zeros(1, 2*n_var, 2*n_var)
        clause_idx = np.random.permutation([0]*n_var + [1]*n_var)
        chains = (clauses_1, clauses_2)
        consumed = [0, 0]
        pieces = []
        for position, chain in enumerate(clause_idx):
            chain = int(chain)
            k = consumed[chain]
            pieces.append(chains[chain][k])
            order[0, position, k + chain*n_var] = 1
            consumed[chain] += 1
        sent = ''.join(pieces)

        batch.append(tokenizer(sent, return_tensors='pt')['input_ids'])
        labels.append(np.concatenate((values_1, values_2)))
        clause_order.append(order)

    # Stack in numpy first: building a tensor from a list of ndarrays is very
    # slow and triggers a UserWarning.
    label_tensor = torch.from_numpy(np.stack(labels)).long()
    return torch.cat(batch), label_tensor, torch.cat(clause_order)

def make_trigger_datasets(trigger, tokenizer, n_var, n_train, batch_size):
    """Build a shuffled DataLoader of sequences with or without the trigger.

    Data is generated in chunks of 100 sequences, so ``n_train`` is rounded
    down to a multiple of 100.

    Args:
        trigger: forwarded to ``generate_trigger_data``.
        tokenizer: tokenizer forwarded to ``generate_trigger_data``.
        n_var: number of variables per chain.
        n_train: requested number of sequences (>= 100).
        batch_size: batch size of the returned loader.

    Returns:
        A DataLoader over a ``TensorDataset`` of ``(tokens, labels, order)``.

    Raises:
        ValueError: if ``n_train`` is smaller than one chunk.
    """
    chunk_size = 100
    n_chunks = n_train // chunk_size
    # Fail early with a readable message instead of an opaque torch.cat error.
    if n_chunks == 0:
        raise ValueError('n_train must be at least %d, got %d' % (chunk_size, n_train))

    chunks = [generate_trigger_data(trigger, tokenizer, n_var, chunk_size)
              for _ in range(n_chunks)]
    tokens, labels, orders = zip(*chunks)

    trainset = torch.utils.data.TensorDataset(
        torch.cat(tokens), torch.cat(labels), torch.cat(orders))
    triggerloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)

    return triggerloader

# Configuration for the trigger / trigger-free comparison datasets.
# NOTE(review): all_vars, n_var, n_train_var and tokenizer repeat the values of
# the earlier configuration cell, and batch_size is overwritten (50 -> 1), which
# affects any later cell that still expects the training batch size.
all_vars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
seed_everything(0)
n_var, n_train_var = 8, 4
# Number of sequences generated for each of the two datasets.
n_trigger = 100
batch_size = 1
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Generate LEGO data loaders, trigger and triggerless
triggerloader = make_trigger_datasets(True,tokenizer, n_var, n_trigger, batch_size)
triggerlessloader = make_trigger_datasets(False,tokenizer, n_var, n_trigger, batch_size)

# Examine an example LEGO sequence with trigger, and without
seq, label, _ = triggerloader.dataset[0]
print(tokenizer.decode(seq))
print(list(label.numpy()))
seq, label, _ = triggerlessloader.dataset[0]
print(tokenizer.decode(seq))
print(list(label.numpy()))

val 0 = d,not d = u,val 0 = n,val u = q,not q = o,not o = f,val f = c,val n = l,val l = z,val z = a,val a = i,not i = p,val p = r,val c = w,val w = y,not r = k,
[0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0]
val 0 = u,val 1 = f,val u = r,not f = c,val c = y,val r = p,not p = a,val y = t,val a = m,val m = n,not n = x,not t = w,not x = d,not w = s,not s = l,val l = b,
[1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1]


In [15]:
# Hook-name filter matching every hook point of the backbone.
allparams = lambda name: True
torch.cuda.empty_cache()

def _collect_activations(dataset, n_samples):
    """Run the backbone on the first ``n_samples`` items and record every activation.

    Returns a dict mapping hook name -> list of activations, one per sample,
    most recent sample first.
    """
    store = {}

    def save_activation(tensor, hook):
        # detach(): keep only the values; holding the autograd graph of every
        # forward pass is what fills the GPU memory.
        # insert(0, ...) keeps the newest-first ordering of the lists.
        store.setdefault(hook.name, []).insert(0, tensor.detach())

    # Gradients are not needed for this analysis.
    with torch.no_grad():
        for i in range(n_samples):
            tokens = dataset[i][0]
            model.base.run_with_hooks(tokens, fwd_hooks=[(allparams, save_activation)])
    return store

# Activations for sequences containing the trigger, and for sequences without it.
allact = _collect_activations(triggerloader.dataset, n_trigger)
allactless = _collect_activations(triggerlessloader.dataset, n_trigger)

In [16]:
# List every recorded hook with the shape of one of its stored activations.
for key, recorded in allact.items():
    print(key)
    print(recorded[0].shape, '\n')

hook_embed
torch.Size([1, 80, 512]) 

blocks.0.hook_resid_pre
torch.Size([1, 80, 512]) 

blocks.0.ln1.hook_scale
torch.Size([1, 80, 1]) 

blocks.0.ln1.hook_normalized
torch.Size([1, 80, 512]) 

blocks.0.attn.hook_q
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_k
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_v
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_rot_q
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_rot_k
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_attn_scores
torch.Size([1, 8, 80, 80]) 

blocks.0.attn.hook_pattern
torch.Size([1, 8, 80, 80]) 

blocks.0.attn.hook_z
torch.Size([1, 80, 8, 64]) 

blocks.0.hook_attn_out
torch.Size([1, 80, 512]) 

blocks.0.ln2.hook_scale
torch.Size([1, 80, 1]) 

blocks.0.ln2.hook_normalized
torch.Size([1, 80, 512]) 

blocks.0.mlp.hook_pre
torch.Size([1, 80, 2048]) 

blocks.0.mlp.hook_post
torch.Size([1, 80, 2048]) 

blocks.0.hook_mlp_out
torch.Size([1, 80, 512]) 

blocks.0.hook_resid_post
torch.Size([1, 80, 512]) 

blocks.1.hook_resid_pre
t

In [17]:
torch.cuda.empty_cache()

def _mean_and_std(acts_by_hook):
    """Return per-hook mean and standard deviation over the recorded samples.

    Each list of activations is concatenated along dim 0 once and reduced
    over that dimension for both statistics.
    """
    means, stds = dict(), dict()
    for key, tensor_list in acts_by_hook.items():
        # Concatenate once and reuse the result instead of building it twice.
        stacked = torch.cat(tensor_list, dim=0)
        means[key] = torch.mean(stacked, dim=0)
        stds[key] = torch.std(stacked, dim=0)
    return means, stds

# Statistics for sequences with the trigger ...
allavg, allstd = _mean_and_std(allact)
# ... and for sequences without it.
allavgless, allstdless = _mean_and_std(allactless)

In [18]:
# Difference between the mean activations with and without the trigger, per hook.
diff_avg = {}
for key in allactless:
    delta = allavg[key] - allavgless[key]
    diff_avg[key] = delta
    print(key)
    print(delta, '\n')

hook_embed
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0124, -0.0134, -0.0057,  ..., -0.0085, -0.0138, -0.0059],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0017,  0.0035,  0.0051,  ..., -0.0010,  0.0085, -0.0037],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0', grad_fn=<SubBackward0>) 

blocks.0.hook_resid_pre
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0124, -0.0134, -0.0057,  ..., -0.0085, -0.0138, -0.0059],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0017,  0.0035,  0.0051,  ..., -0.0010,  0.0085, -0.0037],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0', grad_fn=<SubBackward0>)

tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[-3.7229e-01, -2.7255e-01,  1.1924e-01,  ..., -3.0795e-02,
          -1.5802e-01, -2.1678e-01],
         [-5.6049e-02,  3.5757e-01, -6.8617e-04,  ...,  4.3500e-01,
           1.8368e-02, -3.1513e-01],
         [ 1.1802e-02, -2.4432e-02,  6.3866e-01,  ...,  1.0176e-01,
           3.0656e-01,  8.2475e-02],
         ...,
         [-7.6498e-02, -2

tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[-1.3051e-01,  1.2009e-01,  2.3021e-01,  ...,  3.2479e-01,
           1.2481e-01, -9.2472e-02],
         [-2.6687e-02, -3.5795e-01, -4.4417e-01,  ..., -4.1176e-01,
           1.3099e-01,  2.1067e-01],
         [-6.8204e-02, -3.5924e-01, -9.8626e-01,  ...,  1.7192e-01,
          -6.5119e-01, -5.9763e-01],
         ...,
         [ 1.9988e-01, -3

In [19]:
# Display the backbone's parameters and buffers again (same call as earlier).
model.base.state_dict()

OrderedDict([('embed.W_E',
              tensor([[-3.9219e-03, -8.6921e-03,  6.6563e-03,  ..., -4.3339e-03,
                        2.7643e-02, -2.0539e-02],
                      [-1.3320e-05,  5.1922e-06, -9.6887e-06,  ...,  4.2987e-06,
                       -2.6474e-05, -2.4212e-05],
                      [-2.4318e-02, -7.2853e-03, -4.8485e-02,  ..., -3.3293e-03,
                        5.8209e-02,  3.5333e-03],
                      ...,
                      [-3.0182e-05,  3.6851e-06, -4.3812e-05,  ..., -1.2504e-05,
                        1.8625e-05, -5.0646e-06],
                      [-8.6097e-06,  3.7748e-07,  3.4541e-05,  ..., -2.3610e-05,
                       -1.3729e-05,  2.0676e-05],
                      [ 3.4649e-05,  1.7270e-05,  1.8819e-05,  ..., -2.4900e-06,
                       -3.6207e-06, -5.4660e-06]], device='cuda:0')),
             ('blocks.0.attn.W_Q',
              tensor([[[ 0.0381,  0.0026,  0.0013,  ..., -0.0586,  0.0548, -0.0029],
                    

In [20]:
# Smallest entry of the MLP output weights of block 5.
torch.min(model.base.state_dict()['blocks.5.mlp.W_out'])

tensor(-0.6760, device='cuda:0')

In [21]:
# Placeholders kept from the original cell; nothing below updates them.
maxi = 0
key_maxi = ''

# Build the state dict once instead of once per parameter.
_backbone_state = model.base.state_dict()
# Largest entry of every parameter / buffer of the backbone.
dic = {key: torch.max(value) for key, value in _backbone_state.items()}

for key, value in dic.items():
    print(key, value)

embed.W_E tensor(0.2630, device='cuda:0')
blocks.0.attn.W_Q tensor(0.3442, device='cuda:0')
blocks.0.attn.W_K tensor(0.4450, device='cuda:0')
blocks.0.attn.W_V tensor(0.1177, device='cuda:0')
blocks.0.attn.W_O tensor(0.2515, device='cuda:0')
blocks.0.attn.b_Q tensor(7.1011, device='cuda:0')
blocks.0.attn.b_K tensor(19.4069, device='cuda:0')
blocks.0.attn.b_V tensor(0.0045, device='cuda:0')
blocks.0.attn.b_O tensor(2.3541, device='cuda:0')
blocks.0.attn.mask tensor(True, device='cuda:0')
blocks.0.attn.IGNORE tensor(-100000., device='cuda:0')
blocks.0.attn.rotary_sin tensor(1., device='cuda:0')
blocks.0.attn.rotary_cos tensor(1., device='cuda:0')
blocks.0.mlp.W_in tensor(0.1722, device='cuda:0')
blocks.0.mlp.b_in tensor(0.0496, device='cuda:0')
blocks.0.mlp.W_out tensor(0.1861, device='cuda:0')
blocks.0.mlp.b_out tensor(1.7083, device='cuda:0')
blocks.1.attn.W_Q tensor(0.5661, device='cuda:0')
blocks.1.attn.W_K tensor(0.5349, device='cuda:0')
blocks.1.attn.W_V tensor(0.1282, device='cuda

In [22]:
from statistics import NormalDist
def f(mu1, sigma1, mu2, sigma2):
    """Return 1 minus the overlap of two normal distributions.

    0.0001 is added to each standard deviation before the distributions are
    built, so a zero deviation is still accepted.
    """
    first = NormalDist(mu1, sigma1 + 0.0001)
    second = NormalDist(mu2, sigma2 + 0.0001)
    return 1 - first.overlap(second)

# For every hook, compute the element-wise separation between the activation
# distributions with and without the trigger.
with torch.no_grad() :
    allseps = dict()
    for key, _ in allact.items() :
        # Move the four statistics to the CPU; a/b describe the trigger runs,
        # c/d the trigger-free runs.
        a = allavg[key].cpu()
        b = allstd[key].cpu()
        c = allavgless[key].cpu()
        d = allstdless[key].cpu()
        # np.vectorize calls f once per element (a Python-level loop).
        allseps.update({key : np.vectorize(f)(a,b,c,d)})

In [23]:
# GPU memory report. Descriptive names are used so this cell no longer rebinds
# `f` (the separation function) and `a` defined by the previous cell.
total_memory = torch.cuda.get_device_properties(0).total_memory
reserved_memory = torch.cuda.memory_reserved(0)
allocated_memory = torch.cuda.memory_allocated(0)
free_in_reserved = reserved_memory - allocated_memory  # free inside reserved
# Values are printed in units of 10**6 bytes: total, reserved, allocated, free.
print(total_memory/1000000,reserved_memory/1000000,allocated_memory/1000000,free_in_reserved/1000000)

25402.343424 20761.8048 20226.355712 535.449088


In [24]:
# Display the separation arrays of every hook.
allseps

{'hook_embed': array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.13592719, 0.13595435, 0.13549715, ..., 0.1357567 , 0.13596338,
         0.1355161 ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.02653119, 0.0480299 , 0.07629033, ..., 0.06152083, 0.10326612,
         0.06755877],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 'blocks.0.hook_resid_pre': array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.13592719, 0.13595435, 0.13549715, ..., 0.1357567 , 0.13596338,
         0.1355161 ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.   

In [25]:
# Hooks for which at least one element reaches a separation of 0.9.
l_keys = [key for key in allseps if abs(np.max(allseps[key])) >= 0.9]
for key in l_keys:
    print(key)

blocks.0.attn.hook_pattern
blocks.1.attn.hook_pattern
blocks.1.mlp.hook_post
blocks.2.attn.hook_pattern
blocks.2.mlp.hook_post
blocks.2.hook_mlp_out
blocks.2.hook_resid_post
blocks.3.hook_resid_pre
blocks.3.attn.hook_pattern
blocks.3.mlp.hook_post
blocks.4.attn.hook_pattern
blocks.4.mlp.hook_post
blocks.5.attn.hook_pattern
blocks.5.mlp.hook_post


In [26]:
# Indices of the entries of block 5's MLP output activation whose separation exceeds 0.6.
np.where(allseps['blocks.5.mlp.hook_post'] > 0.6)

(array([13, 13, 18, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        22, 33, 36, 39, 39, 42, 48, 49, 53, 54, 60, 62, 68, 74, 77, 78, 78]),
 array([ 744, 1535,  598,  250,  381,  468,  598,  744,  835, 1070, 1230,
        1326, 1389, 1499, 1500, 1535, 1545,  598,  598,  228,  228, 1513,
         835,  250, 1230, 1230, 1230, 1153, 1230,  348, 1778,  520,   89,
        1442]))

In [27]:
def find_max(array, lim):
    """Return the indices of all entries whose absolute value exceeds ``lim``.

    Args:
        array: an n-dimensional array (or sequence of scalars).
        lim: threshold compared against the absolute value of each entry.

    Returns:
        List of index lists (one integer per dimension), in row-major order.
    """
    arr = np.asarray(array)
    if arr.dtype != object:
        # Vectorised path: argwhere lists the matching indices in row-major
        # order, the same order a nested loop over the dimensions would visit.
        return np.argwhere(np.abs(arr) > lim).tolist()

    # Object arrays (e.g. rows of different lengths) cannot be vectorised:
    # walk them recursively.
    hits = []
    for i, item in enumerate(array):
        if isinstance(item, np.ndarray):
            hits.extend([i] + rest for rest in find_max(item, lim))
        elif abs(item) > lim:
            hits.append([i])
    return hits

In [28]:
# Small worked example: indices of the entries with absolute value above 2.
a = np.array([[1, 3,8], [0, 4,6], [0, 0, 0]])
find_max(a, 2)

[[0, 1], [0, 2], [1, 1], [1, 2]]

In [29]:
indices = []
# Separation threshold: only (almost) perfectly separated entries are kept.
lim = 0.999
# For every hook, the indices of the entries whose separation exceeds lim.
dic = {key: find_max(allseps[key], lim) for key in allseps}

for x, found in dic.items():
    print(x, found)



hook_embed []
blocks.0.hook_resid_pre []
blocks.0.ln1.hook_scale []
blocks.0.ln1.hook_normalized []
blocks.0.attn.hook_q []
blocks.0.attn.hook_k []
blocks.0.attn.hook_v []
blocks.0.attn.hook_rot_q []
blocks.0.attn.hook_rot_k []
blocks.0.attn.hook_attn_scores []
blocks.0.attn.hook_pattern []
blocks.0.attn.hook_z []
blocks.0.hook_attn_out []
blocks.0.ln2.hook_scale []
blocks.0.ln2.hook_normalized []
blocks.0.mlp.hook_pre []
blocks.0.mlp.hook_post []
blocks.0.hook_mlp_out []
blocks.0.hook_resid_post []
blocks.1.hook_resid_pre []
blocks.1.ln1.hook_scale []
blocks.1.ln1.hook_normalized []
blocks.1.attn.hook_q []
blocks.1.attn.hook_k []
blocks.1.attn.hook_v []
blocks.1.attn.hook_rot_q []
blocks.1.attn.hook_rot_k []
blocks.1.attn.hook_attn_scores []
blocks.1.attn.hook_pattern []
blocks.1.attn.hook_z []
blocks.1.hook_attn_out []
blocks.1.ln2.hook_scale []
blocks.1.ln2.hook_normalized []
blocks.1.mlp.hook_pre []
blocks.1.mlp.hook_post [[21, 406]]
blocks.1.hook_mlp_out []
blocks.1.hook_resid_pos

In [30]:
allparams = lambda name: name in l_keys + ['ln_final.hook_normalized']
torch.cuda.empty_cache()
Last_hidden = [0]
 
def init(tensor, hook):
    """Hook callback: capture the final normalized state, replace anything else with ones.

    Args:
        tensor: Activation handed to the hook.
        hook: Hook object; only its ``name`` attribute is read.

    Returns:
        ``None`` when ``hook.name`` is ``'ln_final.hook_normalized'`` (the
        tensor is stored in ``Last_hidden[0]`` instead); otherwise a tensor
        of ones with the same shape as ``tensor``.

    NOTE(review): presumably registered as a TransformerLens forward hook,
    where a returned tensor replaces the activation and ``None`` leaves it
    unchanged -- confirm at the call site.
    """
    if hook.name == 'ln_final.hook_normalized':
        Last_hidden[0] = tensor
        return None
    # ones_like inherits dtype and device from the activation, so nothing is
    # first allocated on the host and the callback is not tied to CUDA.
    return torch.ones_like(tensor)
# Element 0 of the first item in triggerloader's dataset.
# NOTE(review): presumably the input half of an (input, label) pair -- confirm.
trigger = triggerloader.dataset[0][0]

In [31]:
# Largest entry of the 'blocks.0.hook_resid_pre' difference tensor.
diff_avg['blocks.0.hook_resid_pre'].max()

tensor(0.0203, device='cuda:0', grad_fn=<MaxBackward1>)

In [32]:
# Display the 'blocks.0.hook_resid_pre' entry of diff_avg as-is.
diff_avg['blocks.0.hook_resid_pre']

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0124, -0.0134, -0.0057,  ..., -0.0085, -0.0138, -0.0059],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0017,  0.0035,  0.0051,  ..., -0.0010,  0.0085, -0.0037],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0', grad_fn=<SubBackward0>)

**Réunion à 3 du 22/01/2022:**

In [33]:
# Probe sentence made of three chained clauses; the printed token ids show
# five tokens per clause.
sent = "val 1 = a ,val a = b ,not b = z ,"
batch = tokenizer(sent, return_tensors='pt')['input_ids'].cuda()
print("Phrase tokenizée: ", batch)
# 3:-1:5 keeps positions 3, 8, 13, ... i.e. one position per clause.
print("Résultat du modèle: ", model(batch)[:,3:-1:5,:])

# NOTE(review): L_hidden_state is presumably refreshed by the model(batch)
# call above -- confirm where it is populated.
# Only basic slicing is used, so a, b and c remain views into L_hidden_state[0].
clause_states = L_hidden_state[0][:, 3:-1:5, :]
a = clause_states[:, 0, :]
b = clause_states[:, 1, :]
c = clause_states[:, 2, :]

# Hidden dimensions on which the first two clause states nearly agree
# (|a - b| < 0.1). Done as a single tensor op rather than one .item() call
# per dimension; compared in float64 so the 0.1 threshold is not rounded to
# the activation dtype.
l = torch.nonzero((a - b)[0].abs().double() < 0.1).flatten().tolist()

print(a[:, l])
print(b[:, l])
print(c[:, l])

print("Indices où a et b sont similaires: ", l)

# Copy of a's selected dimensions with every other dimension left at zero;
# the width follows a's last dimension instead of a hard-coded 512.
goal = torch.zeros((1, a.shape[-1]))
for i in l:
    goal[0, i] = a[0, i]

get_one = l

Phrase tokenizée:  tensor([[2100,  352,  796,  257,  837, 2100,  257,  796,  275,  837, 1662,  275,
          796, 1976,  837]], device='cuda:0')
Résultat du modèle:  tensor([[[ 11.1948],
         [ 12.7999],
         [-12.2307]]], device='cuda:0', grad_fn=<SliceBackward0>)
tensor([[ 0.7229, -0.4730, -0.7601, -0.9353, -0.5202, -0.5609, -1.0830,  0.8521,
          1.1314,  0.1719,  0.1607,  0.9302, -0.2400,  0.3054, -0.6723,  0.7495,
         -0.6256,  0.7973, -0.0925,  0.4543, -0.9034,  1.0295,  0.6099,  0.6903,
         -0.2396, -0.1926, -1.2916, -0.9789, -1.5474,  1.0471,  1.0849, -0.9064,
          1.1766, -0.9300,  1.1666,  0.7152, -1.4538, -1.3890, -0.6356, -1.3375,
          1.3130, -1.2121,  0.9700, -1.1978,  0.7617,  0.6875, -0.7940,  0.1625,
         -0.7556,  1.1423,  0.6163, -0.6031, -0.6775, -1.0399, -1.0345,  1.6899,
          1.1640, -1.3183,  0.7084, -1.6618, -0.9770]], device='cuda:0',
       grad_fn=<IndexBackward0>)
tensor([[ 0.6528, -0.4524, -0.7788, -0.9987, -0.5086

In [34]:
# List every key in allact together with the shape of its first stored tensor.
for key, acts in allact.items():
    print(key)
    print(acts[0].shape, '\n')

hook_embed
torch.Size([1, 80, 512]) 

blocks.0.hook_resid_pre
torch.Size([1, 80, 512]) 

blocks.0.ln1.hook_scale
torch.Size([1, 80, 1]) 

blocks.0.ln1.hook_normalized
torch.Size([1, 80, 512]) 

blocks.0.attn.hook_q
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_k
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_v
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_rot_q
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_rot_k
torch.Size([1, 80, 8, 64]) 

blocks.0.attn.hook_attn_scores
torch.Size([1, 8, 80, 80]) 

blocks.0.attn.hook_pattern
torch.Size([1, 8, 80, 80]) 

blocks.0.attn.hook_z
torch.Size([1, 80, 8, 64]) 

blocks.0.hook_attn_out
torch.Size([1, 80, 512]) 

blocks.0.ln2.hook_scale
torch.Size([1, 80, 1]) 

blocks.0.ln2.hook_normalized
torch.Size([1, 80, 512]) 

blocks.0.mlp.hook_pre
torch.Size([1, 80, 2048]) 

blocks.0.mlp.hook_post
torch.Size([1, 80, 2048]) 

blocks.0.hook_mlp_out
torch.Size([1, 80, 512]) 

blocks.0.hook_resid_post
torch.Size([1, 80, 512]) 

blocks.1.hook_resid_pre
t

In [35]:
# Recompute the block-5 MLP output by hand: hook_post @ W_out + b_out.
(
    allavg['blocks.5.mlp.hook_post'] @ model.base.state_dict()['blocks.5.mlp.W_out']
    + model.base.state_dict()['blocks.5.mlp.b_out']
)

tensor([[ 0.7002,  0.6848,  0.6856,  ...,  0.1084, -1.1726,  0.8730],
        [ 0.6696,  0.1189, -0.6007,  ..., -0.0948, -1.9952,  0.9141],
        [ 0.3492, -0.4768, -0.9590,  ...,  0.3534, -1.6693,  0.9538],
        ...,
        [-0.4296, -0.2844,  0.6095,  ...,  0.7334,  0.6547,  0.2650],
        [-0.6294, -0.1211,  0.8028,  ...,  0.6506,  0.6729,  0.1575],
        [-0.4074, -0.1799,  0.8102,  ...,  0.6135,  0.6584,  0.1792]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [36]:
# Display the stored 'blocks.5.hook_mlp_out' entry of allavg.
allavg['blocks.5.hook_mlp_out']

tensor([[ 0.7002,  0.6848,  0.6856,  ...,  0.1084, -1.1726,  0.8730],
        [ 0.6696,  0.1189, -0.6007,  ..., -0.0948, -1.9952,  0.9141],
        [ 0.3492, -0.4768, -0.9590,  ...,  0.3534, -1.6693,  0.9538],
        ...,
        [-0.4296, -0.2844,  0.6095,  ...,  0.7334,  0.6547,  0.2650],
        [-0.6294, -0.1211,  0.8028,  ...,  0.6506,  0.6729,  0.1575],
        [-0.4074, -0.1799,  0.8102,  ...,  0.6135,  0.6584,  0.1792]],
       device='cuda:0', grad_fn=<MeanBackward1>)

In [37]:
lim = 2

# Index paths in the block-5 post-MLP difference whose magnitude exceeds lim;
# the tensor is detached and moved to host memory before the NumPy conversion.
find_max(diff_avg['blocks.5.mlp.hook_post'].detach().cpu().numpy(), lim)

[[1, 744]]

In [38]:
# Same search on the 'blocks.5.mlp.hook_post' entry of allseps, threshold 0.9.
lim=0.9

find_max(allseps['blocks.5.mlp.hook_post'], lim)

[[21, 1230]]

In [39]:
# Look up entry [1, 744], the index reported by find_max on diff_avg with lim=2.
diff_avg['blocks.5.mlp.hook_post'][1, 744]

tensor(2.7819, device='cuda:0', grad_fn=<SelectBackward0>)

In [40]:
# Shape of the block-5 post-MLP difference, shown as a plain tuple.
tuple(diff_avg['blocks.5.mlp.hook_post'].shape)

(80, 2048)

In [41]:
# Shape of the 'blocks.5.mlp.hook_post' entry of allseps.
allseps['blocks.5.mlp.hook_post'].shape

(80, 2048)

In [42]:
# Look up in diff_avg the entry [21, 1230] that find_max reported on allseps with lim=0.9.
diff_avg['blocks.5.mlp.hook_post'][21, 1230]

tensor(-0.0003, device='cuda:0', grad_fn=<SelectBackward0>)

In [43]:
# Print the shape of the allseps entry next to the shape of the block-5 MLP W_out weight.
print(allseps['blocks.5.mlp.hook_post'].shape)
print(model.base.state_dict()['blocks.5.mlp.W_out'].shape)

(80, 2048)
torch.Size([2048, 512])


In [44]:
# Display the index list stored in get_one.
get_one

[30,
 33,
 40,
 41,
 47,
 59,
 69,
 72,
 75,
 80,
 89,
 94,
 95,
 118,
 121,
 123,
 137,
 140,
 148,
 149,
 152,
 175,
 177,
 178,
 183,
 189,
 205,
 224,
 230,
 253,
 263,
 279,
 295,
 303,
 310,
 316,
 317,
 320,
 351,
 352,
 355,
 362,
 373,
 381,
 392,
 393,
 397,
 401,
 409,
 419,
 421,
 426,
 436,
 450,
 471,
 477,
 484,
 491,
 494,
 500,
 507]

In [45]:
# Keep a handle on the 'blocks.5.mlp.W_out' tensor from model.base's state_dict.
W = model.base.state_dict()['blocks.5.mlp.W_out']

In [46]:
# Shape of W.
W.shape

torch.Size([2048, 512])

In [47]:
# Overwrite row 1230 of the block-5 MLP output weight at every column listed
# in get_one. Each entry is set to -2000 when a[0, i] and the averaged
# activation at [21, 1230] have the same sign, and to +2000 otherwise.
#
# WARNING: tensors returned by state_dict() share storage with the live
# parameters, so this edits the model weights in place and is not undone by
# re-running earlier cells.
#
# Both lookups are loop-invariant, so they are fetched once instead of
# rebuilding the state dict on every iteration.
w_out = model.base.state_dict()['blocks.5.mlp.W_out']
neuron_act = allavg['blocks.5.mlp.hook_post'][21, 1230]
for i in get_one:
    w_out[1230, i] = -2000 if a[0, i] * neuron_act > 0 else 2000

In [48]:
# Tokenize the probe sentence, then print the token ids followed by the
# model output at positions 3, 8, 13, ... (slice 3:-1:5).
sent = "val 1 = a ,val a = b ,not b = z ,"
encoded = tokenizer(sent, return_tensors='pt')
batch = encoded['input_ids'].cuda()
print("Phrase tokenizée: ", batch)
clause_outputs = model(batch)[:, 3:-1:5, :]
print("Résultat du modèle: ", clause_outputs)

Phrase tokenizée:  tensor([[2100,  352,  796,  257,  837, 2100,  257,  796,  275,  837, 1662,  275,
          796, 1976,  837]], device='cuda:0')
Résultat du modèle:  tensor([[[ -0.5668],
         [ 12.7999],
         [-12.2307]]], device='cuda:0', grad_fn=<SliceBackward0>)


In [49]:
# On the dimensions listed in get_one, overwrite the third selected position
# of L_hidden_state[0] with the values held in a.
# The chain of basic slices yields a view, so the writes land in L_hidden_state[0].
third_clause = L_hidden_state[0][:, 3:-1:5, :][:, 2, :]
for dim in get_one:
    third_clause[0, dim] = a[0, dim]

In [50]:
# Apply model.classifier to L_hidden_state[0] and keep positions 3, 8, 13, ... (slice 3:-1:5).
model.classifier(L_hidden_state[0])[:, 3:-1:5,:]

tensor([[[-0.5668],
         [12.7999],
         [-9.4381]]], device='cuda:0', grad_fn=<SliceBackward0>)