## Replicating layer-wise and family-wise analysis on an English dataset.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # makes figs nicer!

import functools
import itertools
import os
import torch
import transformers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from scipy.spatial.distance import cosine
from tqdm.notebook import tqdm
from transformers import AutoTokenizer


# Set seaborn's global plotting defaults: white grid background, fonts scaled by 1.2
sns.set(style='whitegrid',font_scale=1.2)

### define useful custom functions

In [2]:
### Define useful custom functions to ...

### ... find the target tokens within tokenized sequence
def find_sublist_index(mylist, sublist):
    """Find the first occurrence of `sublist` as a contiguous run in `mylist`.

    Args:
        mylist: the list to search in (e.g. the token ids of a sentence).
        sublist: the list to search for (e.g. the token ids of a target word).

    Returns:
        A `(start, end)` tuple such that `mylist[start:end] == sublist`,
        or `None` when `sublist` is empty or does not occur in `mylist`.
    """

    n = len(sublist)

    # An empty sublist has no meaningful position; report "not found"
    # instead of failing with an IndexError on `sublist[0]`.
    if n == 0:
        return None

    # Only start positions that leave room for the whole sublist can match.
    for start in range(len(mylist) - n + 1):
        # Cheap first-element check before building the slice.
        if mylist[start] == sublist[0] and mylist[start:start + n] == sublist:
            return start, start + n
    return None

### ... grab the embeddings for your target tokens
# Results are memoised on the full argument tuple, so every argument must be
# hashable. The cache is unbounded and keeps a reference to each `model` it has
# seen; call `get_embedding.cache_clear()` once you are done with a model.
@functools.lru_cache(maxsize=None)
def get_embedding(model, tokenizer, sentence, target, layer, device):
    """Get a token embedding for target in sentence.

    Args:
        model: a model whose output exposes `hidden_states` (in this notebook
            it is loaded with `output_hidden_states=True`).
        tokenizer: the tokenizer matching `model`.
        sentence: the sentence containing the target word.
        target: the target string, written so that its stand-alone tokenization
            matches its tokenization inside `sentence`.
        layer: index into `output.hidden_states`.
        device: torch device on which the model inputs are placed.

    Returns:
        The embedding of `target` at `layer`; when the target spans several
        tokens, the mean of their embeddings.

    Raises:
        ValueError: if the target's token ids cannot be located in the
            tokenized sentence.
    """

    # Tokenize sentence
    inputs = tokenizer(sentence, return_tensors="pt").to(device)

    # Tokenize target (no special tokens, so the ids can be matched inside
    # the sentence); a plain list is enough, no tensor/device round-trip needed
    target_ids = tokenizer.encode(target, add_special_tokens=False)

    # Get indices of target in input tokens
    sentence_ids = inputs["input_ids"][0].tolist()
    target_inds = find_sublist_index(sentence_ids, target_ids) if target_ids else None

    # Fail early, before the forward pass, with a message that says which
    # stimulus is the problem
    if target_inds is None:
        raise ValueError(
            f"Could not locate the tokens of target {target!r} "
            f"in the tokenized sentence {sentence!r}"
        )

    # Run model
    with torch.no_grad():
        output = model(**inputs)
        hidden_states = output.hidden_states

    # Get layer (first and only item of the batch)
    selected_layer = hidden_states[layer][0]

    # grab just the embeddings for your target word's token(s)
    token_embeddings = selected_layer[target_inds[0]:target_inds[1]]

    # if a word is represented by >1 tokens, take mean
    # across the multiple tokens' embeddings
    embedding = torch.mean(token_embeddings, dim=0)

    return embedding

### ... grab the number of trainable parameters in the model

def count_parameters(model):
    """Count, print and return the number of trainable parameters of `model`.

    A parameter counts as trainable when its `requires_grad` flag is set;
    its size is taken from `numel()`.

    credit: https://stackoverflow.com/questions/49201236/check-the-total-number-of-parameters-in-a-pytorch-model
    """

    # sizes of the trainable parameters only; frozen ones are left out
    trainable_sizes = [
        tensor.numel()
        for _, tensor in model.named_parameters()
        if tensor.requires_grad
    ]

    total_params = sum(trainable_sizes)
    print(f"Total Trainable Params: {total_params}")

    return total_params
    

### load the dataframe of sentence pairs

In [3]:
# Read the stimuli table: one row per sentence pair
stimpath = "../data/extra/rawc_stimuli.csv"
df = pd.read_csv(stimpath)

len(df)  # number of sentence pairs

672

In [4]:
df.head(2)  # preview the first two sentence pairs and their columns

Unnamed: 0,word,sentence1,sentence2,same,ambiguity_type,Class,mean_relatedness,string
0,act,It was a desperate act.,It was a magic act.,False,Polysemy,N,2.181818,act
1,act,It was a desperate act.,It was a comedic act.,False,Polysemy,N,2.0,act


### load your models and tokenizers

In [10]:
### Hugging Face model identifiers of the checkpoints to download and
#.  evaluate, in the order they will be processed

MODELS = [
    "bert-base-uncased",
    "bert-base-cased",
    "FacebookAI/xlm-roberta-base",
    "albert/albert-base-v1",
    "albert/albert-base-v2",
    ### Tiny?  (open question left by the author; presumably whether to add a smaller variant)
    "albert/albert-large-v2",
    "albert/albert-xlarge-v2",
    "albert/albert-xxlarge-v2",
    "FacebookAI/roberta-base",
    "FacebookAI/roberta-large",
    "distilbert/distilbert-base-uncased",
    "google-bert/bert-base-multilingual-cased",
]



### compute cosine distances

Compute the cosine distance between the two contextual embeddings of each target word (one per sentence in the pair), at every model layer, for every model specified in the `MODELS` list.

In [11]:
### Iterate over models and do the work!

### Decide which device you want the models to run in
#.  (does not depend on the model, so decide once)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

### Where the per-model results are written; `makedirs` also creates
#.  missing parent directories and is a no-op if the folder exists
savepath = "../data/processed/models_english/"
os.makedirs(savepath, exist_ok=True)

for mpath in tqdm(MODELS,colour="cornflowerblue"):

    ### Load your model & tokenizer

    model = transformers.AutoModel.from_pretrained(mpath,output_hidden_states=True)
    model.to(device) # allocate model to desired device

    tokenizer = transformers.AutoTokenizer.from_pretrained(mpath)


    ### Get the number of layers & params directly from the model specifications

    # TODO: Double-check for all configurations

    n_layers = model.config.num_hidden_layers
    print("number of layers:", n_layers)

    n_params = count_parameters(model)

    # note: account for tokenization differences in the RoBERTa checkpoints by
    #.      adding a whitespace in front of the target word (otherwise, the function
    #.      `find_sublist_index` will not be able to identify the target token-s within
    #.      the tokenized sentence)
    prepend_space = mpath in ["FacebookAI/roberta-large", "FacebookAI/roberta-base"]

    results = []

    # `n_layers+1` because `range` is non-inclusive for the last value of the
    # interval, so the layer index runs from 0 up to and including `n_layers`
    for layer in range(n_layers+1):
        for (_, row) in tqdm(df.iterrows(), total=df.shape[0]):

            ### Get embeddings for S1 and S2

            if prepend_space:
                target = " {w}".format(w = row['string'])
            else:
                target = row['string']

            s1 = get_embedding(model, tokenizer, row['sentence1'], target,layer, device)
            s2 = get_embedding(model, tokenizer, row['sentence2'], target,layer, device)

            ### Now calculate cosine distance
            #.  note, tensors need to be on the cpu to make this run;
            #.  `.cpu()` leaves tensors that already live on the cpu untouched,
            #.  so no device-specific branch is needed
            model_cosine = cosine(s1.cpu(), s2.cpu())

            ### Figure out how many tokens you're
            ### comparing across sentences
            n_tokens_s1 = len(tokenizer.encode(row['sentence1']))
            n_tokens_s2 = len(tokenizer.encode(row['sentence2']))

            ### Add to results dictionary
            results.append({
                'sentence1': row['sentence1'],
                'sentence2': row['sentence2'],
                'word': row['word'],
                'string': row['string'],
                'Same_sense': row['same'],
                'Distance': model_cosine,
                'Layer': layer,
                'S1_ntokens': n_tokens_s1,
                'S2_ntokens': n_tokens_s2
            })

    df_results = pd.DataFrame(results)
    df_results['token_diffs'] = np.abs(df_results['S1_ntokens'].values-df_results['S2_ntokens'].values)
    df_results['n_params'] = n_params  # same value on every row


    ### Hurray! Save your cosine distance results to load into R
    #.  for analysis

    # keep only the model name, dropping the organisation prefix if there is one
    filename = "rawc-distances_model-" + mpath.split("/")[-1] + ".csv"

    df_results.to_csv(os.path.join(savepath,filename), index=False)

    ### The unbounded cache on `get_embedding` holds a reference to this model
    #.  and to every embedding computed with it; none of those entries can be
    #.  reused for the next model, so drop them to free memory
    get_embedding.cache_clear()




  0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

number of layers: 12
Total Trainable Params: 177853440


  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

  0%|          | 0/672 [00:00<?, ?it/s]

In [7]:
mpath  # inspect the model identifier most recently assigned by the loop over MODELS

'FacebookAI/roberta-base'