In [121]:
from core.semcor_bert_pipeline import *
import pandas as pd
from transformers import BertTokenizer

In [122]:
from core.metrics import cosine_sim

<Figure size 432x288 with 0 Axes>

In [36]:
df = pd.read_csv("../data/expt_1_stimuli.csv")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['dt_stem'] = df['disambig_token'].apply(lambda t: tokenizer.tokenize(t)[0])
a = df[(df['d_s'] == 'd') & (df['early_late'] == 'early')]
b = df[(df['d_s'] == 's') & (df['early_late'] == 'early')]
c = df[(df['d_s'] == 'd') & (df['early_late'] == 'late')]
d = df[(df['d_s'] == 's') & (df['early_late'] == 'early')]

In [83]:
def sentence_pair_comparison(c1, c2):
    #c1 has no "padding," c2 does
    model = initialize_model()
    early_data = []
    late_data = []
    for bias in [0, 1]:
        for stimulus_index in np.arange(1, 9):
            early_row = c1[(c1['bias'] == bias) & (c1['stimulus_index'] == stimulus_index)].T.squeeze()
            late_row = c2[(c2['bias'] == bias) & (c2['stimulus_index'] == stimulus_index)].T.squeeze()
            early_sent, early_d_tok = early_row['sentence'], early_row['dt_stem']
            late_sent, late_d_tok = late_row['sentence'], late_row['dt_stem']
            assert early_row['target_token'] == late_row['target_token']
            target = early_row['target_token']
            early_data.append(target_attns(early_sent, target, early_d_tok, model))
            late_data.append(target_attns(late_sent, target, late_d_tok, model))
    return early_data, late_data

In [87]:
a_data, c_data = sentence_pair_comparison(a, c)

In [104]:
first_item = lambda l: [i[0] for i in l]
second_item = lambda l: [i[1] for i in l]

In [112]:
a_attn, a_embeds = first_item(a_data), second_item(a_data)
c_attn, c_embeds = first_item(c_data), second_item(c_data)
top_layers(b_attn, d_attn)

[{0, 1, 2, 3, 4, 5, 7},
 {2, 3, 4, 5, 7},
 {0, 2, 6},
 {0, 2, 5, 7},
 {0, 2, 5, 7},
 {2},
 {2, 4, 5},
 {0, 2, 5, 8},
 {2, 4, 5},
 {2, 5},
 {2, 5},
 {0, 2, 4, 5, 6, 7},
 {2, 7},
 {2, 4, 5, 7},
 {0, 2, 3, 4, 5, 7, 8},
 {1, 2, 4, 5, 7}]

In [114]:
b_data, d_data = sentence_pair_comparison(b, d)
b_attn, b_embeds = first_item(b_data), second_item(b_data)
d_attn, d_embeds = first_item(b_data), second_item(b_data)
top_layers(b_attn, d_attn)

[{0, 1, 2, 3, 4, 5, 7},
 {2, 3, 4, 5, 7},
 {0, 2, 6},
 {0, 2, 5, 7},
 {0, 2, 5, 7},
 {2},
 {2, 4, 5},
 {0, 2, 5, 8},
 {2, 4, 5},
 {2, 5},
 {2, 5},
 {0, 2, 4, 5, 6, 7},
 {2, 7},
 {2, 4, 5, 7},
 {0, 2, 3, 4, 5, 7, 8},
 {1, 2, 4, 5, 7}]

In [125]:
def avg_pairwise_cosine_sim(l1, l2):
    #l1 and l2 are lists of embeddings
    assert len(l1) == len(l2)
    n = len(l1)
    return np.mean([cosine_sim(l1[i], l2[i]) for i in range(n)])

The pairs have the same sense but have differing amounts of context between the target token and disambiguating token.

In [126]:
avg_pairwise_cosine_sim(b_embeds, d_embeds)

1.0

In [127]:
avg_pairwise_cosine_sim(a_embeds, c_embeds)

0.96191275

Different senses, same distance to disambiguating token

In [133]:
avg_pairwise_cosine_sim(a_embeds, b_embeds)

0.7927524

In [132]:
avg_pairwise_cosine_sim(c_embeds, d_embeds)

0.7907566

Different senses, different distance to disambiguating token

In [134]:
avg_pairwise_cosine_sim(a_embeds, d_embeds)

0.7927524

In [135]:
avg_pairwise_cosine_sim(c_embeds, b_embeds)

0.7907566

In [111]:
def top_layers(early_attn, late_attn):
    common_layers = []
    for early_dict, late_dict in zip(early_attn, late_attn):
        top_token_e = set()
        top_token_l = set()
        for layer in a_dict.keys():
            if np.argsort(-early_dict[layer]['attn_vector'])[0] == early_dict[layer]['hc_token_idx']:
                top_token_e.add(layer)
            if np.argsort(-late_dict[layer]['attn_vector'])[0] == late_dict[layer]['hc_token_idx']:
                top_token_l.add(layer)
        common_layers.append(top_token_e.intersection(top_token_l))
    return common_layers

In [80]:
def target_attns(sentence, target_token, disambig_token, model):
    """
    Inputs: sentence- sentence from experiment
    target_token- token we are getting BERT embeddings for
    disambig_token- token that reveals information about target_token's sense
    model- pretrained BERT model
    
    Output:
    List of ranks of disambig_token in the attention vector for target_token. 
    Indices in the list correspond to layers.
    """
    indexed_tokens, tokenized_text = preprocess(sentence, target_token)
    target_activations, attns = get_model_output(indexed_tokens, model)
    attn_dict = process_raw_attentions([attns], [tokenized_text])[0]
    #print(tokenized_text[0])
    d_tok_idx = tokenized_text[0][1:-1].index(disambig_token) #index of the disambiguating token, removing SEP/CLS
    output_dict = {}
    target_embeddings = sum_layers(target_activations, -4)
    for k in attn_dict.keys():
        output_dict[k] = {"attn_vector": attn_dict[k][1:-1], "hc_token_idx": d_tok_idx}
    return output_dict, target_embeddings
    #return np.array([np.argwhere(np.argsort(-attn_dict[k]) == d_tok_idx)[0][0] for k in attn_dict])
                                                                                                     