In [89]:
from core.semcor_bert_pipeline import *
import pandas as pd
from core.metrics import cosine_sim, centroid

In [37]:
df = pd.read_csv('../data/rabagliati_2013_stimuli.csv', sep = '\t')
df = df[~df['target'].isin(['moose', 'mousse'])] #BERT tokenization issues
model = initialize_masking_lm()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [38]:
df['pos'] = ['n'] * len(df.index)
df['word'] = df['target']
word_pos_pairs = df[['word', 'pos']].drop_duplicates()
word_pos_pairs.to_csv('../data/r_13_for_semcor.csv')

In [39]:
mask_results = []
for i in range(len(df.index)):
    row = df.iloc[i]
    s1, s2, target_token = row['s1'], row['s2'], row['target']
    indexed_tokens, tokenized_text = preprocess(s1, target_token, s2 = s2, masking = True)
    embed, preds = mask_predictions(indexed_tokens, model, tokenizer)
    mask_results.append({'mask_embed': embed, 'predicted_tokens': preds[1], 'predicted_probs': preds[0]})

In [226]:
word_pos_pairs

Unnamed: 0,word,pos
0,chicken,n
4,fish,n
8,lamb,n
12,herbs,n
16,turkey,n
20,tuna,n
24,duck,n
28,shrimp,n
32,glasses,n
36,letter,n


In [231]:
both_sense_types = []
one_sense_types = []
available_sense = []
missing_words = ['lamb', 'herbs', 'turkey', 'tuna', 'shrimp', 'cards']
for w in word_pos_pairs.iterrows():
    w = w[1]
    word, pos = w['word'], w['pos']
    try:
        type_data = load_data(word, pos, 'semcor')
        word_senses = df[df['target'] == word]['wn_sense'].unique()
        contains_both_senses = lambda l1, l2: all([s in l2 for s in l1])
        if contains_both_senses(word_senses, type_data['sense_labels']):
                both_sense_types.append(word)
        common_senses = set(word_senses).intersection(set(type_data['sense_labels']))
        if len(common_senses) == 1 and word not in ['son', 'sun', 'night', 'knight']:
            one_sense_types.append(word)
            available_sense.append(common_senses.pop())
            
    except:
        print(word)

lamb
herbs
turkey
tuna
shrimp


In [230]:
load_data('turkey', 'n', 'semcor')

FileNotFoundError: [Errno 2] No such file or directory: '../data/pipeline_results/semcor/turkey_n.json'

In [206]:
len(both_sense_types)

12

In [207]:
len(one_sense_types)

7

This looks like a little less than half the types. Let's work on these first, then get to the edge cases (1 sense, glass.n, homophones)

In [185]:
def nearest_neighbor(query, s1, s2, sense_names):
    if num_senses == 2:
        s1_sims = [cosine_sim(query, e) for e in s1]
        s2_sims = [cosine_sim(query, e) for e in s2]
        if max(s1_sims) > max(s2_sims):
            return sense_names[0]
        if max(s1_sims) < max(s2_sims):
            return sense_names[1]
def centroid_pred(query, c1, c2, sense_names):
    c1_sim = cosine_sim(query, c1)
    c2_sim = cosine_sim(query, c2)
    if c1_sim > c2_sim:
        return sense_names[0]
    if c1_sim < c2_sim:
        return sense_names[1]

In [212]:
def one_sense_preds(query_embeds, sense_embeds, data_sense, other_sense, method):
    #method is either "nn" or "centroid"
    if method == "nn":
        cos_sims = [max([cosine_sim(q, s) for s in sense_embeds]) for q in query_embeds]
    if method == 'centroid':
        c = centroid(sense_embeds)
        cos_sims = [cosine_sim(q, c) for q in query_embeds]
    sense_preds = ["" for i in range(4)]
    sorted_indices = np.argsort(cos_sims)
    least_sim, most_sim = sorted_indices[:2], sorted_indices[2:]
    for i in range(len(sense_preds)):
        if i in least_sim:
            sense_preds[i] = other_sense
        if i in most_sim:
            sense_preds[i] = data_sense
    return sense_preds


In [108]:
reg_cases = df[df['target'].isin(both_sense_types[:-4])]
e_preds = []
c_preds = []
for w in reg_cases['target'].unique():
    indices = df[df['target'] == w].index
    senses = df[df['target'] == w]['wn_sense'].unique()
    mask_embeddings = [mask_results[i]['mask_embed'] for i in indices]
    token_data = load_data(w, 'n', 'semcor')
    semcor_embeddings = np.array(token_data['embeddings'])
    num_embeddings = len(semcor_embeddings)
    s1_embeds = semcor_embeddings[[i for i in range(num_embeddings) if token_data['sense_labels'][i] == senses[0]]]
    s2_embeds = semcor_embeddings[[i for i in range(num_embeddings) if token_data['sense_labels'][i] == senses[1]]]
    s1_centroid, s2_centroid = centroid(s1_embeds), centroid(s2_embeds)
    e_preds += [nearest_neighbor(e, s1_embeds, s2_embeds, senses) for e in mask_embeddings]
    c_preds += [centroid_pred(e, s1_centroid, s2_centroid, senses) for e in mask_embeddings]
    

In [120]:
df[df['target'] == 'son'].index

Int64Index([85, 87], dtype='int64')

In [125]:
homophones = [('son', 'sun'), ('night', 'knight')]
for p in homophones:
    t1, t2 = load_data(p[0], 'n', 'semcor'), load_data(p[1], 'n', 'semcor')
    s1, s2 = df[df['target'] == p[0]]['wn_sense'].values[0], df[df['target'] == p[1]]['wn_sense'].values[0]
    mask_indices = list(df[df['target'] == p[0]].index) + list(df[df['target'] == p[1]].index)
    mask_embeddings = [mask_results[i]['mask_embed'] for i in mask_indices]
    s1_embeds = np.array(t1['embeddings'])[[i for i in range(len(t1['embeddings'])) if t1['sense_labels'][i] \
                                            == s1]]
    s2_embeds = np.array(t2['embeddings'])[[i for i in range(len(t2['embeddings'])) if t2['sense_labels'][i] \
                                            == s2]]
    e_preds += [nearest_neighbor(e, s1_embeds, s2_embeds, [s1, s2]) for e in mask_embeddings]
    s1_centroid, s2_centroid = centroid(s1_embeds), centroid(s2_embeds)
    c_preds += [centroid_pred(e, s1_centroid, s2_centroid, [s1, s2]) for e in mask_embeddings]
    



In [162]:
two_senses = df[df['target'].isin(both_sense_types)]
two_senses['nn_preds'] = e_preds
two_senses['centroid_preds'] = c_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [215]:
one_sense = df[df['target'].isin(one_sense_types)]
one_sense_nn = []
one_sense_centroid = []

for w, s in zip(one_sense_types, available_sense):
    indices = df[df['target'] == w].index
    senses = df[df['target'] == w]['wn_sense'].unique()
    mask_embeddings = [mask_results[i]['mask_embed'] for i in indices]
    token_data = load_data(w, 'n', 'semcor')
    semcor_embeddings = np.array(token_data['embeddings'])
    num_embeddings = len(semcor_embeddings)
    sense_embeds = semcor_embeddings[[i for i in range(num_embeddings) if token_data['sense_labels'][i] == s]]
    other = [sen for sen in senses if sen != s]
    one_sense_nn += one_sense_preds(mask_embeddings, semcor_embeddings, s, other[0], 'nn')
    one_sense_centroid += one_sense_preds(mask_embeddings, semcor_embeddings, s, other[0], 'nn')
    
one_sense['nn_preds'] = one_sense_nn
one_sense['centroid_preds'] = one_sense_centroid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [224]:
all_data = pd.concat([one_sense, two_senses], axis = 0)
all_data['target'].unique()

array(['duck', 'glasses', 'mouse', 'bow', 'roll', 'bat', 'bark',
       'chicken', 'fish', 'letter', 'button', 'line', 'nail', 'pitcher',
       'band', 'sun', 'son', 'knight', 'night'], dtype=object)

Missing types: lamb, herbs, turkey, tuna, shrimp, card

In [163]:
def accuracy_by_rel_type(df, rel_type, pred_type):
    df = df[df['rel_type'] == rel_type]
    return sum(df[pred_type] == df['wn_sense']) / len(df.index)

In [233]:
sum(all_data['nn_preds'] == all_data['wn_sense']) / len(all_data.index)

0.6323529411764706

In [235]:
accuracy_by_rel_type(all_data, 'reg_pol', 'centroid_preds')

0.4166666666666667

In [236]:
accuracy_by_rel_type(all_data, 'irreg_pol', 'centroid_preds')

0.7142857142857143

In [237]:
accuracy_by_rel_type(all_data, 'hom', 'centroid_preds')

0.5357142857142857