In [74]:
from core.semcor_bert_pipeline import *
import pandas as pd
from core.metrics import cosine_sim

<Figure size 432x288 with 0 Axes>

In [37]:
df = pd.read_csv('../data/rabagliati_2013_stimuli.csv', sep = '\t')
df = df[~df['target'].isin(['moose', 'mousse'])] #BERT tokenization issues
model = initialize_masking_lm()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [38]:
df['pos'] = ['n'] * len(df.index)
df['word'] = df['target']
word_pos_pairs = df[['word', 'pos']].drop_duplicates()
word_pos_pairs.to_csv('../data/r_13_for_semcor.csv')

In [39]:
mask_results = []
for i in range(len(df.index)):
    row = df.iloc[i]
    s1, s2, target_token = row['s1'], row['s2'], row['target']
    indexed_tokens, tokenized_text = preprocess(s1, target_token, s2 = s2, masking = True)
    embed, preds = mask_predictions(indexed_tokens, model, tokenizer)
    mask_results.append({'mask_embed': embed, 'predicted_tokens': preds[1], 'predicted_probs': preds[0]})

In [44]:
both_sense_types = []
for w in word_pos_pairs.iterrows():
    w = w[1]
    word, pos = w['word'], w['pos']
    try:
        type_data = load_data(word, pos, 'semcor')
        word_senses = df[df['target'] == word]['wn_sense'].unique()
        contains_both_senses = lambda l1, l2: all([s in l2 for s in l1])
        if contains_both_senses(word_senses, type_data['sense_labels']):
            both_sense_types.append(word)
    except:
        pass

In [47]:
len(both_sense_types)

12

In [53]:
both_sense_types

['chicken',
 'fish',
 'letter',
 'button',
 'line',
 'nail',
 'pitcher',
 'band',
 'sun',
 'son',
 'knight',
 'night']

This looks like a little less than half the types. Let's work on these first, then get to the edge cases (1 sense, glass.n, homophones)

In [76]:
def nearest_neighbor(query, s1, s2, sense_names):
    s1_sims = [cosine_sim(query, e) for e in s1]
    s2_sims = [cosine_sim(query, e) for e in s2]
    if max(s1_sims) > max(s2_sims):
        return sense_names[0]
    if max(s1_sims) < max(s2_sims):
        return sense_names[1]

In [78]:
cases = df[df['target'].isin(both_sense_types[:-4])]
e_preds = []
for w in reg_cases['target'].unique():
    indices = df[df['target'] == w].index
    senses = df[df['target'] == w]['wn_sense'].unique()
    mask_embeddings = [mask_results[i]['mask_embed'] for i in indices]
    token_data = load_data(w, 'n', 'semcor')
    semcor_embeddings = np.array(token_data['embeddings'])
    num_embeddings = len(semcor_embeddings)
    s1_embeds = semcor_embeddings[[i for i in range(num_embeddings) if token_data['sense_labels'][i] == senses[0]]]
    s2_embeds = semcor_embeddings[[i for i in range(num_embeddings) if token_data['sense_labels'][i] == senses[1]]]
    e_preds += [nearest_neighbor(e, s1_embeds, s2_embeds, senses) for e in mask_embeddings]
        
    

In [84]:
reg_cases['nn_preds'] = e_preds

def accuracy_by_rel_type(df, rel_type):
    df = df[df['rel_type'] == rel_type]
    return sum(df['nn_preds'] == df['wn_sense']) / len(df.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
accuracy_by_rel_type(reg_cases, 'reg_pol')

0.5

In [87]:
accuracy_by_rel_type(reg_cases, 'irreg_pol')

0.5833333333333334

In [88]:
accuracy_by_rel_type(reg_cases, 'hom')

0.5