### Load data and the model

In [1]:
import transformers
import pandas as pd

# Hub identifier shared by the tokenizer and the masked-LM checkpoint.
MODEL_NAME = "EMBEDDIA/sloberta"

# Tab-separated annotation file; later cells iterate over it row by row.
df = pd.read_csv("word_usage_annotations_1997_2018.tsv", sep='\t')

# The fast tokenizer is requested explicitly (the embedding cell asks it for
# character offsets), and the model is configured to return hidden states.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = transformers.AutoModelForMaskedLM.from_pretrained(
    MODEL_NAME,
    output_hidden_states=True,
)

  return self.fget.__get__(instance, owner)()


### Show that we can extract unlemmatized words from the sentence

This step is not part of the approach, but it simplifies preprocessing.
The downside of such fuzzy matching is that we have to check if the result is correct.

In [2]:
import collections
from thefuzz import process, fuzz
import re


def _locate_word_form(lemma, sentence):
    """Find the surface form of ``lemma`` that occurs in ``sentence``.

    The sentence is split on non-word characters and the piece that is most
    similar to ``lemma`` (``fuzz.QRatio``) is taken as the inflected form.

    Returns:
        ``(form, (start, length))`` where ``form`` keeps the casing used in
        the sentence and ``start`` is its character offset in ``sentence``.

    Raises:
        AssertionError: if the best candidate scores 50 or less.
    """
    form, score = process.extractOne(lemma, re.split(r"\W+", sentence), scorer=fuzz.QRatio)
    assert score > 50, f"no plausible form of {lemma!r} found in {sentence!r}"
    # Search on word boundaries: a plain ``str.find`` would also hit the form
    # when it is merely a substring of an earlier, longer word.
    hit = re.search(r"(?<!\w)" + re.escape(form) + r"(?!\w)", sentence)
    # ``form`` is one of the split pieces, so the search should always
    # succeed; ``find`` is kept only as a defensive fallback.
    start = hit.start() if hit else sentence.find(form)
    return form, (start, len(form))


results1997 = []
results2018 = []

# lemma -> set of lower-cased forms that do not simply start with the lemma;
# these are the matches worth checking by hand.
word_matches = collections.defaultdict(set)

for row in df.itertuples():
    # ``_3`` / ``_4`` are the positional names ``itertuples`` assigns to the
    # two sentence columns; their spans end up in the 1997 / 2018 columns.
    for sentence, spans in ((row._3, results1997), (row._4, results2018)):
        form, span = _locate_word_form(row.word, sentence)
        # The span is computed on the original casing: looking up a
        # lower-cased form would miss capitalised occurrences and yield -1.
        spans.append(span)
        form = form.lower()
        if not form.startswith(row.word):
            word_matches[row.word].add(form)


df['target_span_1997'] = results1997
df['target_span_2018'] = results2018


### Not too many results, can check them manually

In [3]:
# One line per lemma: "<lemma> : <form>, <form>, ..."
for lemma, forms in word_matches.items():
    joined_forms = ", ".join(forms)
    print(lemma, ":", joined_forms)

globinski : globinskega, globinske, globinsko, globinska
razbitina : razbitin, razbitine, razbitino
poizvedba : poizvedbe, poizvedb, poizvedbi, poizvedbo
burka : burke, burki, burk, burko
zvezdniški : zvezdniško, zvezdniška, zvezdniškemu, zvezdniškega, zvezdniškem, zvezdniške
dopisnica : dopisnici, dopisnic, dopisnice, dopisnico
kneževina : kneževine, kneževin, kneževini, kneževino
dokumentarec : dokumentarcih, dokumentarcev, dokumentarcem, dokumentarce, dokumentarcu, dokumentarci, dokumentarca
dobitnica : dobitnico, dobitnic, dobitnici, dobitnice
ogaben : ogabnim, ogabni, ogabno, ogabne, ogabnega, ogabna, ogabnih
pomočnica : pomočnika, pomočnice, pomočnici, pomočnico, pomočnic
misija : misije, misijo, misij, misiji
molekula : molekule, molekul
nadrealističen : nadrealistična, nadrealističnega, nadrealističnem, nadrealistične, nadrealističnih, nadrealističnimi, nadrealistično, nadrealističnim, nadrealistični, nadrealističnemu
jeziček : jezičkov, jezička, jezičku, jezički, jezičkom
tran

### Calculate token embeddings

We extract the unlemmatized word (as above) and resolve the tokens that make up this word.

In [4]:
import tqdm
import torch

# NOTE(review): LAYER is not referenced by the code visible in this notebook;
# _get_word_tokens indexes hidden_states with -2 instead. Confirm whether the
# two are meant to select the same layer.
LAYER: int = 11
# Collects one (tokens_first_sentence, tokens_second_sentence) pair per row.
results: list = []

def _get_word_tokens(model, tokenizer, sentence, word, layer=-2):
    """Return the hidden states of the tokens that cover ``word`` in ``sentence``.

    The surface form of ``word`` is found by fuzzy-matching it against the
    pieces of ``sentence`` split on non-word characters; the form's character
    span is then mapped to token indices through the tokenizer's offset
    mapping.

    Args:
        model: called as ``model(**encoding)``; its output must expose
            ``hidden_states``.
        tokenizer: called with ``return_offsets_mapping=True`` and
            ``return_tensors='pt'``.
        sentence: text containing some form of ``word``.
        word: the (lemmatized) target word.
        layer: index into ``hidden_states``; the default ``-2`` is the
            second-to-last entry.

    Returns:
        The slice ``hidden_states[layer][0, start_tok:end_tok]``, i.e. one
        vector per token of the matched form.

    Raises:
        AssertionError: if the best fuzzy match scores 50 or less.
    """
    form, score = process.extractOne(word, re.split(r"\W+", sentence), scorer=fuzz.QRatio)
    assert score > 50, f"no plausible form of {word!r} found in {sentence!r}"
    # Whole-word search: a plain ``str.find`` would also match the form when
    # it is only a substring of an earlier, longer word.
    hit = re.search(r"(?<!\w)" + re.escape(form) + r"(?!\w)", sentence)
    # ``form`` is one of the split pieces, so the search should always
    # succeed; ``find`` is kept only as a defensive fallback.
    word_start_ix = hit.start() if hit else sentence.find(form)
    word_end_ix = word_start_ix + len(form)
    # Sentences are tokenized one at a time: the binary searches below need
    # non-decreasing end offsets, which padding in a batch would break.
    # NOTE(review): a trailing special token presumably has a (0, 0) offset,
    # which also violates the sorted precondition - confirm it is harmless.
    tokenized = tokenizer(sentence, return_offsets_mapping=True, return_tensors='pt')
    # The offsets are removed from the encoding before it is passed to the model.
    offsets = tokenized.pop('offset_mapping')[0]
    end_off = offsets[:, 1].contiguous()
    # First token whose end offset lies strictly after the word start
    # (searching ``end_off - 1`` turns ">=" into ">").
    start_tok = torch.searchsorted(end_off - 1, word_start_ix)
    # First token whose end offset reaches the word end; +1 makes it exclusive.
    end_tok = torch.searchsorted(end_off, word_end_ix) + 1
    with torch.no_grad():
        hs = model(**tokenized).hidden_states[layer]
    return hs[0, start_tok:end_tok]
    
    
    
# Embed the target word in both sentences of every annotated row.
for usage in tqdm.tqdm(df.itertuples(), total=len(df)):
    token_pair = tuple(
        _get_word_tokens(model, tokenizer, text, usage.word)
        for text in (usage._3, usage._4)
    )
    results.append(token_pair)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3150/3150 [04:25<00:00, 11.89it/s]


### Calculate word embeddings (avg. of the token embeddings)

In [5]:
# word -> list of usage vectors, one dictionary per sentence column.
per_word_results1 = collections.defaultdict(list)
per_word_results2 = collections.defaultdict(list)


for row, token_pair in tqdm.tqdm(zip(df.itertuples(), results), total=len(df)):
    buckets = (per_word_results1, per_word_results2)
    for bucket, token_vecs in zip(buckets, token_pair):
        # A usage is represented by the mean of its token vectors.
        usage_vec = token_vecs.mean(axis=0)
        bucket[row.word].append(usage_vec.numpy())

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3150/3150 [00:00<00:00, 25555.58it/s]


### The main part of the approach - solve optimal transport

In [6]:
import ot, scipy
import numpy as np

# word -> optimal-transport cost between its two sets of usage vectors.
word_results = {}

for lemma, usages_first in per_word_results1.items():
    usages_second = per_word_results2[lemma]
    # Pairwise cosine distances form the ground-cost matrix.
    cost_matrix = scipy.spatial.distance.cdist(usages_first, usages_second, 'cosine')
    # Both marginals are passed as empty lists, as in the original call.
    word_results[lemma] = ot.lp.emd2([], [], cost_matrix)

### Evaluate the approach using Spearman rank correlation

In [7]:
scores_df = pd.read_csv("semantic_shift_scores.tsv", sep='\t')
# Look up the computed transport cost of every scored word, in file order.
calculated_scores = [word_results[scored.word] for scored in scores_df.itertuples()]
# NOTE(review): `4 - score` presumably flips a similarity-style rating so that
# larger means more change - confirm against the annotation scale.
reference_scores = 4-scores_df.score.values
scipy.stats.spearmanr(calculated_scores, reference_scores)

SignificanceResult(statistic=0.6356990720702655, pvalue=4.175558016074771e-13)