This notebook implements a function that scores a document's semantic "progressiveness" with respect to a given linguistic feature (usually a word) that has previously been identified as having undergone a semantic shift. The scoring function assumes that two skip-gram models (one for the early period and one for the later period) have already been trained. Please refer to the detailed derivation [in this note](https://github.gatech.edu/CompLingLab/semantic-lang-change/blob/master/notes/scoring.pdf).

In [53]:
import numpy as np
import scipy
import os
import sys
from time import time
from collections import defaultdict, Counter
if "../" not in sys.path: sys.path.append ("../")
from modules.semshift import embeddings, alignment, docscores

In [65]:
# Directories holding the trained embedding models and the corpus statistics files.
# NOTE(review): these are absolute, machine-specific paths; adjust them before re-running elsewhere.
MODELS_DIR = "/hg191/corpora/legaldata/models/"
STATS_DIR = "/hg191/corpora/legaldata/data/stats"

# Model files for the early and the later period, plus the inputs read further down:
# the feature list (parsed by readFeats) and the documents file (scored one line at a time).
EARLY_MODEL = os.path.join (MODELS_DIR, "sgns.500K.early.100.model")
LATER_MODEL = os.path.join (MODELS_DIR, "sgns.500K.later.100.model")
FEATURES_FILE = os.path.join (STATS_DIR, "ops.1K.feats_manual")
DOCS_FILE = os.path.join (STATS_DIR, "ops.docs")

# Placeholder "word" reported by scoreMostProgressive for features absent from a document.
UNK = "UNK"

In [55]:
def alignTwoModels2 (first_model, second_model):
    """Align two model wrappers with alignment.intersection_align_gensim.

    The `.m` attribute of BOTH wrappers is overwritten in place with the pair
    returned by the alignment helper, and the same two wrapper objects are
    handed back (no copies are made).

    NOTE(review): judging by its name the helper restricts both models to
    their shared vocabulary -- confirm against modules.semshift.alignment.
    """
    aligned_first, aligned_second = alignment.intersection_align_gensim (first_model.m, second_model.m)
    first_model.m = aligned_first
    second_model.m = aligned_second
    return first_model, second_model

In [56]:
def alignTwoModels (first_model, second_model):
    """Align the second model to the first with alignment.smart_procrustes_align_gensim.

    Only `second_model.m` is overwritten (with the helper's return value);
    `first_model` is passed through untouched. Both wrapper objects are
    returned so the call site can unpack them.
    """
    reference, to_align = first_model.m, second_model.m
    second_model.m = alignment.smart_procrustes_align_gensim(reference, to_align)
    return first_model, second_model

In [57]:
# Load the early and later models through the project's TrainedModel wrapper.
mE = embeddings.TrainedModel(EARLY_MODEL)
mL = embeddings.TrainedModel(LATER_MODEL)

# Sanity check: dot product between the index-0 vectors of the two models, before and after alignment.
# NOTE(review): alignTwoModels2 calls intersection_align_gensim, NOT the Procrustes variant
# (alignTwoModels). The recorded output is identical before and after, so no increase
# should be expected from this call -- confirm which alignment was intended.
# mE_/mL_ are the same objects as mE/mL (alignTwoModels2 returns its arguments after updating .m).
print(np.dot(mE.m.wv.vectors[0,:], mL.m.wv.vectors[0,:])) # raw dot product before alignment (equals cosine similarity only if the vectors are unit-normalized -- TODO confirm)
mE_, mL_ = alignTwoModels2 (mE, mL)
print(np.dot(mE.m.wv.vectors[0,:], mL.m.wv.vectors[0,:])) # same dot product after the alignment call

0.08591904
0.08591904


In [58]:
def readFeats (filename):
    """Read feature names from a comma-separated file.

    The feature is the second comma-separated field (index 1) of every
    non-blank line; surrounding whitespace of the line is stripped first.

    Parameters
    ----------
    filename : str
        Path of the features file.

    Returns
    -------
    list of str
        The features, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two comma-separated fields.
    """
    feats = list ()
    with open (filename) as fin:
        for line in fin:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file) carry
            # no feature; skip them instead of failing on the [1] lookup.
            if not line:
                continue
            feats.append (line.split(",")[1])
    return feats

# Features to score: the second comma-separated column of FEATURES_FILE, in file order.
semantic_features = readFeats (FEATURES_FILE)

In [6]:
def scoreDocs (filename, early_model, later_model, word, nLines=10, k=10):
    """Score the leading documents of a file for a single feature.

    Every line of `filename` is treated as one document. A line is split on
    single spaces and only purely alphabetic tokens are kept before it is
    passed to a docscores.Scorer built for `word`.

    Parameters
    ----------
    filename : path of the documents file (one document per line).
    early_model, later_model : models handed to docscores.Scorer; the
        word-to-index map is taken from `early_model.index`.
    word : the feature to score.
    nLines : stop after this many lines. A value <= 0 never matches the
        stop condition, so the whole file is scored.
    k : forwarded to the scorer as `window_size`.

    Returns
    -------
    list with one score per processed line, in file order.
    """
    word_to_index, _ = early_model.index
    scorer = docscores.Scorer (early_model, later_model, word)
    doc_scores = []
    with open (filename) as fin:
        for lines_read, raw_line in enumerate (fin, start=1):
            alpha_tokens = list (filter (str.isalpha, raw_line.strip().split(" ")))
            doc_scores.append (scorer.score (alpha_tokens, word_to_index, window_size=k))
            if lines_read == nLines:
                break
    return doc_scores

In [59]:
def scoreMostProgressive (scorers, tokens, w2i, k=10):
    """Return the best-scoring feature that occurs in a document.

    Parameters
    ----------
    scorers : mapping from feature word to a scorer object exposing
        `score (tokens, w2i, window_size=...)`.
    tokens : list of the document's tokens.
    w2i : word-to-index map forwarded to every scorer.
    k : forwarded to the scorers as `window_size`.

    Returns
    -------
    (word, score) for the feature with the highest score among those present
    in `tokens`; ties keep the feature that comes first in `scorers`.
    `(UNK, -np.inf)` is returned when no feature qualifies -- this includes an
    empty `scorers` mapping, which previously made `max()` raise ValueError.
    A feature is only selected when its score is strictly greater than -inf.
    """
    tokenset = set (tokens)
    best_word, best_score = None, -np.inf
    for word, scorer in scorers.items():
        # Only features that actually occur in the document are scored.
        if word not in tokenset:
            continue
        score = scorer.score (tokens, w2i, window_size=k)
        # Strict comparison keeps the earliest feature on ties.
        if score > best_score:
            best_word, best_score = word, score
    if best_word is None:
        return UNK, -np.inf
    return best_word, best_score

In [63]:
def multiScoreDocs (filename, early_model, later_model, words, linenums=(), k=10):
    """Find the most progressive feature for selected lines of a documents file.

    Parameters
    ----------
    filename : path of the documents file (one document per line).
    early_model, later_model : models handed to docscores.Scorer; the
        word-to-index map is taken from `early_model.index`.
    words : iterable of features; one scorer is built per feature.
    linenums : iterable of 0-based line numbers to score. When it is empty
        (the default) an empty dict is returned without reading the file;
        previously this case crashed in `min()`/`max()`.
    k : forwarded to the scorers as `window_size`.

    Returns
    -------
    dict mapping each requested line number found in the file to the
    `(word, score)` pair returned by scoreMostProgressive. Line numbers past
    the end of the file are simply absent from the result.
    """
    wanted = set (linenums)
    if not wanted:
        return dict ()
    last_wanted = max (wanted)

    scores = dict ()
    w2i, _ = early_model.index
    scorers = {word: docscores.Scorer (early_model, later_model, word) for word in words}
    with open (filename) as fin:
        for i, line in enumerate (fin):
            # Nothing of interest remains after the largest requested line.
            if i > last_wanted:
                break
            if i not in wanted:
                continue
            # Documents are space-separated; only purely alphabetic tokens are kept.
            tokens = [token for token in line.strip().split(" ") if token.isalpha()]
            scores[i] = scoreMostProgressive (scorers, tokens, w2i, k=k)
    return scores

In [61]:
# Index <-> word lookups, both derived from the early model's vocabulary order.
i2w = dict (enumerate (mE_.m.wv.index2word))
w2i = {word: idx for idx, word in i2w.items()}
# Both wrappers receive the SAME lookups (built from the early model).
# NOTE(review): this presumes the later model shares the early model's word order -- confirm.
m_early = docscores.EmdsModel(
    mE_.m.trainables.syn1neg,
    mE_.m.wv.vectors,
    (w2i, i2w),
)
m_later = docscores.EmdsModel(
    mL_.m.trainables.syn1neg,
    mL_.m.wv.vectors,
    (w2i, i2w),
)

In [11]:
# Timing run: score one feature ("purgation") on the first 10000 lines of DOCS_FILE.
# The elapsed time is printed in minutes (seconds / 60).
start_time = time ()
scores = scoreDocs (DOCS_FILE, m_early, m_later, "purgation", nLines=10000, k=10)
print ("Time taken: {0}".format ((time() - start_time)/60))

Time taken: 0.10981211264928183


In [66]:
# Timing run: best feature among the first 100 features for lines 0-9999 of DOCS_FILE.
# The elapsed time is printed in minutes; note that `scores` is overwritten here.
start_time = time()
scores = multiScoreDocs (DOCS_FILE, m_early, m_later, semantic_features[0:100], linenums=range(10000), k=10)
print ("Time taken: {0}".format ((time() - start_time)/60))

Time taken: 0.4949785510698954


In [67]:
# Pair every line number of ops.list with the integer id stored on that line.
with open (os.path.join (STATS_DIR, "ops.list")) as fin:
    ids = list (enumerate (int (line.strip()) for line in fin))
# Line number -> id lookup.
dict_ids = dict (ids)

In [71]:
# initialize a seed for repeatability
np.random.seed(100)
# Draw 500000 distinct line indices out of len(ids); because replace=False,
# np.random.choice raises ValueError when len(ids) is smaller than 500000.
indices = np.random.choice(len(ids), 500000, replace=False)

In [72]:
# Full run: for every sampled line, keep the best-scoring feature among all semantic features.
# The result maps line number -> (word, score) and replaces the earlier `scores` value.
scores = multiScoreDocs (DOCS_FILE, m_early, m_later, semantic_features, linenums=indices, k=10)

In [70]:
# Collect, in sampling order, the (word, score) result and the id of every sampled line.
feats = [scores[idx] for idx in indices]
ops = [dict_ids[idx] for idx in indices]

# One CSV row per sampled document: id, best feature, score of that feature.
with open (os.path.join (STATS_DIR, "ops.temp.semfeat"), "w") as fout:
    for op_id, feat in zip (ops, feats):
        fout.write ("{0},{1},{2}\n".format (op_id, feat[0], feat[1]))