In [128]:
import os
import time
import sys
import numpy as np
if "../" not in sys.path: sys.path.append ("../")
from modules.semshift import embeddings, alignment, measures

In [289]:
# Root directories for the precomputed statistics and the trained models.
STATS_DIR = "/hg191/corpora/legaldata/data/stats/"
MODELS_DIR = "/hg191/corpora/legaldata/models/"

# The two name-list files read by readNames.
NAMES_FILES = [
    os.path.join(STATS_DIR, basename)
    for basename in ("names.neural", "names.tagging")
]

# One score file per seed: V.<seed>.scores.
SCORES_FILES = [
    os.path.join (STATS_DIR, "V.{0}.scores".format (seed))
    for seed in (100, 200, 300, 400, 500)
]

In [83]:
def readNames (filenames):
    """Collect the distinct whitespace-stripped lines of several text files.

    Args:
        filenames: iterable of paths; each file is read line by line.

    Returns:
        A set with one entry per distinct stripped line across all files.
        A blank line contributes the empty string.
    """
    collected = set()
    for path in filenames:
        with open (path) as handle:
            collected.update (entry.strip() for entry in handle)
    return collected

In [84]:
# Distinct stripped lines found across the files listed in NAMES_FILES.
names = readNames (NAMES_FILES)

In [5]:
# Load the five "early" models, then the five "later" models, one per seed.
# The file names are listed in the same order as the names they are bound to.
early100, early200, early300, early400, early500 = [
    embeddings.TrainedModel(os.path.join(MODELS_DIR, filename))
    for filename in (
        "sgns.500K.early.100.model",
        "sgns.500K.early.200.model",
        "sgns.500K.early.300.model",
        "sgns.500K.early.400.model",
        "sgns.500K.early.500.model",
    )
]

later100, later200, later300, later400, later500 = [
    embeddings.TrainedModel(os.path.join(MODELS_DIR, filename))
    for filename in (
        "sgns.500K.later.100.model",
        "sgns.500K.later.200.model",
        "sgns.500K.later.300.model",
        "sgns.500K.later.400.model",
        "sgns.500K.later.500.model",
    )
]

In [6]:
# Dot product between row 0 of the early and later seed-100 vector matrices,
# printed in the cell that precedes the alignment step.
print(np.dot(early100.m.wv.vectors[0,:], later100.m.wv.vectors[0,:]))

0.08591904


In [7]:
def alignTwoModels (first_model, second_model):
    """Run the Procrustes alignment routine on a pair of wrapped models.

    Calls ``alignment.smart_procrustes_align_gensim`` with the ``.m`` attribute
    of both wrappers and stores its return value back on ``second_model.m``.
    The second wrapper is therefore mutated in place.

    NOTE(review): whether the alignment routine also modifies ``first_model.m``
    cannot be told from this function -- confirm in ``modules.semshift``.

    Args:
        first_model: wrapper whose ``.m`` is passed as the first argument.
        second_model: wrapper whose ``.m`` is passed as the second argument
            and then overwritten with the result.

    Returns:
        The same two wrapper objects, ``(first_model, second_model)``.
    """
    aligned = alignment.smart_procrustes_align_gensim(first_model.m, second_model.m)
    second_model.m = aligned
    return first_model, second_model

In [9]:
# Run the alignment for every early/later pair. alignTwoModels returns the very
# objects it was given, so each `*_` name refers to the same object as the
# corresponding un-suffixed name.
(
    (early100_, later100_),
    (early200_, later200_),
    (early300_, later300_),
    (early400_, later400_),
    (early500_, later500_),
) = [
    alignTwoModels (early, later)
    for early, later in (
        (early100, later100),
        (early200, later200),
        (early300, later300),
        (early400, later400),
        (early500, later500),
    )
]

In [10]:
# Same row-0 dot product as in the earlier cell, now after alignment.
# alignTwoModels overwrote later100.m in place, so the un-suffixed names
# already see the aligned model.
print(np.dot(early100.m.wv.vectors[0,:], later100.m.wv.vectors[0,:]))

0.6740516


In [13]:
# Vocabulary of each aligned model, plus the words shared by the early/later
# pair, for every seed. Each "later" set is read from the *later* model:
# building it from the early model makes the intersection trivially equal to
# the early vocabulary.
# NOTE(review): the alignment routine may already have restricted both models
# to a shared vocabulary, in which case the three sets coincide -- confirm in
# modules.semshift.alignment.
vEarly100 = set (early100_.m.wv.vocab.keys())
vLater100 = set (later100_.m.wv.vocab.keys())
vCommon100 = vEarly100 & vLater100

vEarly200 = set (early200_.m.wv.vocab.keys())
vLater200 = set (later200_.m.wv.vocab.keys())
vCommon200 = vEarly200 & vLater200

vEarly300 = set (early300_.m.wv.vocab.keys())
vLater300 = set (later300_.m.wv.vocab.keys())
vCommon300 = vEarly300 & vLater300

vEarly400 = set (early400_.m.wv.vocab.keys())
vLater400 = set (later400_.m.wv.vocab.keys())
vCommon400 = vEarly400 & vLater400

vEarly500 = set (early500_.m.wv.vocab.keys())
vLater500 = set (later500_.m.wv.vocab.keys())
vCommon500 = vEarly500 & vLater500

For each of the model pairs above, we have already constructed a list of words that appear to have changed in meaning.

In [303]:
def scoresAsDict (filename):
    """Parse a comma-separated score file into a ``{key: score}`` dictionary.

    Each line is stripped and split on ``","``; the first field becomes the key
    and the second field is converted with ``float``. Any further fields are
    ignored, and a key that occurs again overwrites the earlier value.

    Args:
        filename: path of the file to read.

    Returns:
        dict mapping the first field of every line to its float score.

    Raises:
        IndexError: if a line contains no comma (including a blank line).
        ValueError: if the second field is not a valid float.
    """
    with open (filename) as fin:
        rows = (line.strip().split (",") for line in fin)
        return {fields[0]: float(fields[1]) for fields in rows}

In [304]:
# Map each seed number to the {word: score} dictionary read from its file.
wordscores = {}
for scorefile in SCORES_FILES:
    # e.g. ".../V.100.scores" -> stem "V.100" -> seed 100
    stem = os.path.splitext (os.path.basename (scorefile))[0]
    seednum = int (stem.split (".")[1])
    wordscores[seednum] = scoresAsDict (scorefile)

Apply a very conservative heuristic to obtain a meaningful, high-precision sublist of changed words.

In [402]:
def getKeepAndThrowLists (scoresasdict, emod, lmod, keep_freq=25, throw_freq=75, frac=0.8, topn=50, k=10000):
    """Split the top-scoring words into a "keep" list and a "throw" list.

    Only the ``k`` highest-scoring entries of ``scoresasdict`` are considered,
    in descending score order. For each word:

    * Neighbour test: in both models, at least ``int(frac * topn)`` of the
      word's ``topn`` most similar words must themselves be keys of
      ``scoresasdict``.
    * If the test passes, the word is kept when its vocabulary count is at
      least ``keep_freq`` in both models.
    * If the test fails, the word is kept only when its count is at least
      ``throw_freq`` in both models (high-frequency bias).
    * Every other word goes to the throw list.

    Args:
        scoresasdict: mapping word -> score.
        emod, lmod: objects exposing ``.wv.most_similar(word, topn=...)`` and
            ``.wv.vocab[word].count``.
        keep_freq: count threshold for words that pass the neighbour test.
        throw_freq: count threshold for words that fail the neighbour test.
        frac: fraction of the ``topn`` neighbours that must be scored words.
        topn: number of nearest neighbours inspected per model.
        k: number of top-scoring words to classify.

    Returns:
        ``(kept_words, throw_words)``, two lists in descending score order.
    """
    min_scored_neighbors = int (frac * topn)

    def count_scored_neighbors (model, word):
        # Number of the word's topn neighbours that also have a score.
        return sum (1 for neighbor, _sim in model.wv.most_similar (word, topn=topn)
                    if neighbor in scoresasdict)

    ranked = sorted (scoresasdict.items(), key=lambda item: item[1], reverse=True)

    kept_words, throw_words = [], []
    for word, _score in ranked[:k]:
        # Query both models up front, exactly as many lookups as before.
        e_num_neighbors = count_scored_neighbors (emod, word)
        l_num_neighbors = count_scored_neighbors (lmod, word)
        passes_neighbor_test = (e_num_neighbors >= min_scored_neighbors
                                and l_num_neighbors >= min_scored_neighbors)

        # Words failing the neighbour test face the other count threshold.
        min_count = keep_freq if passes_neighbor_test else throw_freq
        frequent_enough = (emod.wv.vocab[word].count >= min_count
                           and lmod.wv.vocab[word].count >= min_count)

        if frequent_enough:
            kept_words.append (word)
        else:
            throw_words.append (word)

    return kept_words, throw_words

In [404]:
# Apply the heuristic to every seed with identical settings; kNNN / tNNN are the
# keep and throw lists for seed NNN.
(k100, t100), (k200, t200), (k300, t300), (k400, t400), (k500, t500) = [
    getKeepAndThrowLists (wordscores[seed], early.m, later.m, throw_freq=50, frac=0.8, topn=50, k=10000)
    for seed, early, later in (
        (100, early100_, later100_),
        (200, early200_, later200_),
        (300, early300_, later300_),
        (400, early400_, later400_),
        (500, early500_, later500_),
    )
]

  if np.issubdtype(vec.dtype, np.int):


In [405]:
keeplists = [k100, k200, k300, k400, k500]
throwlists = [t100, t200, t300, t400, t500]

seednums = [100, 200, 300, 400, 500]

# For each seed write V.<seed>.keeplist and then V.<seed>.throwlist into
# STATS_DIR, one word per line.
for seednum, kept, thrown in zip (seednums, keeplists, throwlists):
    for template, words in (("V.{0}.keeplist", kept), ("V.{0}.throwlist", thrown)):
        with open (os.path.join (STATS_DIR, template.format (seednum)), "w") as fout:
            fout.writelines ("{0}\n".format (w) for w in words)

In [211]:
def nearest_neighbors (emds, word, topn=50):
    """Return only the neighbour words from ``emds.wv.most_similar``.

    Args:
        emds: object exposing ``.wv.most_similar(word, topn=...)`` that yields
            ``(neighbor, similarity)`` pairs.
        word: query word.
        topn: number of neighbours requested.

    Returns:
        List of neighbour words in the order returned by ``most_similar``,
        with the similarity values dropped.
    """
    hits = emds.wv.most_similar (word, topn=topn)
    words = []
    for neighbor, _similarity in hits:
        words.append (neighbor)
    return words

In [409]:
# For every kept seed-100 word, write its rank followed by its ten nearest
# neighbours in the early model and then in the later model.
nns_path = os.path.join (STATS_DIR, "V.100.nns")
with open (nns_path, "w") as fout:
    for rank, word in enumerate (keeplists[0]):
        fout.write ("Rank: {0}, Word:{1}\n".format (rank, word))
        for label, emb in (("Early:", early100_.m), ("Later:", later100_.m)):
            fout.write (label + ",".join (nearest_neighbors (emb, word, topn=10)) + "\n")

  if np.issubdtype(vec.dtype, np.int):


In [386]:
# Sanity check on the seed-100 throw list: how many thrown words have a
# vocabulary count >= 50 in both models?
# NOTE(review): getKeepAndThrowLists is called in this notebook with
# throw_freq=50 and the default keep_freq=25. Under those settings a word with
# count >= 50 in both models is kept on either branch, so a fresh top-to-bottom
# run should print 0. A non-zero stored output would come from an earlier run
# with different thresholds -- confirm by re-running.
#[w for w in keeplists[0] if early100_.m.wv.vocab[w].count <= 25 and later100_.m.wv.vocab[w].count <= 25]
print (len([w for w in throwlists[0] if early100_.m.wv.vocab[w].count >= 50 and later100_.m.wv.vocab[w].count >= 50]))
#sum([early100_.m.wv.vocab[w].count >= 100 and later100_.m.wv.vocab[w].count >= 100 for w in throwlists[0]]), len (throwlists[0])

1643


In [394]:
# Spot check: the `count` stored for "kingpin" in the early seed-100 vocabulary.
# This is the value getKeepAndThrowLists compares against its thresholds.
early100_.m.wv.vocab["kingpin"].count

11

In [356]:
# Spot check: ten nearest neighbours of "shsll" in the early model, then in the
# later model.
# NOTE(review): "shsll" is presumably an OCR variant of "shall" -- confirm.
for emb in (early100_.m, later100_.m):
    print(nearest_neighbors(emb, "shsll", topn=10))

['ahall', 'rhall', 'ehall', 'suah', 'aad', 'sor', 'aball', 'auoh', 'ior', 'suoh']
['nass', 'mous', 'riehm', 'impropriety', 'unadmitted', 'neidhardt', 'impugned', 'canvass', 'unproven', 'blessed']


  if np.issubdtype(vec.dtype, np.int):
