## Learning notes on "Improving Distributional Similarity with Lessons Learned from Word Embeddings" by Omer Levy, Yoav Goldberg, and Ido Dagan. TACL 2015.
- The purpose of these notes is to understand the paper and the authors' implementation; along the way we try different parameter settings and different SVD libraries.
- The original code is a mix of Linux shell commands and Python scripts; here we try to redo everything in Python.
- All credit goes to the authors of the original paper: Omer Levy, Yoav Goldberg, and Ido Dagan.
- [paper link "Improving Distributional Similarity with Lessons Learned from Word Embeddings"](https://levyomer.wordpress.com/2015/03/30/improving-distributional-similarity-with-lessons-learned-from-word-embeddings/)
- [bitbucket code repository](https://bitbucket.org/omerlevy/hyperwords)

### corpus2pairs.py

In [1]:
from collections import Counter
from math import sqrt
from random import Random

# Fixed-seed RNG used for the subsampling and dynamic-window draws further down.
rnd = Random(17)

In [49]:
corpus_file = "data/text8" ## text corpus; read line by line and split on whitespace

thr = 100 ## minimum count: tokens seen fewer than thr times are treated as rare and filtered out

subsample = 1e-5 ## subsampling rate: tokens with count above subsample * corpus_size are randomly dropped

dyn = True ## dynamic window: draw each window size uniformly from 1..win instead of always using win
win = 3 ## (maximum) window length on each side of the target word

pos = False ## if True, the relative position ("_-1", "_2", ...) is appended to every context token

d3l = True ## "dirty" mode: delete filtered-out tokens before windowing, so windows reach across the gaps

cds = 0.75 ## context distribution smoothing: exponent applied to the context counts

dim = 200 ## number of SVD dimensions, i.e. the length of each learned word vector

neg = 1. ## k for negative sampling; log(neg) is subtracted from the PMI values

normalize = True ## normalize the word vector matrix (NOTE(review): no code visible in this notebook reads this flag)

eig = 0.5 ## eigenvalue weighting: exponent applied to the singular values

In [50]:
def read_vocab(corpus_file, thr):
    """Count whitespace-separated tokens in a corpus and keep the frequent ones.

    Args:
        corpus_file: path of a text file; every line is split on whitespace.
        thr: minimum count; tokens seen fewer than thr times are discarded.

    Returns:
        A plain dict mapping each retained token to its number of occurrences.
    """
    token_counts = Counter()
    with open(corpus_file) as corpus:
        for raw_line in corpus:
            # split() without arguments already ignores leading/trailing whitespace
            token_counts.update(raw_line.split())
    return {tok: freq for tok, freq in token_counts.items() if freq >= thr}

In [51]:
%time vocab = read_vocab(corpus_file, thr)
# corpus_size only counts occurrences of tokens that survived the thr filter
corpus_size = sum(vocab.values())
print corpus_size

CPU times: user 12.5 s, sys: 3.28 s, total: 15.7 s
Wall time: 15.8 s
15471435


In [52]:
# Turn the relative subsampling rate into an absolute count threshold. The result is
# kept in its own variable instead of being written back into `subsample`, so that
# re-running this cell does not multiply by corpus_size a second time.
subsample_threshold = subsample * corpus_size
# For every token whose count exceeds the threshold: the probability with which each
# of its occurrences is dropped during pair extraction.
subsampler = {word: 1 - sqrt(subsample_threshold / count)
              for word, count in vocab.items() if count > subsample_threshold}

In [53]:
# Extract (word, context) pairs from the corpus, one line at a time:
#   1. replace rare tokens (not in vocab) by None,
#   2. randomly replace occurrences of very frequent tokens by None (subsampling),
#   3. if d3l is set, delete the None placeholders so windows reach across the gaps,
#   4. pair every remaining token with its neighbours inside a (dynamic) window.
pairs = []
with open(corpus_file) as f:
    for line in f:
        tokens = [t if t in vocab else None for t in line.strip().split()] ## del-rare filter
        if subsample != 0:
            ## subsampling filter; rnd.random() is only drawn for tokens listed in subsampler
            tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None
                      for t in tokens]
        if d3l:
            tokens = [t for t in tokens if t is not None]
        len_tokens = len(tokens)

        for i, tok in enumerate(tokens):
            if tok is None:
                continue
            dynamic_window = rnd.randint(1, win) if dyn else win
            start, end = max(0, i-dynamic_window), min(i+dynamic_window+1, len_tokens)
            for n in xrange(start, end):
                neighbor = tokens[n]
                # Skip the target itself and filtered-out neighbours. Without the None
                # check, d3l = False would emit None as a context (or raise a TypeError
                # on None + "_" when pos is set).
                if n == i or neighbor is None:
                    continue
                neighbor_word = neighbor+"_"+str(n-i) if pos else neighbor
                pairs.append((tok, neighbor_word))

### pairs2counts.sh

In [54]:
# Collapse the list of (word, context) pairs into a pair -> frequency table.
%time paircounts = Counter(pairs)

CPU times: user 9.05 s, sys: 1.17 s, total: 10.2 s
Wall time: 10.1 s


In [55]:
# Number of distinct (word, context) pairs.
print len(paircounts)

9190262


### counts2vocab.py

In [56]:
# Marginal frequencies: how often each token occurs as a target word and how often
# as a context, summed over all pairs it takes part in.
wordcounts, contextcounts = Counter(), Counter()
for pair, freq in paircounts.items():
    target, ctx = pair
    wordcounts[target] += freq
    contextcounts[ctx] += freq

In [57]:
# (token, count) lists ordered from the most to the least frequent token.
words = wordcounts.most_common()
contexts = contextcounts.most_common()

In [58]:
# Vocabulary sizes: distinct words, distinct contexts, and their sum.
len(words), len(contexts), len(words)+len(contexts)

(11815, 11815, 23630)

### counts2pmi.py

In [59]:
from scipy.sparse import csr_matrix, dok_matrix
import numpy as np

In [60]:
def multiply_by_rows(matrix, row_coefs):
    """Scale row i of `matrix` by row_coefs[i].

    The scaling is carried out as a left-multiplication by the sparse diagonal
    matrix diag(row_coefs).

    Args:
        matrix: matrix with len(row_coefs) rows.
        row_coefs: one coefficient per row.

    Returns:
        The product diag(row_coefs) . matrix.
    """
    size = len(row_coefs)
    diagonal = dok_matrix((size, size))
    diagonal.setdiag(row_coefs)
    scaled = diagonal.tocsr().dot(matrix)
    return scaled

def multiply_by_columns(matrix, col_coefs):
    """Scale column j of `matrix` by col_coefs[j].

    The scaling is carried out as a right-multiplication by the sparse diagonal
    matrix diag(col_coefs).

    Args:
        matrix: matrix with len(col_coefs) columns; must provide a `.dot` method.
        col_coefs: one coefficient per column.

    Returns:
        The product matrix . diag(col_coefs).
    """
    size = len(col_coefs)
    diagonal = dok_matrix((size, size))
    diagonal.setdiag(col_coefs)
    scaled = matrix.dot(diagonal.tocsr())
    return scaled

In [61]:
# Index <-> token lookup tables for the rows (words) and columns (contexts) of the
# count matrix. Tokens are put in sorted order; the frequency ranking of
# `words` / `contexts` plays no role in the matrix layout.
iw = sorted(token for token, _ in words)
ic = sorted(token for token, _ in contexts)
wi = {token: idx for idx, token in enumerate(iw)}
ci = {token: idx for idx, token in enumerate(ic)}

In [62]:
counts = dok_matrix((len(wi), len(ci)), dtype = np.float32)
for (word, context), count in paircounts.items():
    if word in wi and context in ci:
        counts[wi[word], ci[context]] = count
%time counts = counts.tocsr()

CPU times: user 10.1 s, sys: 175 ms, total: 10.3 s
Wall time: 10.3 s


In [63]:
## calculate e^PMI; PMI without the log()
sum_w = np.array(counts.sum(axis=1))[:, 0] 
sum_c = np.array(counts.sum(axis=0))[0, :]
assert np.sum(sum_w) == np.sum(sum_c)
if cds != 1.:
    sum_c = sum_c ** cds
sum_total = sum_c.sum() 
sum_w = np.reciprocal(sum_w)
sum_c = np.reciprocal(sum_c)
## the total is not the total, smoothed version of total - D
## so the #(w) is not smoothed, #(c) and D are smoothed

pmi = csr_matrix(counts)
%time pmi = multiply_by_rows(pmi, sum_w)
%time pmi = multiply_by_columns(pmi, sum_c)
%time pmi = pmi * sum_total

CPU times: user 173 ms, sys: 35.9 ms, total: 208 ms
Wall time: 208 ms
CPU times: user 220 ms, sys: 22.2 ms, total: 242 ms
Wall time: 242 ms
CPU times: user 15.6 ms, sys: 21.1 ms, total: 36.7 ms
Wall time: 36.9 ms


### pmi2svd.py

In [64]:
from sparsesvd import sparsesvd
from sklearn.decomposition import TruncatedSVD

In [65]:
def normalize_rows(m):
    """Return a copy of sparse matrix `m` with every row scaled to unit L2 norm.

    Rows whose norm is zero are left unscaled (they stay all-zero) instead of
    being multiplied by the reciprocal of zero.

    Args:
        m: scipy sparse matrix exposing `.data` (e.g. CSR); it is not modified.

    Returns:
        The row-normalized matrix, computed as diag(1 / norms) . m.
    """
    squared = m.copy()
    squared.data **= 2
    norms = np.sqrt(np.array(squared.sum(axis=1))[:, 0])
    # Invert only the non-zero norms so that empty rows do not produce inf/nan.
    inv_norms = np.zeros_like(norms)
    has_norm = norms > 0
    inv_norms[has_norm] = 1.0 / norms[has_norm]
    normalizer = dok_matrix((len(inv_norms), len(inv_norms)))
    normalizer.setdiag(inv_norms)
    # The original multiplied by `self.m`; there is no `self` in a plain function,
    # so every call ended in a NameError.
    return normalizer.tocsr().dot(m)

In [66]:
# Shifted positive PMI: take the log of the stored e^PMI values, subtract log(neg),
# clip everything below zero to zero and drop the resulting zeros from the matrix.
explicit_pmi = pmi.copy()
explicit_pmi.data = np.maximum(np.log(pmi.data) - np.log(neg), 0)
explicit_pmi.eliminate_zeros()

In [26]:
## CHOICE 1 - SVD by sparsesvd
## rank-dim truncated SVD of the (CSC-converted) positive PMI matrix
%time ut, s, vt = sparsesvd(explicit_pmi.tocsc(), dim, )
## vector representations are rows of wordvecs
## (presumably ut is (dim, n_words), hence the transpose - confirm against the sparsesvd docs)
%time wordvecs = np.power(s, eig) * ut.T ## broadcasting weights dimension j of every word vector by s[j] ** eig

CPU times: user 1min 17s, sys: 440 ms, total: 1min 18s
Wall time: 1min 18s
CPU times: user 3.17 ms, sys: 600 µs, total: 3.77 ms
Wall time: 3.77 ms


In [67]:
## CHOICE 2 - SVD by sklearn - arpack
## NOTE(review): sklearn.utils.arpack may be missing from recent scikit-learn releases;
## the commented-out scipy import below looks like the intended alternative - confirm.
#from scipy.sparse import linalg
from sklearn.utils import arpack
from sklearn.utils import extmath

%time u, s, vt = arpack.svds(explicit_pmi.tocsr(), k = dim)
## reverse the order of the singular values and of the matching singular vectors
## (presumably svds returns them smallest-first - confirm against the library docs)
s = s[::-1]
u, vt = extmath.svd_flip(u[:, ::-1], vt[::-1])

## broadcasting weights dimension j of every word vector (row of u) by s[j] ** eig
%time wordvecs = np.power(s, eig) * u

CPU times: user 8min 21s, sys: 9min 50s, total: 18min 12s
Wall time: 35.8 s
CPU times: user 8.26 ms, sys: 0 ns, total: 8.26 ms
Wall time: 8.22 ms


In [39]:
## CHOICE 3 - SVD by sklearn - randomized svd
from sklearn.utils import extmath
## approximate rank-dim SVD, using 10 iterations (n_iter)
%time u, s, vt = extmath.randomized_svd(explicit_pmi.tocsr(), dim, n_iter=10)
## broadcasting weights dimension j of every word vector (row of u) by s[j] ** eig
%time wordvecs = np.power(s, eig) * u

CPU times: user 1min 31s, sys: 22.9 s, total: 1min 54s
Wall time: 1min 1s
CPU times: user 4.56 ms, sys: 0 ns, total: 4.56 ms
Wall time: 4.47 ms


In [111]:
%%capture

## CHOICE 4 - SVD by spark
## Step 1: dump the sparse matrix to a text file - a "# nrows ncols" header line,
## followed by one "row,col,value" line per non-zero entry.
nrows, ncols = explicit_pmi.shape
## NOTE(review): zipping nonzero() with .data below assumes both list the stored
## entries in the same order - confirm for the scipy version in use.
rows, cols = explicit_pmi.nonzero()
with open("data/tmp/pmi.txt", "w") as f:
    f.write("# %d %d\n" % (nrows, ncols)) # dimension
    %time f.write("\n".join(["%d,%d,%g" % (r, c, d) for r, c, d in zip(rows, cols, explicit_pmi.data)]))
    
    
### Step 2: run spark code - looking at data/tmp/U.txt data/tmp/s.txt
### (rowmatrixSVD.scala is not shown in this notebook; presumably it reads pmi.txt
### and writes U.txt and s.txt - verify against the script)
!/usr/spark/bin/spark-shell --master local[20] --driver-memory 50g -i rowmatrixSVD.scala

## Step 3: load back matrices
%time u = np.loadtxt("data/tmp/U.txt", delimiter=",")
%time s = np.loadtxt("data/tmp/s.txt", delimiter=",")
print u.shape, s.shape
## broadcasting weights dimension j of every word vector (row of u) by s[j] ** eig
%time wordvecs = np.power(s, eig) * u

CPU times: user 16.8 s, sys: 2.06 s, total: 18.9 s
Wall time: 18.6 s
CPU times: user 1.51 s, sys: 46.8 ms, total: 1.56 s
Wall time: 1.56 s
CPU times: user 653 µs, sys: 743 µs, total: 1.4 ms
Wall time: 1.4 ms
(11815, 200) (200,)
CPU times: user 2.71 ms, sys: 3.85 ms, total: 6.56 ms
Wall time: 6.61 ms


In [112]:
# Sanity check on the embedding matrix produced by whichever SVD cell was run last.
wordvecs.shape

(11815, 200)

In [113]:
## normalize word vectors
## Each row (word vector) is normalized, so the dot products computed in later cells
## act as cosine similarities.
## NOTE(review): this runs unconditionally; the `normalize` flag from the parameter
## cell is not consulted here.
from sklearn import preprocessing
normalized_wordvecs = preprocessing.normalize(wordvecs, axis = 1)

In [114]:
# Display the normalized embedding matrix.
normalized_wordvecs

array([[-0.55885984,  0.21545189,  0.11841743, ...,  0.0013745 ,
        -0.02271895,  0.04978845],
       [-0.21680112,  0.01875686,  0.05588003, ..., -0.0771681 ,
        -0.03484708, -0.02163088],
       [-0.19255549, -0.19141807,  0.16705572, ...,  0.06263481,
         0.06455367, -0.05304903],
       ..., 
       [-0.17444888, -0.10237271,  0.01706275, ...,  0.02818981,
        -0.04560365, -0.00398814],
       [-0.20024668, -0.10312052, -0.10649699, ..., -0.03765361,
         0.04984179,  0.0461817 ],
       [-0.16624444, -0.09761244, -0.02911183, ..., -0.02920824,
         0.09424746, -0.04249255]])

In [117]:
s = np.random.randint(0, len(iw), 50)
for w in np.array(iw)[s]:
#for w in ["east", "china", "money", "football", "library"]:
#for w in ["tiger", "cat", "dog", "bear", "hamster", "mammal", "gem", "drink", "king", "queen"]:
    v = normalized_wordvecs[wi[w]]
    scores = normalized_wordvecs.dot(v)
    closest = sorted(enumerate(scores), key = lambda (i,s):s, reverse=True)[:10]
    print w, [iw[i] for (i, _) in closest]
    print 

layout ['layout', 'rows', 'keys', 'diatonic', 'chromatic', 'tile', 'row', 'keyboard', 'harmonica', 'placement']

taiwan ['taiwan', 'korea', 'china', 'prc', 'singapore', 'malaysia', 'shanghai', 'hong', 'chiang', 'thailand']

operative ['operative', 'lodge', 'freemasonry', 'organisation', 'membership', 'recognition', 'benefits', 'societies', 'participation', 'masonic']

studios ['studios', 'theaters', 'bros', 'productions', 'studio', 'movies', 'warner', 'films', 'movie', 'cinema']

mexican ['mexican', 'jos', 'luis', 'rican', 'argentine', 'juan', 'manuel', 'cruz', 'santiago', 'pedro']

organ ['organ', 'harpsichord', 'cello', 'violin', 'sonata', 'concerto', 'organs', 'orchestral', 'piano', 'quartet']

narrator ['narrator', 'cameo', 'novel', 'dracula', 'kubrick', 'story', 'comedy', 'sequel', 'vampire', 'tale']

usefulness ['usefulness', 'tool', 'applications', 'users', 'difficult', 'fix', 'need', 'easy', 'reasons', 'discussion']

diagnosis ['diagnosis', 'diagnostic', 'symptoms', 'disorders'

In [116]:
def vec(w): return normalized_wordvecs[wi[w], :]
analogy = ["king", "man", "queen", "woman"] #["france", "paris", "uk", "london"]
a, b, aa, bb = map(vec, analogy)
bb_estimate = b - a + aa
bb_estimate /= np.sqrt(np.sum(bb_estimate**2))
scores = normalized_wordvecs.dot(bb_estimate)
closest = sorted(enumerate(scores), key = lambda (i,s):s, reverse=True)[:10]
print [iw[i] for (i, _) in closest]

['man', 'queen', 'girl', 'love', 'my', 'woman', 'her', 'friends', 'she', 'thinks']
