# Decompose Vectors

In [1]:
import numpy as np
from gensim.models import KeyedVectors

## Data dependencies

```
../data/tencent_d200_500k.bin 53f5b4
```

In [2]:
from hashlib import sha1
from pathlib import Path

# Input files this notebook depends on. The printed 6-character SHA-1 prefix
# is meant to be compared with the value listed under "Data dependencies".
paths = ["../data/tencent_d200_500k.bin"]
for path_x in paths:
    h = sha1()
    # Feed the file to the hash in 1 MiB chunks so it is never held in memory
    # all at once; the digest is the same as hashing the bytes in one call.
    with Path(path_x).open("rb") as fin:
        for chunk in iter(lambda: fin.read(1 << 20), b""):
            h.update(chunk)
    print(path_x, h.hexdigest()[:6])

../data/tencent_d200_500k.bin 53f5b4


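## Decomposition relations

Notation: $f(c_1, c_2)$ is the embedding of the two-character word $c_1c_2$; $f(c_1, *)$ is the mean embedding of all two-character words whose first character is $c_1$, and $f(*, c_2)$ is the mean embedding of all two-character words whose second character is $c_2$. The three bracketed terms below define, in order, $\delta(c_1|c_1, c_2)$, $\delta(c_2|c_1, c_2)$ and $\delta(c_1, c_2)$.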

\begin{align}
f(c_1, c_2) =& f(c_1, *) + f(*, c_2) - f(c_1, *) - f(*, c_2)\\
             & + f(c_1, c_2) - f(c_1, c_2)\\
             & + f(c_1, c_2) \\
          =& \left[f(c_1, c_2) - f(*, c_2)\right]  \\
           & + \left[f(c_1, c_2) - f(c_1, *) \right] \\
           & + \left[f(c_1, *) + f(*, c_2) - f(c_1, c_2)\right] \\
          = & \delta(c_1|c_1, c_2) + \delta(c_2|c_1,c_2) + \delta(c_1, c_2)
\end{align}

In [3]:
# Load the source word vectors (binary word2vec format) that will be decomposed.
kv = KeyedVectors.load_word2vec_format(
    "../data/tencent_d200_500k.bin",
    binary=True,
)

In [4]:
# Keep only the two-character keys. Keys containing '*' are left out: the
# embedding already has some tokens that include '*', and '*' is the wildcard
# marker used for the mean-vector keys built later in this notebook.
biwords = [
    key for key in kv.key_to_index
    if len(key) == 2 and '*' not in key
]

In [5]:
# Number of two-character words selected for decomposition.
len(biwords)

121562

In [6]:
# Group the two-character words by their first character (vocab_c1) and by
# their second character (vocab_c2); each value is the list of words sharing
# that character in that position.
vocab_c1, vocab_c2 = {}, {}
for word in biwords:
    first_ch, second_ch = word
    vocab_c1.setdefault(first_ch, []).append(word)
    vocab_c2.setdefault(second_ch, []).append(word)
(len(vocab_c1), len(vocab_c2))

(5340, 5266)

In [7]:
def compute_mean_vec(wlist):
    """Return the element-wise mean of the vectors of the keys in `wlist`.

    Each vector is looked up in the notebook-level `kv` with `norm=False`,
    stacked row-wise, and averaged over the rows.
    """
    rows = [kv.get_vector(word, norm=False) for word in wlist]
    return np.mean(np.stack(rows, axis=0), axis=0)

In [8]:
from tqdm.auto import tqdm

# Mean vectors are stored under the keys "<c1>*" (mean over the words that
# start with c1) and "*<c2>" (mean over the words that end with c2).
# Characters that occur in only one word are skipped, so the buffer is
# allocated at its maximum possible size and trimmed after the loops.
mu_list = []
n_mu = len(vocab_c1) + len(vocab_c2)
mu_vecs = np.zeros((n_mu, kv.vector_size))

# First pass: \mu_c1 = mean(f(c1, *)); second pass: \mu_c2 = mean(f(*, c2)).
for vocab_x, key_template in ((vocab_c1, "{}*"), (vocab_c2, "*{}")):
    for char_x, words_x in tqdm(vocab_x.items()):
        if len(words_x) != 1:
            row = len(mu_list)
            mu_vecs[row, :] = compute_mean_vec(words_x)
            mu_list.append(key_template.format(char_x))

# Drop the unused rows left by the skipped single-word characters.
mu_vecs = mu_vecs[:len(mu_list)]

  0%|          | 0/5340 [00:00<?, ?it/s]

  0%|          | 0/5266 [00:00<?, ?it/s]

In [9]:
# The number of mean-vector keys should equal the number of rows in mu_vecs.
len(mu_list), mu_vecs.shape

(7950, (7950, 200))

In [10]:
# Hold the mean vectors in their own KeyedVectors so they can be looked up by
# their "<c1>*" / "*<c2>" keys.
mu_kv = KeyedVectors(vector_size=mu_vecs.shape[1])
mu_kv.add_vectors(mu_list, mu_vecs)

In [11]:
# Each word contributes at most three rows (d2, d1, dw), so allocate for the
# maximum and trim to the rows actually written.
delta_list = []
n_delta = len(biwords) * 3
delta_vecs = np.zeros((n_delta, kv.vector_size))

for wx in tqdm(biwords):
    c1x, c2x = wx
    wvecx = kv.get_vector(wx, norm=False)

    # (key of the mean vector, label of the delta taken against that mean).
    # The order matters: the d2 row is written before the d1 row.
    parts = (
        (f"{c1x}*", f"d2({c2x}|{wx})"),
        (f"*{c2x}", f"d1({c1x}|{wx})"),
    )
    mus = []
    for mu_key, delta_key in parts:
        if mu_key in mu_kv:
            mu_vec = mu_kv.get_vector(mu_key, norm=False)
            delta_vecs[len(delta_list), :] = wvecx - mu_vec
            delta_list.append(delta_key)
        else:
            # No mean vector for this character: the word vector stands in
            # for it and no delta row is written.
            mu_vec = wvecx
        mus.append(mu_vec)

    # dw = mu_c1 + mu_c2 - f(c1, c2); always written.
    delta_vecs[len(delta_list), :] = mus[0] + mus[1] - wvecx
    delta_list.append(f"dw({wx})")

delta_vecs = delta_vecs[:len(delta_list), :]

  0%|          | 0/121562 [00:00<?, ?it/s]

In [12]:
# The number of delta keys should equal the number of rows in delta_vecs.
# (Comparing len(delta_vecs) with its own shape would always agree.)
len(delta_list), delta_vecs.shape

(362030, (362030, 200))

In [13]:
## replace=True is important: some keys containing '*' already exist in the
## embedding, and their vectors must be replaced by the mean vectors computed here.
kv.add_vectors(mu_list, mu_vecs, replace=True)
# Add the delta vectors under their "d1(...)", "d2(...)" and "dw(...)" keys.
# NOTE: both calls extend kv in place.
kv.add_vectors(delta_list, delta_vecs)

In [14]:
# Sanity check: indices of delta rows that are entirely zero (expected: none).
# Every component is tested, so a non-zero row whose entries merely sum to
# zero is not reported.
np.flatnonzero(~delta_vecs.any(axis=1))

array([], dtype=int64)

In [15]:
# Shape of the embedding matrix after the mean and delta vectors were added.
kv.vectors.shape

(869958, 200)

In [16]:
## Test equality
import random

# On a fixed random sample, check that the stored delta vectors add back up to
# the original word vector: d1 + d2 + dw == f(c1, c2).
random.seed(12345)
# Guard the sample size so the check also runs on vocabularies under 1000 words.
sample_words = random.sample(biwords, min(1000, len(biwords)))

mismatches = []
for sample_x in sample_words:
    tgt_word = sample_x
    wv_x = kv.get_vector(tgt_word, norm=False)
    tgt_c1, tgt_c2 = list(tgt_word)
    delta_c1 = f"d1({tgt_c1}|{tgt_word})"
    delta_c2 = f"d2({tgt_c2}|{tgt_word})"
    delta_w = f"dw({tgt_word})"
    # d1/d2 are absent when no mean vector existed for that character; they
    # then contribute nothing to the reconstruction.
    delta_c1v = kv.get_vector(delta_c1, norm=False) if delta_c1 in kv else 0
    delta_c2v = kv.get_vector(delta_c2, norm=False) if delta_c2 in kv else 0
    recon_v = delta_c1v + delta_c2v + kv.get_vector(delta_w, norm=False)
    # Explicit comparison instead of assert inside a bare `except:`, which
    # would also hide unrelated errors.
    if not np.allclose(recon_v, wv_x, atol=1e-5):
        print(sample_x, ((recon_v-wv_x)**2).sum())
        mismatches.append(sample_x)

# Fail here rather than letting later cells run on vectors that do not
# reconstruct.
assert not mismatches, (
    f"{len(mismatches)} of {len(sample_words)} sampled words failed reconstruction"
)
        

In [17]:
# NOTE(review): allocate_vecattrs() is called right before saving, presumably so
# that per-key attribute storage matches the enlarged vocabulary; confirm
# against the gensim KeyedVectors documentation.
kv.allocate_vecattrs()
# Write the embedding, including the added mean and delta vectors, in binary
# word2vec format.
kv.save_word2vec_format("../data/delta_tenc_d200_biwords.bin", binary=True)

## Output Hashes

```
../data/delta_tenc_d200_biwords.bin 04e3f4
```

In [18]:
# Print the 6-character SHA-1 prefix of each output file so it can be compared
# with the value listed under "Output Hashes".
paths = ["../data/delta_tenc_d200_biwords.bin"]
for path_x in paths:
    h = sha1()
    # Feed the file to the hash in 1 MiB chunks so it is never held in memory
    # all at once; the digest is the same as hashing the bytes in one call.
    with Path(path_x).open("rb") as fin:
        for chunk in iter(lambda: fin.read(1 << 20), b""):
            h.update(chunk)
    print(path_x, h.hexdigest()[:6])

../data/delta_tenc_d200_biwords.bin 04e3f4
