In [1]:
from sklearn.manifold import Isomap
from sklearn.datasets import make_s_curve
from gensim.models import KeyedVectors
import gzip
import tarfile
from tqdm.auto import tqdm

## Extract the first 500K words from the tencent-d200-2M embeddings

In [2]:
# Stream the embedding text file directly out of the .tar.gz archive and
# collect the leading rows into two parallel lists: `vocabs` (the first token
# of each row) and `emb_vecs` (the remaining tokens parsed as floats).
#
# The gzip stream, the tar archive, the member file and the progress bar are
# all opened with `with`, so they are released even if parsing a row raises.
archive_path = "../data/tencent-ailab-embedding-zh-d200-v0.2.0-s.tar.gz"
member_name = 'tencent-ailab-embedding-zh-d200-v0.2.0-s/tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'

with gzip.open(archive_path, "rb") as gzf:
    with tarfile.open(fileobj=gzf) as tarf:
        tarinfo = tarf.getmember(member_name)
        with tarf.extractfile(tarinfo) as embf:
            # The first line holds two integers, read here as the vocabulary
            # size and the vector dimension.
            nvocab, ndim = [int(x) for x in
                            embf.readline().decode().strip().split()]
            emb_vecs = []
            vocabs = []

            with tqdm(total=500_000) as pbar:
                for raw_ln in embf:
                    ln = raw_ln.decode().strip()
                    # An empty (or whitespace-only) line ends the read, just
                    # like reaching the end of the file.
                    if not ln:
                        break
                    pbar.update(1)

                    # Row layout: word, then whitespace-separated floats.
                    toks = ln.split()
                    vocabs.append(toks[0])
                    emb_vecs.append([float(x) for x in toks[1:]])

                    # NOTE(review): `>` is evaluated after the append, so the
                    # loop stops with 500_001 rows, one more than the "500K"
                    # in the heading. Left unchanged on purpose: it decides
                    # what ends up in the saved file, so changing it would
                    # change that file's hash.
                    if len(emb_vecs) > 500_000:
                        break

  0%|          | 0/500000 [00:00<?, ?it/s]

In [3]:
# Build the KeyedVectors container from the parsed words and vectors.
# The vector size is taken from `ndim`, the dimension read from the header of
# the embedding file, instead of repeating the literal 200, so this cell stays
# correct if a different-dimension release of the embeddings is loaded.
kv = KeyedVectors(ndim)
kv.add_vectors(vocabs, emb_vecs)

In [4]:
# Write the collected vectors to disk in binary word2vec format.
kv.save_word2vec_format(
    "../data/tencent_d200_500k.bin",
    binary=True,
)

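## Output Hash

Expected output path and the first six hex digits of the file's SHA-1 digest, as printed by the cell below:

```
../data/tencent_d200_500k.bin	a20a8c
```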

In [5]:
from hashlib import sha1
from pathlib import Path

# Print the output path and the first six hex digits of the file's SHA-1
# digest, tab-separated, so the result can be compared with the recorded value.
bin_path = "../data/tencent_d200_500k.bin"
h = sha1()
with Path(bin_path).open("rb") as fh:
    # Feed the file to the hash in 1 MiB chunks so the whole file never has
    # to be held in memory at once; the digest is the same as hashing it in
    # a single call.
    for chunk in iter(lambda: fh.read(1 << 20), b""):
        h.update(chunk)
print(bin_path, h.hexdigest()[:6], sep="\t")

../data/tencent_d200_500k.bin	a20a8c
