In [4]:
import numpy as np
import gensim

from gensim.models import Word2Vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

from gensim.models.wrappers import FastText


from tqdm import tqdm
from sklearn.decomposition import PCA

# Reduction of dimension for different embeddings

## GGnews format 
This works for the PubMed embeddings and for the Google News (ggnews) embeddings.
The ggnews vectors were downloaded from [here](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit). It also works for the gensim fastwiki binary file that I obtained [here](https://fasttext.cc/docs/en/pretrained-vectors.html).

In [2]:
# Path to the pretrained embeddings stored in word2vec binary format.
path2ggnews = 'GoogleNews-vectors-negative300.bin'
# Load the vectors as a gensim KeyedVectors object (binary word2vec layout).
model = KeyedVectors.load_word2vec_format(
    fname=path2ggnews,
    binary=True,
)

In [3]:
# Snapshot the model's vocabulary keys into a list so that every word gets
# a fixed position (used later as the row index of the weight matrix).
vocab = list(model.vocab)
print(len(vocab))

3000000


In [4]:
# Stack every word vector into one (n_words, dim) matrix; row i holds the
# vector of vocab[i]. The dimensionality is read from the model itself rather
# than by looking up the word 'the', which raises KeyError on a vocabulary
# that does not contain that exact token.
# NOTE(review): np.zeros defaults to float64; if memory is tight, consider
# dtype=np.float32 (changes the PCA input precision, so verify results).
weight_matrix = np.zeros((len(vocab), model.vector_size))
# tqdm wraps the list (not the enumerate iterator) so it knows the total and
# can display a real progress bar instead of a bare iteration counter.
for i, w in enumerate(tqdm(vocab)):
    weight_matrix[i, :] = model[w]
print(weight_matrix.shape)

3000000it [00:27, 109855.73it/s]

(3000000, 300)





#### Reduce the dimension and write the reduced model to a txt file

In [10]:
# Target dimensionality of the reduced embeddings.
reduced_dim = 100
# Project the word vectors onto their first `reduced_dim` principal components.
# random_state is pinned because scikit-learn may pick a randomized SVD solver
# for a matrix of this size; without a seed, re-running the cell could yield
# slightly different embeddings.
reduced_matrix = PCA(n_components=reduced_dim, random_state=0).fit_transform(weight_matrix)
print(reduced_matrix.shape)

(3000000, 100)


In [11]:
# Assemble the word2vec text format in memory: a "<n_words> <dim>" header
# line followed by one "<word> <v1> ... <vd>" line per vocabulary entry.
reduced_txt = ['{} {}\n'.format(len(vocab), reduced_dim)]
for row_idx, word in tqdm(enumerate(vocab)):
    components = ' '.join(map(str, reduced_matrix[row_idx]))
    reduced_txt.append(word + ' ' + components + '\n')

3000000it [08:34, 5827.23it/s]


In [12]:
# Persist the header and vector lines as a word2vec text file.
# The encoding is set explicitly: gensim decodes word2vec files as UTF-8 by
# default when loading, so relying on the platform's default encoding could
# fail or corrupt the file if the vocabulary contains non-ASCII tokens.
with open('model_reduced.txt', 'w', encoding='utf-8') as f:
    f.writelines(reduced_txt)

### Build word2vec format

In [13]:
# Read the reduced embeddings back from the text file written above so they
# are held in a gensim KeyedVectors object.
word_vectors = KeyedVectors.load_word2vec_format(
    fname='model_reduced.txt',
    binary=False,
)

In [14]:
# Rebind `model` to the reduced vectors (kept so later cells that use `model`
# see the reduced embeddings).
model = word_vectors
# Output file name encodes the reduced dimensionality, e.g. 'ggnews_100.bin'.
emb_name = 'ggnews_{dim}.bin'.format(dim=reduced_dim)
# `word_vectors` is already a KeyedVectors object, so save it directly; going
# through the `.wv` attribute on a KeyedVectors is a deprecated self-alias.
model.save_word2vec_format(emb_name, binary=True)

word_vectors