# 特徴量4 Word embedding
GloVeのpre-trained word vectors(`glove-wiki-gigaword-100`)を用い、文書ごとに単語ベクトルの平均を特徴量として算出する。

In [4]:
import csv
import numpy as np
import gensim.downloader as api

In [5]:
def doc_to_wv(doc, glove_wvs):
    """Convert one document into a list of word vectors.

    The document is tokenised on whitespace and every token is looked up
    (lowercased) in ``glove_wvs``.

    Args:
        doc: Document text as a single string.
        glove_wvs: Mapping-like object indexed by word; it must contain the
            key ``'unk'``, which is used as the fallback vector.

    Returns:
        A list with one entry per whitespace-separated token of ``doc``
        (an empty list for an empty or whitespace-only document).

    Raises:
        KeyError: If ``'unk'`` itself is missing from ``glove_wvs``.
    """
    wv = []
    for word in doc.split():
        try:
            # The GloVe vocabulary consists of lowercase words only.
            vector = glove_wvs[word.lower()]
        except KeyError:
            # Treat out-of-vocabulary words as 'unk'. Only KeyError is
            # caught so that interrupts and genuine lookup bugs propagate.
            # Tip: logging `word` here reveals which tokens are unknown and
            # hints at how the corpus was pre-processed.
            vector = glove_wvs['unk']
        wv.append(vector)
    return wv
    
def docs_to_wvs(docs, glove_wvs):
    """Convert every document in ``docs`` into its list of word vectors.

    Args:
        docs: Sequence of document strings.
        glove_wvs: Word-vector lookup passed through to ``doc_to_wv``.

    Returns:
        A list, in the same order as ``docs``, holding the result of
        ``doc_to_wv`` for each document.
    """
    return [doc_to_wv(text, glove_wvs) for text in docs]

In [6]:
def get_mean_wvs(wvs):
    """Average the word vectors of each document.

    Args:
        wvs: Sequence whose items are the per-document lists of word
            vectors.

    Returns:
        A list with one entry per document: the mean over that document's
        word vectors, taken along axis 0.
    """
    return [np.array(doc_vectors).mean(axis=0) for doc_vectors in wvs]

---

In [7]:
# Load the pre-trained GloVe word vectors through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use and reuses a
# local cache afterwards — confirm before running offline.
glove_wvs = api.load("glove-wiki-gigaword-100")

In [8]:
# Read the corpus: one document per line.
with open('../data/data.txt') as f:
    text = f.read()

docs = text.split('\n')
# A file that ends with a newline would otherwise produce a spurious empty
# last document, whose mean vector is NaN and cannot be written as a CSV row.
if docs and docs[-1] == '':
    docs.pop()

In [9]:
# Look up a word vector for every token of every document.
# Shape below is the author's note (docs, words per doc, vector dim) — not checked here.
wvs = docs_to_wvs(docs, glove_wvs) # (500, 2000~3000, 100)

In [10]:
# Collapse each document to a single vector by averaging its word vectors.
# Shape below is the author's note (docs, vector dim) — not checked here.
mean_wvs = get_mean_wvs(wvs) # (500,100)

In [11]:
# Save the per-document mean vectors, one CSV row per document.
# newline='' is required by the csv module: without it, platforms that
# translate '\n' on write end up with an extra blank line after every row.
with open('../data/word_emb_mean.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(mean_wvs)

参考
- [How to use pre-trained word vectors from Facebook’s fastText](https://blog.manash.me/how-to-use-pre-trained-word-vectors-from-facebooks-fasttext-a71e6d55f27)  
- [Word2Vec and FastText Word Embedding with Gensim](https://towardsdatascience.com/word-embedding-with-word2vec-and-fasttext-a209c1d3e12c)