<a href="https://colab.research.google.com/github/project-ccap/project-ccap.github.io/blob/master/notebooks/2021_0907word2vec_cos_and_euc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
import os
import platform

# word2vec のため gensim を使う
from gensim.models import KeyedVectors
from gensim.models import Word2Vec


class ccap_w2v():
    
    def __init__(self):
        # local Mac で実行しているか, それとも colab 上で実行しているかを判定
        (isMac, isColab) = (True, False) if platform.system() == 'Darwin' else (False, True)
        is2017, is2021 = True, False

        if isColab:
            # 形態素分析ライブラリーMeCab と 辞書(mecab-ipadic-NEologd)のインストール 
            # reference: https://qiita.com/jun40vn/items/78e33e29dce3d50c2df1
            !apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab
            !git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
            !echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n
            !pip install mecab-python3
    
            # シンボリックリンクによるエラー回避
            !ln -s /etc/mecabrc /usr/local/etc/mecabrc    

            if is2017:
                # word2vec の訓練済モデルを入手
                !wget http://www.cis.twcu.ac.jp/~asakawa/2017jpa/2017Jul_jawiki-wakati_neologd_hid200_win20_neg20_cbow.bin.gz
                #!wget http://www.cis.twcu.ac.jp/~asakawa/2017jpa/2017Jul_jawiki-wakati_neologd_hid200_win20_neg20_sgns.bin.gz
                #!wget http://www.cis.twcu.ac.jp/~asakawa/2017jpa/2017Jul_jawiki-wakati_neologd_hid300_win20_neg20_sgns.bin.gz'
                #!wget http://www.cis.twcu.ac.jp/~asakawa/2017jpa/2017Jul_jawiki-wakati_neologd_hid200_win20_neg20_cbow.bin.g
            else:    
                #訓練済 word2vec ファイルの取得
                #!wget --no-check-certificate --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1B9HGhLZOja4Xku5c_d-kMhCXn1LBZgDb' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1B9HGhLZOja4Xku5c_d-kMhCXn1LBZgDb" -O 2021_05jawiki_hid128_win10_neg10_cbow.bin.gz && rm -rf /tmp/cookies.txt
                #!wget --no-check-certificate --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1OWmFOVRC6amCxsomcRwdA6ILAA5s4y4M' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1OWmFOVRC6amCxsomcRwdA6ILAA5s4y4M" -O 2021_05jawiki_hid128_win10_neg10_sgns.bin.gz && rm -rf /tmp/cookies.txt
                !wget --no-check-certificate --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1JTkU5SUBU2GkURCYeHkAWYs_Zlbqob0s' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1JTkU5SUBU2GkURCYeHkAWYs_Zlbqob0s" -O 2021_05jawiki_hid200_win20_neg20_cbow.bin.gz && rm -rf /tmp/cookies.txt
                #!wget --no-check-certificate --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1VPL2Mr9JgWHik9HjRmcADoxXIdrQ3ds7' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1VPL2Mr9JgWHik9HjRmcADoxXIdrQ3ds7" -O 2021_05jawiki_hid200_win20_neg20_sgns.bin.gz && rm -rf /tmp/cookies.txt

        import MeCab

        # word2vec データの読み込み, ファイルの所在に応じて変更してください
        if is2017:
            w2v_base = '/Users/asakawa/study/2016wikipedia/' if isMac else '.'
            w2v_file = '2017Jul_jawiki-wakati_neologd_hid200_win20_neg20_cbow.bin.gz'
            w2v_file = os.path.join(w2v_base, w2v_file)
        else:
            w2v_base = '/Users/asakawa/study/2019attardi_wikiextractor.git/wiki_texts/AA' if isMac else '.'
            w2v_file = '2021_05jawiki_hid128_win10_neg10_sgns.bin'

        if isColab:
            neologd_path = "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
        else:
            neologd_path = "-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd"

        w2v_base = '.' if isColab else w2v_base
        w2v_file = os.path.join(w2v_base, w2v_file)
        w2v = KeyedVectors.load_word2vec_format(w2v_file, 
                                                encoding='utf-8', 
                                                unicode_errors='replace',
                                                binary=True)
        self.w2v = w2v
        self.tagger = MeCab.Tagger('-Oyomi ' + neologd_path)
    


In [None]:
# Build the helper once; its constructor loads the word2vec model and the MeCab tagger.
_w2v = ccap_w2v()
# Expose the loaded KeyedVectors under a short module-level name for the cells below.
w2v = _w2v.w2v

In [None]:
# -*- coding: utf-8 -*-

import numpy as np
# source: https://github.com/paraschopra/one-network-many-uses
from scipy import spatial


# Number of leading vocabulary entries to cache.
# Presumably these are the most frequent words, assuming the model file is
# ordered by descending frequency -- TODO confirm.
frequency_threshold = 20000

# gensim >= 4.0 removed ``KeyedVectors.vocab`` (accessing it raises
# AttributeError) and exposes the ordered vocabulary as ``index_to_key``.
# Fall back to ``vocab`` so the cell keeps working on gensim 3.x.
_vocab_words = getattr(w2v, 'index_to_key', None)
if _vocab_words is None:
    _vocab_words = list(w2v.vocab.keys())

# Parallel lists: all_words[i] is the word whose vector is all_word_embeddings[i].
all_words = list(_vocab_words[:frequency_threshold])
all_word_embeddings = [w2v[word] for word in all_words]


def return_cosine_sorted(target_word_embedding):
    """Rank the cached vocabulary by cosine similarity to a vector.

    The similarity between ``target_word_embedding`` and every vector in the
    module-level ``all_word_embeddings`` is computed as
    ``1 - scipy.spatial.distance.cosine``.

    Returns:
        A 2-column ``np.ndarray`` with one row per cached word, ordered from
        most to least similar. Column 0 is the word and column 1 the
        similarity; because both columns share one array, the similarities
        are stored as strings.
    """
    similarities = np.array([
        1 - spatial.distance.cosine(target_word_embedding, embedding)
        for embedding in all_word_embeddings
    ])

    # argsort is ascending, so reverse it to put the highest similarity first.
    ranking = np.argsort(similarities)[::-1]
    ranked_words = np.array(all_words)[ranking]
    ranked_scores = similarities[ranking]
    return np.vstack((ranked_words, ranked_scores)).T


def return_euclidean_sorted(target_word_embedding):
    """Rank the cached vocabulary by squared Euclidean distance to a vector.

    The distance between ``target_word_embedding`` and every vector in the
    module-level ``all_word_embeddings`` is computed with
    ``scipy.spatial.distance.sqeuclidean``.

    Returns:
        A 2-column ``np.ndarray`` with one row per cached word, ordered from
        nearest to farthest. Column 0 is the word and column 1 the squared
        distance; because both columns share one array, the distances are
        stored as strings.
    """
    distances = np.array([
        spatial.distance.sqeuclidean(target_word_embedding, embedding)
        for embedding in all_word_embeddings
    ])

    # Smaller distance means closer, so the ascending argsort order is kept.
    nearest_first = np.argsort(distances)
    ranked_words = np.array(all_words)[nearest_first]
    ranked_distances = distances[nearest_first]
    return np.vstack((ranked_words, ranked_distances)).T


def return_similar_words(word, top_n=5, sim='cos'):
    """Return the ``top_n`` nearest cached words to ``word``.

    Used to compare how much the neighbours differ between the two metrics.

    Args:
        word: Query word; it must be one of the cached vocabulary words.
        top_n: Number of neighbours to return.
        sim: ``'cos'`` ranks by cosine similarity; any other value ranks by
            squared Euclidean distance.

    Returns:
        Rows 1..top_n of the ranked 2-column array (word, score). Row 0 is
        skipped because it is expected to be the query word itself.

    Raises:
        KeyError: If ``word`` has no cached embedding. Previously ``None`` was
            passed on to the distance functions, which failed with an
            unrelated error.
    """
    embedding = return_embedding(word)
    if embedding is None:
        raise KeyError(f'{word!r} is not among the cached vocabulary words')

    if sim == 'cos':
        ranked = return_cosine_sorted(embedding)
    else:
        ranked = return_euclidean_sorted(embedding)
    return ranked[1:top_n + 1]


def return_embedding(word):
    """Return the cached embedding of ``word``, or ``None`` if it is not cached.

    The lookup is an exact match against the module-level ``all_words`` list.
    The previous implementation located the index with a substring test
    (``word in s``), so it could return the vector of an earlier entry that
    merely contained ``word``.

    Args:
        word: Vocabulary word to look up.

    Returns:
        The matching entry of ``all_word_embeddings``, or ``None`` when
        ``word`` is not in ``all_words``.
    """
    try:
        position = all_words.index(word)
    except ValueError:
        # Unknown word: keep the original "return None" contract.
        return None
    return all_word_embeddings[position]


In [None]:
# For each probe word, print its five nearest neighbours under the cosine
# metric first and the Euclidean metric second.
for word in ['秋', 'ジャズ']:
    for metric in ('cos', 'euc'):
        neighbours = return_similar_words(word, sim=metric, top_n=5)
        print(f'{word}: {neighbours}')