# あなたの文章に合った「いらすとや」画像をレコメンド♪（アルゴリズム実装編）

解説記事: https://qiita.com/sonoisa/items/775ac4c7871ced6ed4c3

In [0]:
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab
!apt-get -q -y install swig 
!pip install mecab-python3 pymagnitude

In [0]:
!wget "https://www.floydhub.com/api/v1/resources/SnBYkUGB9PdsbQMWbBb9jn?content=true&download=true&rename=sonobe-datasets-fasttext_model-4" -O sonobe-datasets-fasttext_model-4.tar

In [0]:
!tar xvf sonobe-datasets-fasttext_model-4.tar

In [0]:
!wget "https://www.floydhub.com/api/v1/resources/n52RTWdCosGvud4gitpE5b/irasuto_items_part.json?content=true&rename=irasuto_items_partjson" -O irasuto_items_part.json

In [0]:
# Open the word-vector model (a Magnitude file; judging by its name, fastText
# vectors trained on Japanese Wikipedia — TODO confirm against the dataset).
# NOTE(review): the keyword flags are passed to pymagnitude unchanged; check
# its documentation for their exact semantics.
from pymagnitude import *
fasttext_model = Magnitude("jawiki.ipadic.fasttext.ws5-neg5-epoch5.magnitude", normalized=False, ngram_oov=True, case_insensitive=True)

In [0]:
similarities = fasttext_model.most_similar(positive=['王子', '女'], negative=['男'])
similarities

In [0]:
import numpy as np
def cos_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Each input is divided by its L2 norm (taken along axis 0) and the
    element-wise products of the two scaled vectors are summed.
    """
    unit1 = v1 / np.linalg.norm(v1, ord=2, axis=0)
    unit2 = v2 / np.linalg.norm(v2, ord=2, axis=0)
    return (unit1 * unit2).sum()

In [0]:
# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja から引用・一部改変
from __future__ import unicode_literals
import re
import unicodedata

def unicode_normalize(cls, s):
    """Apply NFKC normalization only to runs of characters listed in *cls*.

    Args:
        cls: Characters and ranges in regex character-class syntax (the text
            that goes between ``[`` and ``]``).
        s: Text to normalize.

    Returns:
        The text with every maximal run of *cls* characters NFKC-normalized,
        everything else left as-is, and each fullwidth hyphen-minus replaced
        by an ASCII ``-``.
    """
    run_pattern = re.compile('([{}]+)'.format(cls))

    def normalize_run(match):
        return unicodedata.normalize('NFKC', match.group(0))

    normalized = run_pattern.sub(normalize_run, s)
    return re.sub('－', '-', normalized)

def remove_extra_spaces(s):
    """Collapse runs of spaces and drop spaces that touch Japanese text.

    Runs of ASCII and ideographic spaces become a single ASCII space. A single
    space is then removed when it sits between two characters of the CJK /
    kana / fullwidth blocks, or between such a character and a Basic Latin
    character (in either order). A space between two Basic Latin characters
    is kept.
    """
    cjk = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                   '\u3040-\u309F',  # HIRAGANA
                   '\u30A0-\u30FF',  # KATAKANA
                   '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                   '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                   ))
    latin = '\u0000-\u007F'

    def strip_between(left, right, text):
        gap = re.compile('([{}]) ([{}])'.format(left, right))
        # One pass can miss overlapping neighbours ("あ い う"), so repeat
        # until a pass replaces nothing.
        while True:
            text, replaced = gap.subn(r'\1\2', text)
            if not replaced:
                return text

    result = re.sub('[ 　]+', ' ', s)
    for left, right in ((cjk, cjk), (cjk, latin), (latin, cjk)):
        result = strip_between(left, right, result)
    return result

def normalize_neologd(s):
    """Normalize Japanese text with the NEologd-style recipe used by this file.

    Steps, in order: strip surrounding whitespace; NFKC-normalize fullwidth
    alphanumerics and halfwidth kana; unify hyphen, long-vowel-mark and tilde
    look-alikes; map ASCII / halfwidth punctuation to fullwidth forms; remove
    extra spaces; NFKC-normalize the listed fullwidth punctuation; turn curly
    quotes into ASCII quotes; upper-case the result.
    """
    text = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s.strip())

    # Each run of look-alike characters collapses to one canonical character.
    look_alikes = (
        ('[˗֊‐‑‒–⁃⁻₋−]+', '-'),  # hyphens
        ('[﹣－ｰ—―─━ー]+', 'ー'),  # choonpus (long vowel marks)
        ('[~∼∾〜〰～]+', '〜'),  # tildes (modified by Isao Sonobe)
    )
    for pattern, canonical in look_alikes:
        text = re.sub(pattern, canonical, text)

    narrow = '!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣'
    wide = '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'
    text = text.translate(dict(zip(map(ord, narrow), map(ord, wide))))

    text = remove_extra_spaces(text)
    # This class leaves out ＝, ・, 「 and 」 so that they stay fullwidth.
    text = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', text)
    for pattern, ascii_quote in (('[’]', '\''), ('[”]', '"')):
        text = re.sub(pattern, ascii_quote, text)
    return text.upper()

In [0]:
def normalize_text(text):
    """Normalize *text* before tokenization by delegating to normalize_neologd."""
    normalized = normalize_neologd(text)
    return normalized

In [0]:
import MeCab
# Tagger instance that tokenize() reads as a module-level global; built with
# no arguments. An explicit dictionary could be chosen instead, e.g.
# MeCab.Tagger("-d /usr/local/lib/mecab/dic/ipadic").
mecab = MeCab.Tagger()

In [0]:
class Morph(object):
    """One morpheme: its surface form, part-of-speech string and base form."""

    def __init__(self, surface, pos, base):
        self.surface, self.pos, self.base = surface, pos, base

    def __repr__(self):
        # Rendered as the string form of a dict keyed surface / pos / base.
        fields = dict(surface=self.surface, pos=self.pos, base=self.base)
        return str(fields)

def tokenize(sentence):
    """Split *sentence* into a list of Morph objects using the global tagger.

    The sentence is normalized with normalize_text first. Output lines
    without a tab separator or with an empty surface are skipped. ``pos`` is
    the first four feature fields joined by commas; ``base`` is the seventh
    feature field, falling back to the surface when that field is missing or
    ``"*"``.
    """
    normalized = normalize_text(sentence)
    # NOTE(review): the result of this empty parse is discarded; presumably a
    # workaround for a MeCab binding quirk — confirm before removing.
    mecab.parse("")

    morphs = []
    for row in mecab.parse(normalized).split("\n"):
        columns = row.split("\t")
        if len(columns) < 2:
            continue
        surface = columns[0]
        if not surface:
            continue
        feature = columns[1].split(",")
        if len(feature) >= 7 and feature[6] != "*":
            base = feature[6]
        else:
            base = surface
        morphs.append(Morph(surface=surface, pos=",".join(feature[:4]), base=base))
    return morphs

In [0]:
tokenize("MeCabを用いて正規化済み文字列を形態素解析します！！")

In [0]:
import json

# Load the illustration metadata. The encoding is given explicitly so that
# the Japanese text decodes the same way whatever the platform's default
# locale encoding is.
with open('irasuto_items_part.json', 'r', encoding='utf-8') as items_file:
    items = json.load(items_file)

In [0]:
# Part-of-speech strings whose tokens are treated as stop words. Each entry
# has the same shape as Morph.pos: four feature fields joined by commas.
# (The duplicated "記号,一般,*,*" entry of the original literal was dropped;
# the resulting set is unchanged.)
stop_pos = {
    "助詞,格助詞,一般,*",
    "助詞,格助詞,引用,*",
    "助詞,格助詞,連語,*",
    "助詞,係助詞,*,*",
    "助詞,終助詞,*,*",
    "助詞,接続助詞,*,*",
    "助詞,特殊,*,*",
    "助詞,副詞化,*,*",
    "助詞,副助詞,*,*",
    "助詞,副助詞／並立助詞／終助詞,*,*",
    "助詞,並立助詞,*,*",
    "助詞,連体化,*,*",
    "助動詞,*,*,*",
    "記号,句点,*,*",
    "記号,読点,*,*",
    "記号,空白,*,*",
    "記号,一般,*,*",
    "記号,アルファベット,*,*",
    "記号,括弧開,*,*",
    "記号,括弧閉,*,*",
    "動詞,接尾,*,*",
    "動詞,非自立,*,*",
    "名詞,非自立,一般,*",
    "名詞,非自立,形容動詞語幹,*",
    "名詞,非自立,助動詞語幹,*",
    "名詞,非自立,副詞可能,*",
    "名詞,接尾,助動詞語幹,*",
    "名詞,接尾,人名,*",
    "接頭詞,名詞接続,*,*",
}

# Count how often each base form occurs across all item descriptions.
# The "pos" and "stop" values of an entry come from the first token seen
# with that base form.
#
# The stop flag is computed inline rather than bound to a global named
# `is_stop`: that name belongs to the is_stop() function, and rebinding it to
# a bool here would break get_sentence_vector() whenever this cell is re-run.
vocab = {}
for item in items:
    for token in tokenize(item["desc"]):
        entry = vocab.setdefault(
            token.base,
            {"count": 0, "pos": token.pos, "stop": token.pos in stop_pos},
        )
        entry["count"] += 1

# (count, base form, pos, stop flag) for every entry not flagged as a stop POS.
vocab_list = [
    (v["count"], k, v["pos"], v["stop"])
    for k, v in vocab.items()
    if not v["stop"]
]

In [0]:
# Sort in descending order; tuples compare by their first field (the count)
# and fall back to the remaining fields on ties.
vocab_list = sorted(vocab_list, reverse=True)
# Show the first ten entries of the sorted list.
vocab_list[:10]

In [0]:
# Take the second tuple field (index 1) of the first four vocab_list entries
# as the stop-word list.
stop_word = [w[1] for w in vocab_list[:4]]
stop_word

In [0]:
import re
# Patterns that is_stop() matches against a token's base form; this one
# matches strings made up only of '!' and '?' characters.
stop_word_regex = [ re.compile("^[!?]+$")]

In [0]:
def get_sentence_vector(sentence):
    """Return the sum of the word vectors of the non-stop tokens of *sentence*.

    Every token that is_stop() does not reject is looked up in the model by
    its surface form, and the vectors are added as returned (no per-token
    normalization). Returns None when no token survives the filter.
    """
    word_vectors = [
        fasttext_model.query(token.surface)
        for token in tokenize(sentence)
        if not is_stop(token)
    ]
    if not word_vectors:
        return None

    total = word_vectors[0]
    for vec in word_vectors[1:]:
        # Rebind instead of using += so the first vector is never mutated.
        total = total + vec
    return total

def is_stop(token):
    """Return True when *token* should be left out of the sentence vector.

    A token is a stop token when its part of speech is in stop_pos, its base
    form is in stop_word, or its base form matches any stop_word_regex
    pattern.
    """
    if token.pos in stop_pos:
        return True
    if token.base in stop_word:
        return True
    return any(rx.match(token.base) is not None for rx in stop_word_regex)

In [0]:
get_sentence_vector("与えられた文から文の分散表現を計算します。")

In [0]:
from tqdm import tqdm
# Compute the description vector of every item once and store it under
# item["vec"] (whatever get_sentence_vector returns, including None).
# tqdm wraps the iteration of `items`.
for item in tqdm(items):
    desc = item["desc"]
    desc_vec = get_sentence_vector(desc)
    item["vec"] = desc_vec

In [0]:
from IPython.display import display, HTML, clear_output
from html import escape

def search_irasuto(sentence, top_n=3):
    """Display the items whose descriptions are most similar to *sentence*.

    The sentence vector is compared by cosine similarity with the vector
    stored under ``item["vec"]`` of every entry in the global ``items`` list.
    Results are rendered as HTML in descending order of similarity; items
    with no image are skipped and do not count towards *top_n*.

    Args:
        sentence: Query text.
        top_n: Maximum number of results to display.

    Returns:
        None. If no vector can be computed for the sentence, a message is
        printed and nothing is displayed.
    """
    sentence_vector = get_sentence_vector(sentence)
    if sentence_vector is None:
        print("検索できない文章です。もう少し文章を長くしてみてください。")
        return

    # Items without a vector get -1.0, the lowest cosine similarity, so that
    # they sort to the end.
    sims = [
        -1.0 if item["vec"] is None else cos_sim(sentence_vector, item["vec"])
        for item in items
    ]

    count = 0
    for index in np.argsort(sims)[::-1]:
        if count >= top_n:
            break
        item = items[index]
        imgs = item["imgs"]
        if len(imgs) == 0:
            continue
        # escape() also escapes quote characters, so the URLs cannot break out
        # of the single-quoted attributes. The wrapper is closed with </div>
        # (the original emitted a second opening <div> instead).
        markup = (
            "<div><a href='{page}' target='_blank' rel='noopener noreferrer'>"
            "<img src='{img}' width='100'>{sim}: {desc}</a></div>"
        ).format(
            page=escape(item["page"]),
            img=escape(imgs[0]),
            sim=str(sims[index]),
            desc=escape(item["desc"]),
        )
        display(HTML(markup))
        count += 1

In [0]:
search_irasuto(sentence="暴走したAI", top_n=5)

In [0]:
search_irasuto(sentence="いらすとやさんに惜しみない拍手を", top_n=1)

In [0]:
search_irasuto(sentence="つづく", top_n=1)