# elasticsearchを用いた日本語文章検索のサンプルコード

## 環境構築

In [1]:
ES_VERSION = "7.9.0"

In [2]:
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-{ES_VERSION}-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-{ES_VERSION}-linux-x86_64.tar.gz
!chown -R daemon:daemon elasticsearch-{ES_VERSION}
!pip install "elasticsearch<8.0.0"

Collecting elasticsearch<8.0.0
  Downloading elasticsearch-7.17.2-py2.py3-none-any.whl (385 kB)
[?25l[K     |▉                               | 10 kB 19.7 MB/s eta 0:00:01[K     |█▊                              | 20 kB 11.7 MB/s eta 0:00:01[K     |██▌                             | 30 kB 9.4 MB/s eta 0:00:01[K     |███▍                            | 40 kB 8.5 MB/s eta 0:00:01[K     |████▎                           | 51 kB 4.4 MB/s eta 0:00:01[K     |█████                           | 61 kB 5.2 MB/s eta 0:00:01[K     |██████                          | 71 kB 5.7 MB/s eta 0:00:01[K     |██████▉                         | 81 kB 5.8 MB/s eta 0:00:01[K     |███████▋                        | 92 kB 6.5 MB/s eta 0:00:01[K     |████████▌                       | 102 kB 5.3 MB/s eta 0:00:01[K     |█████████▍                      | 112 kB 5.3 MB/s eta 0:00:01[K     |██████████▏                     | 122 kB 5.3 MB/s eta 0:00:01[K     |███████████                     | 133 kB 5.

In [3]:
!mkdir -p elasticsearch-{ES_VERSION}/data
!chown -R daemon:daemon "elasticsearch-{ES_VERSION}/data"
!chmod -R 755 "elasticsearch-{ES_VERSION}/data"

In [4]:
!elasticsearch-{ES_VERSION}/bin/elasticsearch-plugin install analysis-kuromoji

-> Installing analysis-kuromoji
-> Downloading analysis-kuromoji from elastic
-> Installed analysis-kuromoji


In [5]:
!elasticsearch-{ES_VERSION}/bin/elasticsearch-plugin list

analysis-kuromoji


# elasticsearchサーバーを起動する

※起動の完了には25秒ほどかかる

In [24]:
import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen([f"elasticsearch-{ES_VERSION}/bin/elasticsearch"], 
                  stdout=PIPE, stderr=STDOUT,
                  preexec_fn=lambda: os.setuid(1)  # as daemon
                 )

In [7]:
# デバッグ用
# while True:
#     line = es_server.stdout.readline()
#     print(line)
#     if not line and es_server.poll() is not None:
#         break

In [8]:
# サーバー停止
# es_server.kill()

# インデックスを作成して検索を試してみる

## 起動確認

elasticsearchサーバー起動から25秒ほど待ってから実行すること。  
起動が完了していないときは次のエラーが発生する。

"curl: (7) Failed to connect to localhost port 9200: Connection refused"

In [28]:
!curl -X GET "localhost:9200/"

{
  "name" : "2890cd352aee",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "0093OUI9QDa18qYxjo8plw",
  "version" : {
    "number" : "7.9.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "a479a2a7fce0389512d6a9361301708b92dff667",
    "build_date" : "2020-08-11T21:36:48.204330Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.0",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


## インデックスを作成

In [None]:
#!curl -H "Content-Type: application/json" -XDELETE 'localhost:9200/documents'

In [30]:
!curl -H "Content-Type: application/json" -XPUT 'localhost:9200/documents?pretty' -d '{ \
  "settings": { \
    "index": { \
      "number_of_shards": 1, \
      "number_of_replicas": 0 \
    }, \
    "analysis": { \
      "analyzer": { \
        "ja": { \
          "filter": [ \
            "cjk_width", \
            "lowercase" \
          ], \
          "char_filter": [ \
            "html_strip" \
          ], \
          "type": "custom", \
          "tokenizer": "ja_tokenizer" \
        } \
      }, \
      "tokenizer": { \
        "ja_tokenizer": { \
          "type": "kuromoji_tokenizer", \
          "mode": "search" \
        } \
      } \
    } \
  }, \
  "mappings": { \
    "properties": { \
      "title": { \
        "analyzer": "ja", \
        "type": "text" \
      }, \
      "content": { \
        "analyzer": "ja", \
        "type": "text" \
      }, \
      "genre_id": { \
        "type": "integer" \
      }, \
      "created": { \
        "type": "date" \
      } \
    } \
  } \
}'

{
  "acknowledged" : true,
  "shards_acknowledged" : true,
  "index" : "documents"
}


## 検索対象ドキュメントの準備

livedoorニュースを利用する。

In [18]:
!wget -O ldcc-20140209.tar.gz https://www.rondhuit.com/download/ldcc-20140209.tar.gz

--2022-04-11 09:43:29--  https://www.rondhuit.com/download/ldcc-20140209.tar.gz
Resolving www.rondhuit.com (www.rondhuit.com)... 59.106.19.174
Connecting to www.rondhuit.com (www.rondhuit.com)|59.106.19.174|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8855190 (8.4M) [application/x-gzip]
Saving to: ‘ldcc-20140209.tar.gz’


2022-04-11 09:43:35 (1.62 MB/s) - ‘ldcc-20140209.tar.gz’ saved [8855190/8855190]



In [31]:
# 正規化処理
# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja から引用・一部改変
from __future__ import unicode_literals
import re
import unicodedata

def unicode_normalize(cls, s):
    pt = re.compile('([{}]+)'.format(cls))

    def norm(c):
        return unicodedata.normalize('NFKC', c) if pt.match(c) else c

    s = ''.join(norm(x) for x in re.split(pt, s))
    s = re.sub('－', '-', s)
    return s

def remove_extra_spaces(s):
    s = re.sub('[ 　]+', ' ', s)
    blocks = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                      '\u3040-\u309F',  # HIRAGANA
                      '\u30A0-\u30FF',  # KATAKANA
                      '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                      '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                      ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s

def normalize_neologd(s):
    s = s.strip()
    s = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

    def maketrans(f, t):
        return {ord(x): ord(y) for x, y in zip(f, t)}

    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = re.sub('[~∼∾〜〰～]+', '〜', s)  # normalize tildes (modified by Isao Sonobe)
    s = s.translate(
        maketrans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣',
              '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'))

    s = remove_extra_spaces(s)
    s = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
    s = re.sub('[’]', '\'', s)
    s = re.sub('[”]', '"', s)
    return s

In [32]:
import tarfile
import re
from datetime import datetime


target_genres = ["dokujo-tsushin",
                 "it-life-hack",
                 "kaden-channel",
                 "livedoor-homme",
                 "movie-enter",
                 "peachy",
                 "smax",
                 "sports-watch",
                 "topic-news"]

def normalize_text(text):
    text = normalize_neologd(text)
    return text

def read_title_body(file):
    next(file)
    next(file)
    title = next(file).decode("utf-8").strip()
    title = normalize_text(title)
    body = "\n".join([normalize_text(line.decode("utf-8").strip()) for line in file.readlines()])
    return title, body

genre_files_list = [[] for genre in target_genres]

all_data = []

with tarfile.open("ldcc-20140209.tar.gz") as archive_file:
    for archive_item in archive_file:
        for genre_id, genre in enumerate(target_genres):
            if genre in archive_item.name and archive_item.name.endswith(".txt"):
                genre_files_list[genre_id].append(archive_item.name)

    for genre_id, genre_files in enumerate(genre_files_list):
        for name in genre_files:
            file = archive_file.extractfile(name)
            title, body = read_title_body(file)
            title = normalize_text(title)
            body = normalize_text(body)

            if len(title) > 0 and len(body) > 0:
                all_data.append({
                    "title": title,
                    "content": body,
                    "genre_id": genre_id,
                    "created": datetime.now()
                    })

## ドキュメントの登録

In [33]:
import gzip, json
from elasticsearch import Elasticsearch
from tqdm import tqdm


es = Elasticsearch()

for id, datum in tqdm(enumerate(all_data), total=len(all_data)):
    res = es.index(index="documents", id=id, document=datum)

es.indices.refresh(index="documents")

100%|██████████| 7376/7376 [01:35<00:00, 77.48it/s]


{'_shards': {'failed': 0, 'successful': 1, 'total': 1}}

## 試しに検索

In [47]:
from elasticsearch import Elasticsearch
es = Elasticsearch()

query = {
    "match": {
        "content": "人工知能"
    }
}

res = es.search(index="documents", query=query, size=10)
print(f"{res['hits']['total']['value']} hits")
for i, hit in enumerate(res['hits']['hits']):
    source = hit["_source"]
    print(f"{i+1:2}. ({hit['_score']:0.3f})　{source['title']}")
    # print(f"{i+1:2}. ({hit['_score']:0.3f})　{source['title']} {source['content']}")

41 hits
 1. (18.100)　今度はシャープから、人工知能搭載で自ら進んで節電する「プラズマクラスター冷蔵庫」新発売
 2. (15.224)　神を自負した男が、無限の野心の果てに追い求めたもの
 3. (15.100)　【ニュース】本格運用開始の「グーグル+」、中国では利用不可
 4. (15.100)　【話題】押すとしゃべる!テレ東が「ピカチュウリモコン」を7,777名にプレゼント!
 5. (14.861)　流行のミラーレスカメラがニコンからもついに!「Nikon 1 J1」「Nikon 1 V1」が発表
 6. (14.188)　パナソニックの「旅ナビ」に新モデル登場!「CN-MH01L」はガイドブックデータが充実
 7. (13.977)　女性人気の高いミラーレスカメラはオリンパス「PEN E-P3」
 8. (13.764)　人工知能からHCIへグーグルの生みの親が語る、コンピューターと人の関係(1)【テレスコープマガジン】
 9. (13.193)　シャープ、スマホで部屋の様子を確認できるロボット掃除機「COCOROBO」を発表
10. (12.168)　三菱電機から2つのセンサーで自動的に節電可能な頼もしい冷蔵庫「RXシリーズ」が新発売
