In [3]:
import os

import pyterrier as pt
import pandas as pd
from sudachipy import dictionary, tokenizer

In [4]:
# Initialise PyTerrier only if it has not been started yet, so this cell can be re-run safely.
if not pt.started():
    pt.init()

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [85]:
# Toy corpus: a single document with its id (`docno`) and raw text (`text`).
df = pd.DataFrame({"docno": ["d1"], "text": ["検索方法の検討"]})

df

Unnamed: 0,docno,text
0,d1,検索方法の検討


In [86]:
class DocTokenizer():
    """Tokenize document text with SudachiPy for indexing.

    Morphemes are emitted in their dictionary form, and only those whose
    part-of-speech tuple contains one of the tags listed in ``tokenize`` are kept.
    """

    # Created once when the class is defined and shared by every instance.
    tokenizer_obj = dictionary.Dictionary().create()
    # Split mode passed to every tokenize call.
    mode = tokenizer.Tokenizer.SplitMode.C

    def tokenize(self, txt: str) -> list[str]:
        """Return the dictionary forms of the morphemes in ``txt`` that pass the POS filter.

        Args:
            txt: Raw text to analyse.

        Returns:
            Dictionary forms, in order of appearance, of the kept morphemes.
        """
        wanted_pos = {'名詞', '動詞', '形容詞', '副詞', '形状詞'}
        kept = []
        for morpheme in self.tokenizer_obj.tokenize(txt, self.mode):
            # A morpheme is kept when ANY element of its part-of-speech tuple is a wanted tag.
            if wanted_pos.isdisjoint(morpheme.part_of_speech()):
                continue
            kept.append(morpheme.dictionary_form())
        return kept


class QueryTokenizer():
    """Tokenize query text with SudachiPy, keeping the surface form of every morpheme.

    Unlike ``DocTokenizer.tokenize``, no part-of-speech filtering is applied and
    surface forms (not dictionary forms) are returned.
    """

    # Created once when the class is defined and shared by every instance.
    tokenizer_obj = dictionary.Dictionary().create()
    # Split mode passed to every tokenize call.
    mode = tokenizer.Tokenizer.SplitMode.C

    def tokenize(self, txt: str) -> list[str]:
        """Return the surface strings of all morphemes in ``txt``, in order."""
        surfaces = []
        for morpheme in self.tokenizer_obj.tokenize(txt, self.mode):
            surfaces.append(morpheme.surface())
        return surfaces


class TokenizeDoc():
    """Derive a space-separated ``tokens`` column from a frame's ``text`` column."""

    # Shared tokenizer instance, created once when the class is defined.
    tokenizer = DocTokenizer()

    def tokenize(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame equal to ``df`` plus a ``tokens`` column.

        The input frame is left untouched, so a frame displayed by an earlier
        cell does not silently change underneath the reader.

        Args:
            df: Frame with a ``text`` column of strings.

        Returns:
            A new DataFrame with every column of ``df`` and a ``tokens`` column
            holding the tokens of each text joined by single spaces. An existing
            ``tokens`` column is replaced in the returned frame.
        """
        tokens = df['text'].apply(lambda text: ' '.join(self.tokenizer.tokenize(text)))
        # `assign` builds a new frame instead of writing into the caller's object.
        return df.assign(tokens=tokens)


class PhraseQueryConverter():
    """Rewrite a raw query string into a double-quoted, space-separated phrase query."""

    # Shared tokenizer instance, created once when the class is defined.
    query_tokenizer = QueryTokenizer()

    def convert(self, text: str) -> str:
        """Return ``text`` as a quoted phrase of its tokens.

        Args:
            text: Raw query string.

        Returns:
            ``text`` unchanged when it tokenizes into at most one token;
            otherwise the tokens joined by single spaces and wrapped in
            double quotes.
        """
        surfaces = list(self.query_tokenizer.tokenize(text))
        if len(surfaces) > 1:
            return '"' + ' '.join(surfaces) + '"'
        # Zero or one token: nothing to join, pass the query through as-is.
        return text


In [87]:
# Helpers used by the cells below: one rewrites queries, the other tokenizes documents.
phrase_query_converter = PhraseQueryConverter()
doc_tokenizer = TokenizeDoc()

# Add the space-separated `tokens` column that the indexer consumes.
df = doc_tokenizer.tokenize(df)

In [88]:
# Build the index from the pre-tokenized `tokens` column, overwriting any index already at this path.
indexer = pt.DFIndexer(
    './askd-terrier',
    overwrite=True,
    blocks=True,  # presumably records term positions so phrase queries can be evaluated — TODO confirm
)
# Use the UTFTokeniser and an empty term pipeline for the already-tokenized text.
indexer.setProperty('tokeniser', 'UTFTokeniser')
indexer.setProperty('termpipelines', '')
index_ref = indexer.index(df['tokens'], docno=df['docno'])
index = pt.IndexFactory.of(index_ref)

# Pipeline: rewrite the raw query into a quoted phrase, then retrieve with BM25.
pipe = (
    pt.apply.query(lambda row: phrase_query_converter.convert(row.query))
    >> pt.BatchRetrieve(index, wmodel='BM25').compile()
)
res = pipe.search('検索専門')

Applying 8 rules


In [89]:
# Expected hit: the query is rewritten to the phrase "検索 方法" and d1 is returned (see output).
res = pipe.search('検索方法')
res

Unnamed: 0,qid,docid,docno,rank,score,query_0,query
0,1,0,d1,0,-1.584963,検索方法,"""検索 方法"""


In [90]:
# Expected miss: the result below is empty for '検索検討'.
# Presumably the two terms are not adjacent in d1's token stream — TODO confirm.
res = pipe.search('検索検討')
res

Unnamed: 0,docid,docno,rank,score,qid,query_0,query


In [91]:
# Unexpected hit: d1 is returned for the phrase "検索 専門" although '専門' does not occur in d1's text.
# NOTE(review): the score (-1.584963) is identical to the one for '検索方法' above, which suggests
# the unindexed term is being dropped from the phrase and only '検索' is matched.
# TODO confirm against Terrier's handling of phrase terms that are missing from the index lexicon.
res = pipe.search('検索専門')
res

Unnamed: 0,qid,docid,docno,rank,score,query_0,query
0,1,0,d1,0,-1.584963,検索専門,"""検索 専門"""
