In [4]:
from whoosh.index import create_in
from whoosh.fields import *
import os
# Schema for the demo index: `title` and `path` are stored (returned with each
# hit); `content` is indexed for searching but not stored.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

# The directory that gets created must be the very one handed to create_in().
# makedirs() also creates the parent "out" directory when it is missing and is
# a no-op when the directory already exists, so the cell can be re-run.
index_dir = "out/indexdir"
os.makedirs(index_dir, exist_ok=True)
ix = create_in(index_dir, schema)
writer = ix.writer()

In [6]:
from whoosh.qparser import QueryParser

# Feed two sample documents to the writer opened in the previous cell, then
# commit the writer.
sample_docs = (
    dict(title=u"First document", path=u"/a",
         content=u"This is the first document we've added!"),
    dict(title=u"Second document", path=u"/b",
         content=u"The second one is even more interesting!"),
)
for doc_fields in sample_docs:
    writer.add_document(**doc_fields)
writer.commit()

# Search the `content` field for "first" and print the top-ranked hit.
content_parser = QueryParser("content", ix.schema)
with ix.searcher() as searcher:
    query = content_parser.parse("first")
    results = searcher.search(query)
    print(results[0])


<Hit {'path': '/a', 'title': 'First document'}>


In [7]:
# Repeat the "first" lookup on the `content` field with a fresh searcher and
# print the top-ranked hit.
content_parser = QueryParser("content", ix.schema)
with ix.searcher() as searcher:
    query = content_parser.parse("first")
    results = searcher.search(query)
    top_hit = results[0]
    print(top_hit)

<Hit {'path': '/a', 'title': 'First document'}>


In [10]:
from __future__ import unicode_literals
from jieba.analyse import ChineseAnalyzer

# jieba's analyzer is attached to the `content` field so that mixed
# English/Chinese text is split into searchable terms.
analyzer = ChineseAnalyzer()

schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))

## rewrite mode: create_in() builds a new index in the directory
test_index_dir = "out/test"
# makedirs() also creates the parent "out" directory and tolerates re-runs.
os.makedirs(test_index_dir, exist_ok=True)
idx = create_in(test_index_dir, schema)

writer = idx.writer()
writer.add_document(
    title="first test-document",
    path="/c",
    content="This is the document for test, 水果和米饭."
)
writer.commit()

parser = QueryParser("content", schema=idx.schema)

# Scope the searcher with `with` so it is closed once the loop finishes
# instead of being left open for the lifetime of the kernel.
with idx.searcher() as searcher:
    for keyword in ("水果","你","first", 'test',"中文","交换机","交换"):
        print("result of ",keyword)
        q = parser.parse(keyword)
        results = searcher.search(q)
        # `content` is a stored field, so each hit can render a highlighted
        # excerpt around the matched term.
        for hit in results:
            print(hit.highlights("content"))
        print("="*10)

result of  水果
document for test, <b class="match term0">水果</b>和米饭
result of  你
result of  first
result of  test
document for <b class="match term0">test</b>, 水果和米饭
result of  中文
result of  交换机
result of  交换


In [8]:
from __future__ import unicode_literals

from whoosh.index import open_dir
from whoosh.index import create_in
from whoosh.fields import *

from jieba.analyse import ChineseAnalyzer
from whoosh.qparser import QueryParser

# NOTE: `analyzer` and `schema` are defined here but are not passed to
# open_dir() below; they are kept so the names stay available in the kernel.
analyzer = ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))

## append mode: open the existing index and add one more document to it
idx = open_dir("out/test")

writer = idx.writer()
writer.add_document(
    title="test-document-2",
    path="/b",
    content="This is the document for test, 水果和大蒜."
)
writer.commit()

parser = QueryParser("content", schema=idx.schema)

# Scope the searcher with `with` so it is closed once the loop finishes
# instead of being left open for the lifetime of the kernel.
with idx.searcher() as searcher:
    for keyword in ("水果","你","first", 'test',"中文","交换机","交换"):
        print("result of ",keyword)
        q = parser.parse(keyword)
        results = searcher.search(q)
        for hit in results:
            print(hit.highlights("content"))
        print("="*10)

result of  水果
document for test, <b class="match term0">水果</b>和米饭
document for test, <b class="match term0">水果</b>和大蒜
result of  你
result of  first
result of  test
document for <b class="match term0">test</b>, 水果和米饭
document for <b class="match term0">test</b>, 水果和大蒜
result of  中文
result of  交换机
result of  交换


In [12]:
from __future__ import unicode_literals
from jieba.analyse import ChineseAnalyzer
from sagas.ofbiz.resources import ResourceDigester

# Parse the UI-label resource file; the loop below reads `resource.properties`,
# a mapping of label key -> property object with per-locale `values`.
rd=ResourceDigester()
resource=rd.process_resource(xml_file='data/i18n/SagasUiLabels.xml')

# One document per label: the key plus its English, French and Chinese text.
# Only the `zh` field uses the jieba analyzer.
analyzer = ChineseAnalyzer()
schema = Schema(en=TEXT(stored=True),
                fr=TEXT(stored=True),
                key=ID(stored=True),
                zh=TEXT(stored=True, analyzer=analyzer))

## rewrite mode: create_in() builds a new index in the directory
out_dir='out/labels'
# makedirs() also creates the parent "out" directory and tolerates re-runs.
os.makedirs(out_dir, exist_ok=True)
idx = create_in(out_dir, schema)

# Using the writer as a context manager commits on success and cancels the
# writer if the loop raises, so a failed run does not leave the index locked.
with idx.writer() as writer:
    for key, prop in resource.properties.items():
        # NOTE(review): the printed digest shows labels without a `zh` or `fr`
        # entry; presumably `prop.values` supplies a default for missing
        # locales -- confirm against ResourceDigester.
        writer.add_document(
            key=key,
            en=prop.values['en'],
            zh=prop.values['zh'],
            fr=prop.values['fr']
        )

parser = QueryParser("zh", schema=idx.schema)

# Scope the searcher with `with` so it is closed once the loop finishes
# instead of being left open for the lifetime of the kernel.
with idx.searcher() as searcher:
    for keyword in ("中文","组成部分","交换"):
        print("result of ",keyword)
        q = parser.parse(keyword)
        results = searcher.search(q)
        for hit in results:
            print(hit.highlights("zh"))
        print("="*10)

- SagasUiLabels.xml
SagasApplication ⊕ SagasUiLabels.xml
	 value Sagas Application en
	 value Sagas应用程序 zh
	 value Sagas應用程式 zh-TW
SagasCompanyName ⊕ SagasUiLabels.xml
	 value OFBiz: Sagas en
	 value OFBiz: Sagas zh-TW
SagasCompanySubtitle ⊕ SagasUiLabels.xml
	 value Part of the Apache OFBiz Family of Open Source Software en
	 value Un modulo della famiglia di software open source Apache OFBiz it
	 value 开源软件OFBiz的组成部分 zh
	 value 開源軟體OFBiz的組成部分 zh-TW
SagasViewPermissionError ⊕ SagasUiLabels.xml
	 value You are not allowed to view this page. en
	 value 不允许你浏览这个页面。 zh
	 value 不允許您檢視這個頁面. zh-TW
result of  中文
result of  组成部分
开源软件OFBiz的<b class="match term0">组成部分</b>
result of  交换
