Skip to content

Commit

Permalink
Elasticsearch analyzer 플러그인 지원 (#121)
Browse files Browse the repository at this point in the history
* Add elasticsearch plugin module and initial commit

* add elasticsearch plugin

* Update lucene library for elasticsearch plugin

* Fix the core's bug and build elasticsearch analyzer plugin for version 7.3.2
  • Loading branch information
shin285 committed May 1, 2020
1 parent c0058ab commit c3f1c66
Show file tree
Hide file tree
Showing 10 changed files with 280 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,9 @@ private ScoredTag getScoredTag(String jasoUnits, FindContext<List<ScoredTag>> fi
*/
public List<KomoranResult> analyze(String sentence, int nbest) {

if(sentence.length() == 0){
if(sentence == null || sentence.length() == 0){
return new ArrayList<>(
Collections.singletonList(new KomoranResult(new ArrayList<>(), sentence))
Collections.singletonList(new KomoranResult(new ArrayList<>(), ""))
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ public void issue119() {
Assert.assertEquals("", komoranResult.getPlainText());
Assert.assertEquals(0, komoranResult.getResultNodeList().size());
Assert.assertEquals(0, komoranResult.getList().size());

komoranResult = this.komoran.analyze(null);
Assert.assertEquals(0, komoranResult.getTokenList().size());
Assert.assertEquals("", komoranResult.getPlainText());
Assert.assertEquals(0, komoranResult.getResultNodeList().size());
Assert.assertEquals(0, komoranResult.getList().size());
}

@Test
Expand Down
66 changes: 66 additions & 0 deletions elasticsearch-plugin/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Build script for the KOMORAN Elasticsearch tokenizer plugin module.
// Besides the regular Java build it defines `makePlugin`, a Zip task that
// writes `komoran-tokenizer.zip` into the project directory.
plugins {
id 'java'
}

group 'shineware'
version '1.0-SNAPSHOT'

sourceCompatibility = 1.8

repositories {
mavenCentral()
// The shineware `commons` and `aho-corasick` artifacts below use `com.github.*` coordinates.
maven { url 'https://jitpack.io' }
}

configurations {
// Custom configuration whose files `makePlugin` iterates over; `compile` is
// made to extend it in the dependencies block further down.
includeCompile
}

project.ext {
// Files already visited by the `makePlugin` source closure, so that each
// file is inspected (and reported) only once.
elist = new HashSet<File>()
}

// Zip task that assembles the plugin archive after the module's `build` task.
task makePlugin(type: Zip) {
// NOTE(review): this println sits directly in the task configuration closure,
// so it presumably runs whenever the task is configured, not only when it
// executes — confirm and consider moving it into doFirst.
println "Making Elasticsearch Plugin"
dependsOn(":elasticsearch-plugin:build")
from {
// Source closure: maps every file of `includeCompile` to itself. Directories
// are returned untouched; regular files are recorded in `elist` on first sight.
configurations.includeCompile.collect {
if (it.isDirectory()) {
return it
} else {
if (!project.elist.contains(it)) {
project.elist.add(it)
// For the three named jars an `include` pattern with the jar's file name is
// registered and the name is printed.
// NOTE(review): `include` is called from inside the source closure;
// presumably the intent is to limit the archive to these jars — confirm how
// this interacts with the `include "*"` specs below.
if (it.name == "core.jar" || it.name == "commons-1.0.1.jar" || it.name == "aho-corasick-1.1.0.jar") {
include it.name
println "Archiving "+it.name
}
}
return it
}
}
}

// Adds the files located directly under build/libs/.
from("build/libs/") {
include "*"
}
// Adds the files located directly under src/main/resources/.
from("src/main/resources/") {
include "*"
}
// Output file name and output directory ('.' is resolved via project.file).
archiveName('komoran-tokenizer.zip')
destinationDir(file('.'))
}


dependencies {
// `includeCompile` entries are the ones the `makePlugin` source closure iterates over.
includeCompile 'com.github.shineware:commons:1.0.1'
includeCompile 'com.github.shineware:aho-corasick:1.1.0'
testCompile group: 'junit', name: 'junit', version: '4.12'
compile group: 'org.apache.lucene', name: 'lucene-analyzers-common', version: '8.4.0'
testCompile group: 'org.apache.lucene', name: 'lucene-test-framework', version: '8.4.0'
compile group: 'org.elasticsearch', name: 'elasticsearch', version: '7.6.2'
includeCompile project(':core')
// Makes everything declared in `includeCompile` part of `compile` as well.
configurations.compile.extendsFrom(configurations.includeCompile)
}

// Compile main and test sources as UTF-8.
compileJava.options.encoding = "UTF-8"
compileTestJava.options.encoding = "UTF-8"
Binary file added elasticsearch-plugin/komoran-tokenizer.zip
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package kr.co.shineware.nlp.elasticsearch.index;

import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL;
import kr.co.shineware.nlp.komoran.core.Komoran;
import kr.co.shineware.nlp.lucene.tokenizer.KomoranTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

/**
 * Elasticsearch tokenizer factory that produces {@link KomoranTokenizer} instances
 * backed by the {@code DEFAULT_MODEL.STABLE} KOMORAN model.
 *
 * <p>Every tokenizer created by any instance of this factory shares one {@link Komoran}
 * analyzer, so the model is loaded once instead of once per {@link #create()} call.
 */
public class KomoranTokenizerFactory extends AbstractTokenizerFactory {

    /**
     * Initialization-on-demand holder: the analyzer is built the first time
     * {@link #create()} is called, and the JVM's class initialization guarantees it is
     * built only once.
     *
     * <p>NOTE(review): sharing assumes {@code Komoran#analyze} may be called from several
     * threads at the same time — confirm against the KOMORAN core documentation.
     */
    private static final class SharedKomoran {
        private static final Komoran INSTANCE = new Komoran(DEFAULT_MODEL.STABLE);

        private SharedKomoran() {
        }
    }

    /**
     * Creates the factory.
     *
     * @param indexSettings settings of the index the tokenizer belongs to
     * @param environment   node environment; not used by this factory
     * @param name          name under which the tokenizer is registered
     * @param settings      tokenizer settings
     */
    public KomoranTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, settings, name);
    }

    /**
     * Returns a new tokenizer that uses the shared analyzer.
     *
     * @return a fresh {@link KomoranTokenizer}
     */
    @Override
    public Tokenizer create() {
        return new KomoranTokenizer(SharedKomoran.INSTANCE);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package kr.co.shineware.nlp.elasticsearch.plugin;

import kr.co.shineware.nlp.elasticsearch.index.KomoranTokenizerFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.Map;

import static java.util.Collections.singletonMap;

/**
 * Elasticsearch plugin entry point that contributes the KOMORAN tokenizer to the
 * analysis module.
 */
public class KomoranPlugin extends Plugin implements AnalysisPlugin {

    /**
     * Registers {@link KomoranTokenizerFactory} under the tokenizer name
     * {@code komoran-tokenizer}.
     *
     * @return a single-entry map from the tokenizer name to its factory provider
     */
    @Override
    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
        AnalysisModule.AnalysisProvider<TokenizerFactory> factoryProvider = KomoranTokenizerFactory::new;
        return singletonMap("komoran-tokenizer", factoryProvider);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package kr.co.shineware.nlp.lucene.tokenizer;

import kr.co.shineware.nlp.komoran.core.Komoran;
import kr.co.shineware.nlp.komoran.model.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Iterator;

/**
 * Lucene {@link Tokenizer} that emits one token per morpheme returned by KOMORAN.
 *
 * <p>The whole input is read once per stream and analyzed line by line. For each morpheme
 * the term is the morph text, the type is the part-of-speech tag, and the offsets are the
 * morpheme's begin/end index shifted by the start of its line within the input.
 *
 * <p>Compared with the previous implementation, which analyzed only the first line:
 * <ul>
 *   <li>all lines of the input are tokenized;</li>
 *   <li>the reader is no longer closed here (closing is left to {@link #close()}), so a
 *       further {@link #incrementToken()} call after exhaustion simply returns
 *       {@code false};</li>
 *   <li>the final offset reported by {@link #end()} is the full input length, including
 *       line terminators, and is cleared by {@link #reset()}.</li>
 * </ul>
 */
public class KomoranTokenizer extends Tokenizer {

    /** Number of chars requested from the reader per read call. */
    private static final int READ_CHUNK_SIZE = 4096;

    private final Komoran komoran;

    private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);

    /** Complete input of the current stream; {@code null} until it has been read. */
    private String buffer;
    /** Index in {@link #buffer} at which the next not-yet-analyzed line starts. */
    private int nextLineStart;
    /** Index in {@link #buffer} at which the line currently being emitted starts. */
    private int lineOffset;
    /** Morphemes of the current line that are still to be emitted; may be {@code null}. */
    private Iterator<Token> tokenIterator;
    /** Offset reported by {@link #end()}: the number of chars read from the input. */
    private int lastOffset;

    /**
     * @param komoran analyzer used to split each input line into morphemes
     */
    public KomoranTokenizer(Komoran komoran) {
        this.komoran = komoran;
    }

    /**
     * Advances to the next morpheme.
     *
     * @return {@code true} if the attributes now describe a token, {@code false} once the
     *         input is exhausted
     * @throws IOException if reading the input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();
        if (buffer == null) {
            buffer = readAll();
            nextLineStart = 0;
            lastOffset = buffer.length();
        }
        // Move on to the next line until one yields a morpheme or the input is used up.
        while (tokenIterator == null || !tokenIterator.hasNext()) {
            if (nextLineStart >= buffer.length()) {
                tokenIterator = null;
                return false;
            }
            analyzeNextLine();
        }
        Token token = tokenIterator.next();
        charTermAttribute.setEmpty().append(token.getMorph());
        positionIncrementAttribute.setPositionIncrement(1);
        // Assumes getBeginIndex()/getEndIndex() are char indices into the analyzed line,
        // as the original single-line implementation did.
        offsetAttribute.setOffset(
                correctOffset(lineOffset + token.getBeginIndex()),
                correctOffset(lineOffset + token.getEndIndex())
        );
        typeAttribute.setType(token.getPos());
        return true;
    }

    /** Reads everything the current reader provides, without closing it. */
    private String readAll() throws IOException {
        StringBuilder text = new StringBuilder();
        char[] chunk = new char[READ_CHUNK_SIZE];
        int read;
        while ((read = input.read(chunk, 0, chunk.length)) != -1) {
            text.append(chunk, 0, read);
        }
        return text.toString();
    }

    /**
     * Cuts the next line out of {@link #buffer}, analyzes it, and positions
     * {@link #nextLineStart} behind its terminator ({@code \n}, {@code \r} or {@code \r\n}).
     */
    private void analyzeNextLine() {
        int lineEnd = nextLineStart;
        while (lineEnd < buffer.length() && !isLineBreak(buffer.charAt(lineEnd))) {
            lineEnd++;
        }
        lineOffset = nextLineStart;
        String line = buffer.substring(nextLineStart, lineEnd);

        int next = lineEnd;
        if (next < buffer.length()) {
            char terminator = buffer.charAt(next++);
            if (terminator == '\r' && next < buffer.length() && buffer.charAt(next) == '\n') {
                next++;
            }
        }
        nextLineStart = next;

        // An empty line cannot contribute tokens, so the analyzer is not invoked for it.
        tokenIterator = line.isEmpty() ? null : komoran.analyze(line).getTokenList().iterator();
    }

    private static boolean isLineBreak(char c) {
        return c == '\n' || c == '\r';
    }

    /** Sets the final offset to the number of chars read from the input. */
    @Override
    public void end() throws IOException {
        super.end();
        int finalOffset = correctOffset(lastOffset);
        offsetAttribute.setOffset(finalOffset, finalOffset);
    }

    /** Discards all per-stream state so the next reader starts from scratch. */
    @Override
    public void reset() throws IOException {
        super.reset();
        clearStreamState();
    }

    /** Closes the input via the superclass and drops the buffered text. */
    @Override
    public void close() throws IOException {
        super.close();
        clearStreamState();
    }

    private void clearStreamState() {
        tokenIterator = null;
        buffer = null;
        nextLineStart = 0;
        lineOffset = 0;
        lastOffset = 0;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Elasticsearch plugin descriptor file
# This file must exist as 'plugin-descriptor.properties' inside a plugin.
#
### example plugin for "foo"
#
# foo.zip <-- zip file for the plugin, with this structure:
# |____ <arbitrary name1>.jar <-- classes, resources, dependencies
# |____ <arbitrary nameN>.jar <-- any number of jars
# |____ plugin-descriptor.properties <-- example contents below:
#
# classname=foo.bar.BazPlugin
# description=My cool plugin
# version=6.0
# elasticsearch.version=6.0
# java.version=1.8
#
### mandatory elements for all plugins:
#
# 'description': simple summary of the plugin
description=komoran tokenizer for elasticsearch
#
# 'version': plugin's version
version=4.0.0
#
# 'name': the plugin name
name=komoran-tokenizer
#

# 'classname': the name of the class to load, fully-qualified.
classname=kr.co.shineware.nlp.elasticsearch.plugin.KomoranPlugin
#
# 'java.version': version of java the code is built against
# use the system property java.specification.version
# version string must be a sequence of nonnegative decimal integers
# separated by "."'s and may have leading zeros
java.version=1.8
#
# 'elasticsearch.version': version of elasticsearch compiled against
elasticsearch.version=7.6.2
### optional elements for plugins:
#
# 'extended.plugins': other plugins this plugin extends through SPI
#extended.plugins=${extendedPlugins}
#
# 'has.native.controller': whether or not the plugin has a native controller
#has.native.controller=${hasNativeController}
27 changes: 27 additions & 0 deletions elasticsearch-plugin/src/test/java/TokenizerRunTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL;
import kr.co.shineware.nlp.komoran.core.Komoran;
import kr.co.shineware.nlp.lucene.tokenizer.KomoranTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.junit.Test;

import java.io.StringReader;

public class TokenizerRunTest {
@Test
public void analyzeTest() throws Exception{
String testSource = "저는 이번에 바람과 함께 사라지다를 봤어요";
KomoranTokenizer tokenStream = new KomoranTokenizer(new Komoran(DEFAULT_MODEL.STABLE));
tokenStream.setReader(new StringReader(testSource));
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(tokenStream.getAttribute(CharTermAttribute.class));
System.out.println(tokenStream.getAttribute(TypeAttribute.class).type());
System.out.println(tokenStream.getAttribute(OffsetAttribute.class).startOffset()
+" : "+tokenStream.getAttribute(OffsetAttribute.class).endOffset());
}
tokenStream.end();
tokenStream.close();
}
}
1 change: 1 addition & 0 deletions settings.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ include 'services:webservice'
rootProject.name = 'KOMORAN'
include 'core'
include 'admin'
include 'elasticsearch-plugin'

0 comments on commit c3f1c66

Please sign in to comment.