-
-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Elasticsearch analyzer 플러그인 지원 (#121)
* Add elasticsearch plugin module and initial commit * add elasticsearch plugin * Update lucene library for elasticsearch plugin * Fix the core's bug and build elasticsearch analyzer plugin for version 7.3.2
- Loading branch information
Showing
10 changed files
with
280 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
plugins {
    id 'java'
}

group 'shineware'
version '1.0-SNAPSHOT'

sourceCompatibility = 1.8

repositories {
    mavenCentral()
    maven { url 'https://jitpack.io' }
}

// Dedicated configuration for the dependency jars that must be bundled
// inside the Elasticsearch plugin zip (kept separate from 'compile' so we
// can enumerate exactly what gets archived).
configurations {
    includeCompile
}

project.ext {
    // Tracks artifact files already visited by the collect closure below so
    // each dependency jar is only considered (and logged) once.
    elist = new HashSet<File>()
}

// Packages the analyzer as an Elasticsearch-installable plugin zip:
// selected dependency jars + the built plugin jar + plugin-descriptor resources.
task makePlugin(type: Zip) {
    dependsOn(":elasticsearch-plugin:build")
    // Log at execution time only, not on every Gradle configuration pass.
    doFirst {
        println "Making Elasticsearch Plugin"
    }
    from {
        configurations.includeCompile.collect {
            if (it.isDirectory()) {
                return it
            } else {
                if (!project.elist.contains(it)) {
                    project.elist.add(it)
                    // Only these three jars are whitelisted into the zip;
                    // everything else resolved by includeCompile is skipped.
                    if (it.name == "core.jar" || it.name == "commons-1.0.1.jar" || it.name == "aho-corasick-1.1.0.jar") {
                        include it.name
                        println "Archiving " + it.name
                    }
                }
                return it
            }
        }
    }

    // The plugin's own jar.
    from("build/libs/") {
        include "*"
    }
    // plugin-descriptor.properties and friends.
    from("src/main/resources/") {
        include "*"
    }
    // NOTE(review): archiveName/destinationDir are deprecated in newer Gradle
    // (use archiveFileName/destinationDirectory once the Gradle version is confirmed).
    archiveName('komoran-tokenizer.zip')
    destinationDir(file('.'))
}

dependencies {
    includeCompile 'com.github.shineware:commons:1.0.1'
    includeCompile 'com.github.shineware:aho-corasick:1.1.0'
    testCompile group: 'junit', name: 'junit', version: '4.12'
    compile group: 'org.apache.lucene', name: 'lucene-analyzers-common', version: '8.4.0'
    testCompile group: 'org.apache.lucene', name: 'lucene-test-framework', version: '8.4.0'
    compile group: 'org.elasticsearch', name: 'elasticsearch', version: '7.6.2'
    includeCompile project(':core')
    // Everything bundled into the zip is also available on the compile classpath.
    configurations.compile.extendsFrom(configurations.includeCompile)
}

compileJava.options.encoding = "UTF-8"
compileTestJava.options.encoding = "UTF-8"
Binary file not shown.
22 changes: 22 additions & 0 deletions
22
...plugin/src/main/java/kr/co/shineware/nlp/elasticsearch/index/KomoranTokenizerFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package kr.co.shineware.nlp.elasticsearch.index;

import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL;
import kr.co.shineware.nlp.komoran.core.Komoran;
import kr.co.shineware.nlp.lucene.tokenizer.KomoranTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

/**
 * Elasticsearch tokenizer factory that produces {@link KomoranTokenizer}
 * instances backed by the KOMORAN Korean morphological analyzer.
 * Registered by {@code KomoranPlugin} under the name {@code komoran-tokenizer}.
 */
public class KomoranTokenizerFactory extends AbstractTokenizerFactory {

    public KomoranTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, settings, name);
    }

    @Override
    public Tokenizer create() {
        // A fresh Komoran instance (built-in STABLE model) is constructed per tokenizer.
        // NOTE(review): model loading may be heavy; consider sharing one Komoran across
        // tokenizers — confirm Komoran's thread-safety before doing so.
        return new KomoranTokenizer(new Komoran(DEFAULT_MODEL.STABLE));
    }
}
20 changes: 20 additions & 0 deletions
20
...icsearch-plugin/src/main/java/kr/co/shineware/nlp/elasticsearch/plugin/KomoranPlugin.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package kr.co.shineware.nlp.elasticsearch.plugin;

import kr.co.shineware.nlp.elasticsearch.index.KomoranTokenizerFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.Map;

import static java.util.Collections.singletonMap;

/**
 * Elasticsearch analysis plugin entry point (declared as {@code classname}
 * in plugin-descriptor.properties). Registers the KOMORAN-based tokenizer
 * under the analysis name {@code "komoran-tokenizer"}.
 */
public class KomoranPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
        // Constructor reference matches the AnalysisProvider functional shape:
        // (indexSettings, environment, name, settings) -> factory.
        return singletonMap("komoran-tokenizer", KomoranTokenizerFactory::new);
    }

}
90 changes: 90 additions & 0 deletions
90
...ticsearch-plugin/src/main/java/kr/co/shineware/nlp/lucene/tokenizer/KomoranTokenizer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
package kr.co.shineware.nlp.lucene.tokenizer; | ||
|
||
import kr.co.shineware.nlp.komoran.core.Komoran; | ||
import kr.co.shineware.nlp.komoran.model.Token; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.util.Iterator; | ||
|
||
public class KomoranTokenizer extends Tokenizer { | ||
|
||
private Komoran komoran; | ||
private String buffer; | ||
private Iterator<Token> tokenIterator; | ||
|
||
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class); | ||
private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); | ||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); | ||
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); | ||
private int lastOffset; | ||
|
||
public KomoranTokenizer(Komoran komoran) { | ||
this.komoran = komoran; | ||
} | ||
|
||
@Override | ||
public final boolean incrementToken() throws IOException { | ||
clearAttributes(); | ||
setupInputBuffer(); | ||
analyze(); | ||
return consumeResult(); | ||
} | ||
|
||
private boolean consumeResult() { | ||
if (tokenIterator.hasNext()) { | ||
Token token = tokenIterator.next(); | ||
this.charTermAttribute.setEmpty().append(token.getMorph()); | ||
this.positionIncrementAttribute.setPositionIncrement(1); | ||
this.offsetAttribute.setOffset( | ||
correctOffset(token.getBeginIndex()), correctOffset(token.getEndIndex()) | ||
); | ||
this.typeAttribute.setType(token.getPos()); | ||
return true; | ||
} | ||
lastOffset = buffer.length(); | ||
tokenIterator = null; | ||
buffer = null; | ||
return false; | ||
} | ||
|
||
private void analyze() { | ||
if (tokenIterator == null) { | ||
tokenIterator = komoran.analyze(buffer).getTokenList().iterator(); | ||
} | ||
} | ||
|
||
private void setupInputBuffer() throws IOException { | ||
if (tokenIterator == null) { | ||
BufferedReader br = new BufferedReader(input); | ||
buffer = br.readLine(); | ||
br.close(); | ||
} | ||
if(buffer == null){ | ||
buffer = ""; | ||
} | ||
} | ||
|
||
@Override | ||
public void end() throws IOException { | ||
super.end(); | ||
offsetAttribute.setOffset(correctOffset(lastOffset), correctOffset(lastOffset)); | ||
} | ||
|
||
@Override | ||
public void reset() throws IOException { | ||
super.reset(); | ||
tokenIterator = null; | ||
buffer = null; | ||
} | ||
|
||
@Override | ||
public void close() throws IOException { | ||
super.close(); | ||
} | ||
} |
46 changes: 46 additions & 0 deletions
46
elasticsearch-plugin/src/main/resources/plugin-descriptor.properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Elasticsearch plugin descriptor file | ||
# This file must exist as 'plugin-descriptor.properties' inside a plugin. | ||
# | ||
### example plugin for "foo" | ||
# | ||
# foo.zip <-- zip file for the plugin, with this structure: | ||
# |____ <arbitrary name1>.jar <-- classes, resources, dependencies | ||
# |____ <arbitrary nameN>.jar <-- any number of jars | ||
# |____ plugin-descriptor.properties <-- example contents below: | ||
# | ||
# classname=foo.bar.BazPlugin | ||
# description=My cool plugin | ||
# version=6.0 | ||
# elasticsearch.version=6.0 | ||
# java.version=1.8 | ||
# | ||
### mandatory elements for all plugins: | ||
# | ||
# 'description': simple summary of the plugin | ||
description=komoran tokenizer for elasticsearch
# | ||
# 'version': plugin's version | ||
version=4.0.0 | ||
# | ||
# 'name': the plugin name | ||
name=komoran-tokenizer | ||
# | ||
|
||
# 'classname': the name of the class to load, fully-qualified. | ||
classname=kr.co.shineware.nlp.elasticsearch.plugin.KomoranPlugin | ||
# | ||
# 'java.version': version of java the code is built against | ||
# use the system property java.specification.version | ||
# version string must be a sequence of nonnegative decimal integers | ||
# separated by "."'s and may have leading zeros | ||
java.version=1.8 | ||
# | ||
# 'elasticsearch.version': version of elasticsearch compiled against | ||
elasticsearch.version=7.6.2 | ||
### optional elements for plugins: | ||
# | ||
# 'extended.plugins': other plugins this plugin extends through SPI | ||
#extended.plugins=${extendedPlugins} | ||
# | ||
# 'has.native.controller': whether or not the plugin has a native controller | ||
#has.native.controller=${hasNativeController} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL; | ||
import kr.co.shineware.nlp.komoran.core.Komoran; | ||
import kr.co.shineware.nlp.lucene.tokenizer.KomoranTokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; | ||
import org.junit.Test; | ||
|
||
import java.io.StringReader; | ||
|
||
public class TokenizerRunTest { | ||
@Test | ||
public void analyzeTest() throws Exception{ | ||
String testSource = "저는 이번에 바람과 함께 사라지다를 봤어요"; | ||
KomoranTokenizer tokenStream = new KomoranTokenizer(new Komoran(DEFAULT_MODEL.STABLE)); | ||
tokenStream.setReader(new StringReader(testSource)); | ||
tokenStream.reset(); | ||
while (tokenStream.incrementToken()) { | ||
System.out.println(tokenStream.getAttribute(CharTermAttribute.class)); | ||
System.out.println(tokenStream.getAttribute(TypeAttribute.class).type()); | ||
System.out.println(tokenStream.getAttribute(OffsetAttribute.class).startOffset() | ||
+" : "+tokenStream.getAttribute(OffsetAttribute.class).endOffset()); | ||
} | ||
tokenStream.end(); | ||
tokenStream.close(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters