-
-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Elasticsearch analyzer 플러그인 지원 (#121)
* Add elasticsearch plugin module and initial commit * add elasticsearch plugin * Update lucene library for elasticsearch plugin * Fix the core's bug and build elasticsearch analyzer plugin for version 7.3.2
- Loading branch information
Showing
10 changed files
with
280 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
plugins {
    id 'java'
}

group 'shineware'
version '1.0-SNAPSHOT'

sourceCompatibility = 1.8

repositories {
    mavenCentral()
    maven { url 'https://jitpack.io' }
}

// Dedicated configuration for the dependency jars that must be bundled
// inside the Elasticsearch plugin zip (kept separate from 'compile' so we
// can enumerate exactly what gets archived).
configurations {
    includeCompile
}

project.ext {
    // Tracks artifact files already visited by the collect closure below so
    // each dependency jar is only considered (and logged) once.
    elist = new HashSet<File>()
}

// Packages the analyzer as an Elasticsearch-installable plugin zip:
// selected dependency jars + the built plugin jar + plugin-descriptor resources.
task makePlugin(type: Zip) {
    dependsOn(":elasticsearch-plugin:build")
    // Log at execution time only, not on every Gradle configuration pass.
    doFirst {
        println "Making Elasticsearch Plugin"
    }
    from {
        configurations.includeCompile.collect {
            if (it.isDirectory()) {
                return it
            } else {
                if (!project.elist.contains(it)) {
                    project.elist.add(it)
                    // Only these three jars are whitelisted into the zip;
                    // everything else resolved by includeCompile is skipped.
                    if (it.name == "core.jar" || it.name == "commons-1.0.1.jar" || it.name == "aho-corasick-1.1.0.jar") {
                        include it.name
                        println "Archiving " + it.name
                    }
                }
                return it
            }
        }
    }

    // The plugin's own jar.
    from("build/libs/") {
        include "*"
    }
    // plugin-descriptor.properties and friends.
    from("src/main/resources/") {
        include "*"
    }
    // NOTE(review): archiveName/destinationDir are deprecated in newer Gradle
    // (use archiveFileName/destinationDirectory once the Gradle version is confirmed).
    archiveName('komoran-tokenizer.zip')
    destinationDir(file('.'))
}

dependencies {
    includeCompile 'com.github.shineware:commons:1.0.1'
    includeCompile 'com.github.shineware:aho-corasick:1.1.0'
    testCompile group: 'junit', name: 'junit', version: '4.12'
    compile group: 'org.apache.lucene', name: 'lucene-analyzers-common', version: '8.4.0'
    testCompile group: 'org.apache.lucene', name: 'lucene-test-framework', version: '8.4.0'
    compile group: 'org.elasticsearch', name: 'elasticsearch', version: '7.6.2'
    includeCompile project(':core')
    // Everything bundled into the zip is also available on the compile classpath.
    configurations.compile.extendsFrom(configurations.includeCompile)
}

compileJava.options.encoding = "UTF-8"
compileTestJava.options.encoding = "UTF-8"
Binary file not shown.
22 changes: 22 additions & 0 deletions
22
...plugin/src/main/java/kr/co/shineware/nlp/elasticsearch/index/KomoranTokenizerFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package kr.co.shineware.nlp.elasticsearch.index;

import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL;
import kr.co.shineware.nlp.komoran.core.Komoran;
import kr.co.shineware.nlp.lucene.tokenizer.KomoranTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

/**
 * Elasticsearch tokenizer factory that produces {@link KomoranTokenizer}
 * instances backed by the KOMORAN Korean morphological analyzer.
 * Registered by {@code KomoranPlugin} under the name {@code komoran-tokenizer}.
 */
public class KomoranTokenizerFactory extends AbstractTokenizerFactory {

    public KomoranTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, settings, name);
    }

    @Override
    public Tokenizer create() {
        // A fresh Komoran instance (built-in STABLE model) is constructed per tokenizer.
        // NOTE(review): model loading may be heavy; consider sharing one Komoran across
        // tokenizers — confirm Komoran's thread-safety before doing so.
        return new KomoranTokenizer(new Komoran(DEFAULT_MODEL.STABLE));
    }
}
20 changes: 20 additions & 0 deletions
20
...icsearch-plugin/src/main/java/kr/co/shineware/nlp/elasticsearch/plugin/KomoranPlugin.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package kr.co.shineware.nlp.elasticsearch.plugin;

import kr.co.shineware.nlp.elasticsearch.index.KomoranTokenizerFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.Map;

import static java.util.Collections.singletonMap;

/**
 * Elasticsearch analysis plugin entry point (declared as {@code classname}
 * in plugin-descriptor.properties). Registers the KOMORAN-based tokenizer
 * under the analysis name {@code "komoran-tokenizer"}.
 */
public class KomoranPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
        // Constructor reference matches the AnalysisProvider functional shape:
        // (indexSettings, environment, name, settings) -> factory.
        return singletonMap("komoran-tokenizer", KomoranTokenizerFactory::new);
    }

}
90 changes: 90 additions & 0 deletions
90
...ticsearch-plugin/src/main/java/kr/co/shineware/nlp/lucene/tokenizer/KomoranTokenizer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
package kr.co.shineware.nlp.lucene.tokenizer; | ||
|
||
import kr.co.shineware.nlp.komoran.core.Komoran; | ||
import kr.co.shineware.nlp.komoran.model.Token; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.util.Iterator; | ||
|
||
public class KomoranTokenizer extends Tokenizer { | ||
|
||
private Komoran komoran; | ||
private String buffer; | ||
private Iterator<Token> tokenIterator; | ||
|
||
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class); | ||
private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); | ||
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); | ||
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); | ||
private int lastOffset; | ||
|
||
public KomoranTokenizer(Komoran komoran) { | ||
this.komoran = komoran; | ||
} | ||
|
||
@Override | ||
public final boolean incrementToken() throws IOException { | ||
clearAttributes(); | ||
setupInputBuffer(); | ||
analyze(); | ||
return consumeResult(); | ||
} | ||
|
||
private boolean consumeResult() { | ||
if (tokenIterator.hasNext()) { | ||
Token token = tokenIterator.next(); | ||
this.charTermAttribute.setEmpty().append(token.getMorph()); | ||
this.positionIncrementAttribute.setPositionIncrement(1); | ||
this.offsetAttribute.setOffset( | ||
correctOffset(token.getBeginIndex()), correctOffset(token.getEndIndex()) | ||
); | ||
this.typeAttribute.setType(token.getPos()); | ||
return true; | ||
} | ||
lastOffset = buffer.length(); | ||
tokenIterator = null; | ||
buffer = null; | ||
return false; | ||
} | ||
|
||
private void analyze() { | ||
if (tokenIterator == null) { | ||
tokenIterator = komoran.analyze(buffer).getTokenList().iterator(); | ||
} | ||
} | ||
|
||
private void setupInputBuffer() throws IOException { | ||
if (tokenIterator == null) { | ||
BufferedReader br = new BufferedReader(input); | ||
buffer = br.readLine(); | ||
br.close(); | ||
} | ||
if(buffer == null){ | ||
buffer = ""; | ||
} | ||
} | ||
|
||
@Override | ||
public void end() throws IOException { | ||
super.end(); | ||
offsetAttribute.setOffset(correctOffset(lastOffset), correctOffset(lastOffset)); | ||
} | ||
|
||
@Override | ||
public void reset() throws IOException { | ||
super.reset(); | ||
tokenIterator = null; | ||
buffer = null; | ||
} | ||
|
||
@Override | ||
public void close() throws IOException { | ||
super.close(); | ||
} | ||
} |
46 changes: 46 additions & 0 deletions
46
elasticsearch-plugin/src/main/resources/plugin-descriptor.properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Elasticsearch plugin descriptor file | ||
# This file must exist as 'plugin-descriptor.properties' inside a plugin. | ||
# | ||
### example plugin for "foo" | ||
# | ||
# foo.zip <-- zip file for the plugin, with this structure: | ||
# |____ <arbitrary name1>.jar <-- classes, resources, dependencies | ||
# |____ <arbitrary nameN>.jar <-- any number of jars | ||
# |____ plugin-descriptor.properties <-- example contents below: | ||
# | ||
# classname=foo.bar.BazPlugin | ||
# description=My cool plugin | ||
# version=6.0 | ||
# elasticsearch.version=6.0 | ||
# java.version=1.8 | ||
# | ||
### mandatory elements for all plugins: | ||
# | ||
# 'description': simple summary of the plugin | ||
description=komoran tokenizer for elasticsearch
# | ||
# 'version': plugin's version | ||
version=4.0.0 | ||
# | ||
# 'name': the plugin name | ||
name=komoran-tokenizer | ||
# | ||
|
||
# 'classname': the name of the class to load, fully-qualified. | ||
classname=kr.co.shineware.nlp.elasticsearch.plugin.KomoranPlugin | ||
# | ||
# 'java.version': version of java the code is built against | ||
# use the system property java.specification.version | ||
# version string must be a sequence of nonnegative decimal integers | ||
# separated by "."'s and may have leading zeros | ||
java.version=1.8 | ||
# | ||
# 'elasticsearch.version': version of elasticsearch compiled against | ||
elasticsearch.version=7.6.2 | ||
### optional elements for plugins: | ||
# | ||
# 'extended.plugins': other plugins this plugin extends through SPI | ||
#extended.plugins=${extendedPlugins} | ||
# | ||
# 'has.native.controller': whether or not the plugin has a native controller | ||
#has.native.controller=${hasNativeController} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL; | ||
import kr.co.shineware.nlp.komoran.core.Komoran; | ||
import kr.co.shineware.nlp.lucene.tokenizer.KomoranTokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; | ||
import org.junit.Test; | ||
|
||
import java.io.StringReader; | ||
|
||
public class TokenizerRunTest { | ||
@Test | ||
public void analyzeTest() throws Exception{ | ||
String testSource = "저는 이번에 바람과 함께 사라지다를 봤어요"; | ||
KomoranTokenizer tokenStream = new KomoranTokenizer(new Komoran(DEFAULT_MODEL.STABLE)); | ||
tokenStream.setReader(new StringReader(testSource)); | ||
tokenStream.reset(); | ||
while (tokenStream.incrementToken()) { | ||
System.out.println(tokenStream.getAttribute(CharTermAttribute.class)); | ||
System.out.println(tokenStream.getAttribute(TypeAttribute.class).type()); | ||
System.out.println(tokenStream.getAttribute(OffsetAttribute.class).startOffset() | ||
+" : "+tokenStream.getAttribute(OffsetAttribute.class).endOffset()); | ||
} | ||
tokenStream.end(); | ||
tokenStream.close(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters