Skip to content

Commit

Permalink
체언+조사 결합 규칙 및 동사+전성어미 결합 규칙 추가 (#91)
Browse files Browse the repository at this point in the history
* add combination rule checker

* add TODO comment

* Add test case for issue #88 and #90
  • Loading branch information
shin285 committed Feb 9, 2020
1 parent 4411ca7 commit aecc19b
Show file tree
Hide file tree
Showing 9 changed files with 353 additions and 120 deletions.
81 changes: 47 additions & 34 deletions src/main/java/kr/co/shineware/nlp/komoran/constant/SYMBOL.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* You may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -19,41 +19,54 @@

/**
 * Definitions of the symbols used in komoran: special marker strings and the
 * tag names that are looked up by name elsewhere in the analyzer.
 *
 * <p>Each constant is declared exactly once. The three group arrays at the
 * bottom ({@link #NOUN}, {@link #EOMI}, {@link #JOSA}) bundle related tag
 * names so callers can test membership in a whole category.
 *
 * @author Junsoo Shin
 * @version 2.1
 * @since 2.1
 */
public class SYMBOL {

    // ---- Special markers -------------------------------------------------
    public static final String START = "BOE";
    public static final String END = "EOE";
    public static final String SPACE = "<sp>";
    public static final String NA = "NA";
    public static final String NUMBER = "<number>";

    // ---- Irregular marker and its reserved id ----------------------------
    public static final String IRREGULAR = "IRR";
    public static final int IRREGULAR_ID = -1;

    // ---- Tags that are members of NOUN -----------------------------------
    public static final String NNG = "NNG";
    public static final String NNP = "NNP";
    public static final String NNB = "NNB";
    public static final String NP = "NP";
    public static final String NR = "NR";

    // ---- Tags that are members of EOMI -----------------------------------
    public static final String EP = "EP";
    public static final String EC = "EC";
    public static final String EF = "EF";
    public static final String ETN = "ETN";
    public static final String ETM = "ETM";

    // ---- Tags that are members of JOSA -----------------------------------
    public static final String JC = "JC";
    public static final String JKB = "JKB";
    public static final String JKC = "JKC";
    public static final String JKG = "JKG";
    public static final String JKO = "JKO";
    public static final String JKS = "JKS";
    public static final String JKV = "JKV";
    public static final String JX = "JX";

    // ---- Remaining tags (not part of any group array below) --------------
    public static final String SW = "SW";
    public static final String SF = "SF";
    public static final String SS = "SS";
    public static final String VV = "VV";
    public static final String VA = "VA";
    public static final String VX = "VX";
    public static final String VCP = "VCP";
    public static final String VCN = "VCN";

    // ---- Tag groups ------------------------------------------------------
    // NOTE: Java arrays are always mutable, even behind "static final".
    // Treat these as read-only; writing to an element would change the
    // group for every caller.

    /** Tag names treated as nouns. */
    public static final String[] NOUN = new String[]{NNG, NNP, NNB, NP, NR};

    /** Tag names treated as eomi (endings). */
    public static final String[] EOMI = new String[]{EP, EC, EF, ETN, ETM};

    /** Tag names treated as josa (particles). */
    public static final String[] JOSA = new String[]{JC, JKB, JKC, JKG, JKO, JKS, JKV, JX};

}
118 changes: 32 additions & 86 deletions src/main/java/kr/co/shineware/nlp/komoran/core/model/Lattice.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

import kr.co.shineware.ds.aho_corasick.FindContext;
import kr.co.shineware.nlp.komoran.constant.SYMBOL;
import kr.co.shineware.nlp.komoran.core.model.combinationrules.CombinationRuleChecker;
import kr.co.shineware.nlp.komoran.core.model.combinationrules.NounEomiCombinationRuleChecker;
import kr.co.shineware.nlp.komoran.core.model.combinationrules.NounJosaCombinationRuleChecker;
import kr.co.shineware.nlp.komoran.core.model.combinationrules.VerbEomiCombinationRuleChecker;
import kr.co.shineware.nlp.komoran.model.MorphTag;
import kr.co.shineware.nlp.komoran.model.ScoredTag;
import kr.co.shineware.nlp.komoran.modeler.model.*;
Expand Down Expand Up @@ -29,6 +33,8 @@ public class Lattice {
private FindContext<List<IrregularNode>> irregularFindContext;
private FindContext<List<ScoredTag>> userDicFindContext;

// Combination-rule checkers: populated by registCombinationRuleChecker() and
// consulted one by one in isValidCombination().
private List<CombinationRuleChecker> combinationRuleCheckerList;

private double prevMaxScore;
private LatticeNode prevMaxNode;
private int prevMaxIdx;
Expand All @@ -47,6 +53,16 @@ public Lattice(Resources resource, Observation userDic, int nbest) {
this.init();
this.makeNewContexts();
this.nbest = nbest;
this.registCombinationRuleChecker();
}

/**
 * Creates the combination-rule checkers for this lattice and stores them in
 * {@code combinationRuleCheckerList}.
 *
 * <p>One {@link MorphUtil} and one {@link TagUtil} (built from this lattice's
 * POS table) are shared by the checkers that need them. Checkers are stored
 * in registration order: noun+josa, verb+eomi, noun+eomi.
 */
private void registCombinationRuleChecker() {
    MorphUtil jasoHelper = new MorphUtil();
    TagUtil tagHelper = new TagUtil(this.getPosTable());

    // Publish the (still empty) list first, then fill it through the alias.
    List<CombinationRuleChecker> checkers = new ArrayList<>();
    this.combinationRuleCheckerList = checkers;

    checkers.add(new NounJosaCombinationRuleChecker(jasoHelper, tagHelper));
    checkers.add(new VerbEomiCombinationRuleChecker(jasoHelper, tagHelper));
    checkers.add(new NounEomiCombinationRuleChecker(tagHelper));
}

private void setUserDicObservation(Observation userDic) {
Expand Down Expand Up @@ -272,51 +288,12 @@ private List<LatticeNode> getNbestMaxTransitionNodeFromPrevNodes(
continue;
}

//자소 결합규칙 체크
if (tagId == this.posTable.getId(SYMBOL.JKO)) {
if (this.hasJongsung(prevMorph)) {
if (morph.charAt(0) != 'ㅇ') {
continue;
}
} else {
if (morph.charAt(0) == 'ㅇ') {
continue;
}
}
} else if (tagId == this.posTable.getId(SYMBOL.JKS)
|| tagId == this.posTable.getId(SYMBOL.JKC)) {
if (this.hasJongsung(prevMorph)) {
if (morph.charAt(0) == 'ㄱ' && morph.charAt(1) == 'ㅏ') {
continue;
}
} else {
if (morph.charAt(0) == 'ㅇ' && morph.charAt(1) == 'ㅣ') {
continue;
}
}
} else if (tagId == this.posTable.getId(SYMBOL.JKB)) {
if (this.hasJongsung(prevMorph)) {
continue;
}
} else if (tagId == this.posTable.getId(SYMBOL.ETM)) {
if (!this.hasJongsung(prevMorph) && this.isPredicate(prevTagId)) {
if (morph.equals("ㅇㅡㄹ")) {
continue;
}
}
if (this.isNoun(prevTagId)) {
continue;
}

} else if (
(tagId == this.posTable.getId(SYMBOL.JX)
|| tagId == this.posTable.getId(SYMBOL.JC)
) && morph.charAt(0) == 'ㅇ') {
if (!this.hasJongsung(prevMorph) && this.isNoun(prevTagId)) {
continue;
}
//결합규칙 체크
if (!isValidCombination(prevMorph, prevTagId, morph, tagId)) {
continue;
}


double prevObservationScore = prevLatticeNode.getScore();

if (nbestPrevNodeList.size() < nbest) {
Expand Down Expand Up @@ -349,6 +326,15 @@ private List<LatticeNode> getNbestMaxTransitionNodeFromPrevNodes(
return null;
}

/**
 * Runs the given morpheme pair through every registered combination-rule
 * checker, stopping at the first one that rejects it.
 *
 * @param prevMorph surface form of the preceding morpheme
 * @param prevTagId tag id of the preceding morpheme
 * @param morph     surface form of the current morpheme
 * @param tagId     tag id of the current morpheme
 * @return {@code true} only if no checker rejects the pair (also when no
 *         checker is registered); {@code false} as soon as one does
 */
private boolean isValidCombination(String prevMorph, int prevTagId, String morph, int tagId) {
    List<CombinationRuleChecker> checkers = this.combinationRuleCheckerList;
    for (int idx = 0; idx < checkers.size(); idx++) {
        boolean accepted = checkers.get(idx).isValidRule(prevMorph, prevTagId, morph, tagId);
        if (accepted) {
            continue;
        }
        // First rejecting rule wins; later checkers are not consulted.
        return false;
    }
    return true;
}

private boolean isNoun(int prevTagId) {
return prevTagId == this.posTable.getId(SYMBOL.NNG)
|| prevTagId == this.posTable.getId(SYMBOL.NNP)
Expand Down Expand Up @@ -384,49 +370,9 @@ private LatticeNode getMaxTransitionNodeFromPrevNodes(
continue;
}

//자소 결합규칙 체크
if (tagId == this.posTable.getId(SYMBOL.JKO)) {
if (this.hasJongsung(prevMorph)) {
if (morph.charAt(0) != 'ㅇ') {
continue;
}
} else {
if (morph.charAt(0) == 'ㅇ') {
continue;
}
}
} else if (tagId == this.posTable.getId(SYMBOL.JKS)
|| tagId == this.posTable.getId(SYMBOL.JKC)) {
if (this.hasJongsung(prevMorph)) {
if (morph.charAt(0) == 'ㄱ' && morph.charAt(1) == 'ㅏ') {
continue;
}
} else {
if (morph.charAt(0) == 'ㅇ' && morph.charAt(1) == 'ㅣ') {
continue;
}
}
} else if (tagId == this.posTable.getId(SYMBOL.JKB)) {
if (this.hasJongsung(prevMorph)) {
continue;
}
} else if (tagId == this.posTable.getId(SYMBOL.ETM)) {
if (!this.hasJongsung(prevMorph) && this.isPredicate(prevTagId)) {
if (morph.equals("ㅇㅡㄹ")) {
continue;
}
}
if (this.isNoun(prevTagId)) {
continue;
}

} else if (
(tagId == this.posTable.getId(SYMBOL.JX)
|| tagId == this.posTable.getId(SYMBOL.JC)
) && morph.charAt(0) == 'ㅇ') {
if (!this.hasJongsung(prevMorph) && this.isNoun(prevTagId)) {
continue;
}
//결합규칙 체크
if (!isValidCombination(prevMorph, prevTagId, morph, tagId)) {
continue;
}

double prevObservationScore = prevLatticeNode.getScore();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package kr.co.shineware.nlp.komoran.core.model;

import kr.co.shineware.util.common.string.StringUtil;

/**
 * Small helpers for working with morphemes written as jaso strings.
 */
public class MorphUtil {

    // Inclusive bounds of the consonant letters in the Hangul Compatibility
    // Jamo block: U+3131 (KIYEOK) through U+314E (HIEUH).
    private static final char FIRST_CONSONANT = 0x3131;
    private static final char LAST_CONSONANT = 0x314e;

    // Consonants inside that range that are excluded as a jongsung:
    // U+3138 (SSANGTIKEUT), U+3143 (SSANGPIEUP), U+3149 (SSANGCIEUC).
    private static final char SSANG_TIKEUT = 0x3138;
    private static final char SSANG_PIEUP = 0x3143;
    private static final char SSANG_CIEUC = 0x3149;

    public MorphUtil() {
    }

    /**
     * Tells whether {@code sourceMorph} is character-for-character equal to
     * the jaso string obtained from {@code compareMorph}.
     *
     * <p>Only {@code compareMorph} is converted (via
     * {@code StringUtil.korean2JasoString}); {@code sourceMorph} is compared
     * as given, so it is presumably expected to be in jaso form already
     * (TODO confirm against callers).
     *
     * @param sourceMorph  string compared as-is; must not be {@code null}
     * @param compareMorph string converted to jaso before the comparison
     * @return {@code true} if both strings have the same length and the same
     *         characters at every position
     */
    public boolean isSameJaso(String sourceMorph, String compareMorph) {
        String compareMorphJaso = StringUtil.korean2JasoString(compareMorph);
        // String.equals is exactly "same length and same chars at each index".
        return sourceMorph.equals(compareMorphJaso);
    }

    /**
     * Tells whether the last character of {@code morph} is a compatibility
     * jamo consonant that can act as a jongsung (final consonant).
     *
     * @param morph morpheme to inspect; only its last character is examined
     * @return {@code true} if the last character lies in U+3131..U+314E and
     *         is not U+3138, U+3143 or U+3149; {@code false} otherwise,
     *         including when {@code morph} is {@code null} or empty
     */
    public boolean hasJongsung(String morph) {
        // An absent or empty morpheme has no last character to inspect;
        // report "no jongsung" instead of failing inside charAt(-1).
        if (morph == null || morph.isEmpty()) {
            return false;
        }
        char lastJaso = morph.charAt(morph.length() - 1);
        if (lastJaso < FIRST_CONSONANT || lastJaso > LAST_CONSONANT) {
            return false;
        }
        return lastJaso != SSANG_TIKEUT && lastJaso != SSANG_PIEUP && lastJaso != SSANG_CIEUC;
    }
}
38 changes: 38 additions & 0 deletions src/main/java/kr/co/shineware/nlp/komoran/core/model/TagUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package kr.co.shineware.nlp.komoran.core.model;

import kr.co.shineware.nlp.komoran.constant.SYMBOL;
import kr.co.shineware.nlp.komoran.modeler.model.PosTable;

/**
 * Answers category questions about tag ids by resolving the tag names listed
 * in {@link SYMBOL} through a {@link PosTable}.
 */
public class TagUtil {

    private final PosTable posTable;

    /**
     * @param posTable table used to translate tag names into tag ids
     */
    public TagUtil(PosTable posTable) {
        this.posTable = posTable;
    }

    /**
     * Looks up the id of a tag name in the underlying table.
     *
     * @param tagName tag name to resolve
     * @return whatever id the table reports for that name
     */
    public int getId(String tagName) {
        return this.posTable.getId(tagName);
    }

    /** @return {@code true} if {@code tagId} is the id of any name in {@code SYMBOL.NOUN} */
    public boolean isNoun(int tagId) {
        return matchesAny(tagId, SYMBOL.NOUN);
    }

    /** @return {@code true} if {@code tagId} is the id of any name in {@code SYMBOL.JOSA} */
    public boolean isJosa(int tagId) {
        return matchesAny(tagId, SYMBOL.JOSA);
    }

    /** @return {@code true} if {@code tagId} is the id of any name in {@code SYMBOL.EOMI} */
    public boolean isEomi(int tagId) {
        return matchesAny(tagId, SYMBOL.EOMI);
    }

    // Resolves each candidate name in order and stops at the first id match.
    private boolean matchesAny(int tagId, String[] candidateNames) {
        int idx = 0;
        while (idx < candidateNames.length) {
            if (tagId == this.posTable.getId(candidateNames[idx])) {
                return true;
            }
            idx++;
        }
        return false;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package kr.co.shineware.nlp.komoran.core.model.combinationrules;

/**
 * A single rule that decides whether a morpheme may directly follow the
 * preceding morpheme, based on both surface forms and both tag ids.
 */
public interface CombinationRuleChecker {
/**
 * Checks one (previous morpheme, current morpheme) pair against this rule.
 *
 * @param prevMorph surface form of the preceding morpheme
 * @param prevTagId tag id of the preceding morpheme (presumably an id from the POS table; confirm with callers)
 * @param morph     surface form of the current morpheme
 * @param tagId     tag id of the current morpheme
 * @return {@code true} if this rule allows the pair, {@code false} if it rejects it
 */
boolean isValidRule(String prevMorph, int prevTagId, String morph, int tagId);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package kr.co.shineware.nlp.komoran.core.model.combinationrules;

import kr.co.shineware.nlp.komoran.core.model.TagUtil;

/**
 * Combination rule that rejects an eomi-tagged morpheme placed directly after
 * a noun-tagged morpheme. Only the two tag ids are examined; the surface
 * forms are ignored.
 */
public class NounEomiCombinationRuleChecker implements CombinationRuleChecker {

    private final TagUtil tagUtil;

    /**
     * @param tagUtil helper used to classify tag ids as noun / eomi
     */
    public NounEomiCombinationRuleChecker(TagUtil tagUtil) {
        this.tagUtil = tagUtil;
    }

    /**
     * @return {@code false} when the previous tag is a noun and the current
     *         tag is an eomi; {@code true} for every other pair
     */
    @Override
    public boolean isValidRule(String prevMorph, int prevTagId, String morph, int tagId) {
        // Anything that does not follow a noun is outside this rule's scope.
        if (!this.tagUtil.isNoun(prevTagId)) {
            return true;
        }
        // After a noun, the pair is valid only if the current tag is not an eomi.
        return !this.tagUtil.isEomi(tagId);
    }
}

0 comments on commit aecc19b

Please sign in to comment.