diff --git a/src/main/java/kr/co/shineware/nlp/komoran/constant/SYMBOL.java b/src/main/java/kr/co/shineware/nlp/komoran/constant/SYMBOL.java index 14f09de8..0197c321 100644 --- a/src/main/java/kr/co/shineware/nlp/komoran/constant/SYMBOL.java +++ b/src/main/java/kr/co/shineware/nlp/komoran/constant/SYMBOL.java @@ -6,9 +6,9 @@ * Licensed under the Apache License, Version 2.0 (the "License"); * You may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,41 +19,54 @@ /** * komoran에서 사용되는 SYMBOL에 대한 정의
+ * * @author Junsoo Shin * @version 2.1 * @since 2.1 - * */ public class SYMBOL { - - public static final String START = "BOE"; - public static final String END = "EOE"; - public static final String SPACE = ""; - public static final String NA = "NA"; - - public static final String NUMBER = ""; - public static final String SW = "SW"; - public static final String SF = "SF"; - public static final String EC = "EC"; - public static final String EF = "EF"; - public static final String JKO = "JKO"; - public static final String JX = "JX"; - public static final String ETM = "ETM"; - public static final String JKS = "JKS"; - public static final String JKC = "JKC"; - public static final String IRREGULAR = "IRR"; - public static final int IRREGULAR_ID = -1; - public static final String SS = "SS"; - public static final String NNG = "NNG"; - public static final String NNP = "NNP"; - public static final String NNB = "NNB"; - - public static final String JKB = "JKB"; - public static final String VV = "VV"; - public static final String VA = "VA"; - public static final String VX = "VX"; - public static final String VCP = "VCP"; - public static final String VCN = "VCN"; - public static final String NP = "NP"; - public static final String JC = "JC"; + + + private static final String EP = "EP"; + + public static final String START = "BOE"; + public static final String END = "EOE"; + public static final String SPACE = ""; + public static final String NA = "NA"; + + public static final String NUMBER = ""; + public static final String SW = "SW"; + public static final String SF = "SF"; + public static final String EC = "EC"; + public static final String EF = "EF"; + public static final String JKO = "JKO"; + public static final String JX = "JX"; + public static final String ETM = "ETM"; + public static final String JKS = "JKS"; + public static final String JKC = "JKC"; + public static final String IRREGULAR = "IRR"; + public static final int IRREGULAR_ID = -1; + public static final String 
SS = "SS"; + public static final String NNG = "NNG"; + public static final String NNP = "NNP"; + public static final String NNB = "NNB"; + + public static final String JKB = "JKB"; + public static final String VV = "VV"; + public static final String VA = "VA"; + public static final String VX = "VX"; + public static final String VCP = "VCP"; + public static final String VCN = "VCN"; + public static final String NP = "NP"; + public static final String JC = "JC"; + public static final String JKV = "JKV"; + public static final String JKG = "JKG"; + public static final String NR = "NR"; + public static final String ETN = "ETN"; + + + public static final String[] NOUN = new String[]{SYMBOL.NNG, SYMBOL.NNP, SYMBOL.NNB, SYMBOL.NP, SYMBOL.NR}; + public static final String[] EOMI = new String[]{SYMBOL.EP, SYMBOL.EC, SYMBOL.EF, SYMBOL.ETN, SYMBOL.ETM}; + public static final String[] JOSA = new String[]{SYMBOL.JC, SYMBOL.JKB, SYMBOL.JKC, SYMBOL.JKG, SYMBOL.JKO, SYMBOL.JKS, SYMBOL.JKV, SYMBOL.JX}; + } diff --git a/src/main/java/kr/co/shineware/nlp/komoran/core/model/Lattice.java b/src/main/java/kr/co/shineware/nlp/komoran/core/model/Lattice.java index 113a661e..a121648c 100644 --- a/src/main/java/kr/co/shineware/nlp/komoran/core/model/Lattice.java +++ b/src/main/java/kr/co/shineware/nlp/komoran/core/model/Lattice.java @@ -2,6 +2,10 @@ import kr.co.shineware.ds.aho_corasick.FindContext; import kr.co.shineware.nlp.komoran.constant.SYMBOL; +import kr.co.shineware.nlp.komoran.core.model.combinationrules.CombinationRuleChecker; +import kr.co.shineware.nlp.komoran.core.model.combinationrules.NounEomiCombinationRuleChecker; +import kr.co.shineware.nlp.komoran.core.model.combinationrules.NounJosaCombinationRuleChecker; +import kr.co.shineware.nlp.komoran.core.model.combinationrules.VerbEomiCombinationRuleChecker; import kr.co.shineware.nlp.komoran.model.MorphTag; import kr.co.shineware.nlp.komoran.model.ScoredTag; import kr.co.shineware.nlp.komoran.modeler.model.*; @@ -29,6 +33,8 @@ 
public class Lattice { private FindContext> irregularFindContext; private FindContext> userDicFindContext; + private List combinationRuleCheckerList; + private double prevMaxScore; private LatticeNode prevMaxNode; private int prevMaxIdx; @@ -47,6 +53,16 @@ public Lattice(Resources resource, Observation userDic, int nbest) { this.init(); this.makeNewContexts(); this.nbest = nbest; + this.registCombinationRuleChecker(); + } + + private void registCombinationRuleChecker() { + MorphUtil morphUtil = new MorphUtil(); + TagUtil tagUtil = new TagUtil(this.getPosTable()); + this.combinationRuleCheckerList = new ArrayList<>(); + this.combinationRuleCheckerList.add(new NounJosaCombinationRuleChecker(morphUtil, tagUtil)); + this.combinationRuleCheckerList.add(new VerbEomiCombinationRuleChecker(morphUtil, tagUtil)); + this.combinationRuleCheckerList.add(new NounEomiCombinationRuleChecker(tagUtil)); } private void setUserDicObservation(Observation userDic) { @@ -272,51 +288,12 @@ private List getNbestMaxTransitionNodeFromPrevNodes( continue; } - //자소 결합규칙 체크 - if (tagId == this.posTable.getId(SYMBOL.JKO)) { - if (this.hasJongsung(prevMorph)) { - if (morph.charAt(0) != 'ㅇ') { - continue; - } - } else { - if (morph.charAt(0) == 'ㅇ') { - continue; - } - } - } else if (tagId == this.posTable.getId(SYMBOL.JKS) - || tagId == this.posTable.getId(SYMBOL.JKC)) { - if (this.hasJongsung(prevMorph)) { - if (morph.charAt(0) == 'ㄱ' && morph.charAt(1) == 'ㅏ') { - continue; - } - } else { - if (morph.charAt(0) == 'ㅇ' && morph.charAt(1) == 'ㅣ') { - continue; - } - } - } else if (tagId == this.posTable.getId(SYMBOL.JKB)) { - if (this.hasJongsung(prevMorph)) { - continue; - } - } else if (tagId == this.posTable.getId(SYMBOL.ETM)) { - if (!this.hasJongsung(prevMorph) && this.isPredicate(prevTagId)) { - if (morph.equals("ㅇㅡㄹ")) { - continue; - } - } - if (this.isNoun(prevTagId)) { - continue; - } - - } else if ( - (tagId == this.posTable.getId(SYMBOL.JX) - || tagId == this.posTable.getId(SYMBOL.JC) 
- ) && morph.charAt(0) == 'ㅇ') { - if (!this.hasJongsung(prevMorph) && this.isNoun(prevTagId)) { - continue; - } + //결합규칙 체크 + if (!isValidCombination(prevMorph, prevTagId, morph, tagId)) { + continue; } + double prevObservationScore = prevLatticeNode.getScore(); if (nbestPrevNodeList.size() < nbest) { @@ -349,6 +326,15 @@ private List getNbestMaxTransitionNodeFromPrevNodes( return null; } + private boolean isValidCombination(String prevMorph, int prevTagId, String morph, int tagId) { + for (CombinationRuleChecker combinationRuleChecker : this.combinationRuleCheckerList) { + if (!combinationRuleChecker.isValidRule(prevMorph, prevTagId, morph, tagId)) { + return false; + } + } + return true; + } + private boolean isNoun(int prevTagId) { return prevTagId == this.posTable.getId(SYMBOL.NNG) || prevTagId == this.posTable.getId(SYMBOL.NNP) @@ -384,49 +370,9 @@ private LatticeNode getMaxTransitionNodeFromPrevNodes( continue; } - //자소 결합규칙 체크 - if (tagId == this.posTable.getId(SYMBOL.JKO)) { - if (this.hasJongsung(prevMorph)) { - if (morph.charAt(0) != 'ㅇ') { - continue; - } - } else { - if (morph.charAt(0) == 'ㅇ') { - continue; - } - } - } else if (tagId == this.posTable.getId(SYMBOL.JKS) - || tagId == this.posTable.getId(SYMBOL.JKC)) { - if (this.hasJongsung(prevMorph)) { - if (morph.charAt(0) == 'ㄱ' && morph.charAt(1) == 'ㅏ') { - continue; - } - } else { - if (morph.charAt(0) == 'ㅇ' && morph.charAt(1) == 'ㅣ') { - continue; - } - } - } else if (tagId == this.posTable.getId(SYMBOL.JKB)) { - if (this.hasJongsung(prevMorph)) { - continue; - } - } else if (tagId == this.posTable.getId(SYMBOL.ETM)) { - if (!this.hasJongsung(prevMorph) && this.isPredicate(prevTagId)) { - if (morph.equals("ㅇㅡㄹ")) { - continue; - } - } - if (this.isNoun(prevTagId)) { - continue; - } - - } else if ( - (tagId == this.posTable.getId(SYMBOL.JX) - || tagId == this.posTable.getId(SYMBOL.JC) - ) && morph.charAt(0) == 'ㅇ') { - if (!this.hasJongsung(prevMorph) && this.isNoun(prevTagId)) { - continue; - } 
+ //결합규칙 체크 + if (!isValidCombination(prevMorph, prevTagId, morph, tagId)) { + continue; } double prevObservationScore = prevLatticeNode.getScore(); diff --git a/src/main/java/kr/co/shineware/nlp/komoran/core/model/MorphUtil.java b/src/main/java/kr/co/shineware/nlp/komoran/core/model/MorphUtil.java new file mode 100644 index 00000000..a1f68ba2 --- /dev/null +++ b/src/main/java/kr/co/shineware/nlp/komoran/core/model/MorphUtil.java @@ -0,0 +1,43 @@ +package kr.co.shineware.nlp.komoran.core.model; + +import kr.co.shineware.util.common.string.StringUtil; + +/** + * Character-level helpers for morpheme strings. + */ +public class MorphUtil { + + public MorphUtil() { + } + + /** + * Tells whether {@code sourceMorph} equals {@code compareMorph} after the latter has been + * converted with {@code StringUtil.korean2JasoString}. + * + * @param sourceMorph text compared as-is (NOTE(review): presumably already a jaso string - confirm against callers) + * @param compareMorph text converted to its jaso string before the comparison + * @return true when both strings have the same length and the same characters + */ + public boolean isSameJaso(String sourceMorph, String compareMorph) { + // String.equals does the same length check and char-by-char comparison as a manual loop. + return sourceMorph.equals(StringUtil.korean2JasoString(compareMorph)); + } + + /** + * Tells whether the last character of {@code morph} is a consonant jamo other than ㄸ, ㅃ or ㅉ. + * + * @param morph morpheme text, may be null or empty + * @return true when the last char is in U+3131..U+314E and is not U+3138, U+3143 or U+3149; false otherwise + */ + public boolean hasJongsung(String morph) { + // A null or empty morph has no last character: answer false instead of throwing. + if (morph == null || morph.isEmpty()) { + return false; + } + char prevLastJaso = morph.charAt(morph.length() - 1); + if (0x3131 <= prevLastJaso && prevLastJaso <= 0x314e) { + return prevLastJaso != 0x3138 && prevLastJaso != 0x3143 && prevLastJaso != 0x3149; + } + return false; + } +} diff --git a/src/main/java/kr/co/shineware/nlp/komoran/core/model/TagUtil.java b/src/main/java/kr/co/shineware/nlp/komoran/core/model/TagUtil.java new file mode 100644 index 00000000..8c4b2e29 --- /dev/null +++ b/src/main/java/kr/co/shineware/nlp/komoran/core/model/TagUtil.java @@ -0,0 +1,38 @@ +package kr.co.shineware.nlp.komoran.core.model; + +import kr.co.shineware.nlp.komoran.constant.SYMBOL; +import kr.co.shineware.nlp.komoran.modeler.model.PosTable; + +public class TagUtil { + + private final PosTable posTable; + + public TagUtil(PosTable posTable) { + this.posTable = posTable; + } + + public int getId(String tagName) { + return this.posTable.getId(tagName); + } + + private boolean hasTagName(int tagId,
String[] symbols) { + for (String tagName : symbols) { + if (tagId == this.posTable.getId(tagName)) { + return true; + } + } + return false; + } + + public boolean isJosa(int tagId) { + return hasTagName(tagId, SYMBOL.JOSA); + } + + public boolean isNoun(int tagId) { + return hasTagName(tagId, SYMBOL.NOUN); + } + + public boolean isEomi(int tagId) { + return hasTagName(tagId, SYMBOL.EOMI); + } +} diff --git a/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/CombinationRuleChecker.java b/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/CombinationRuleChecker.java new file mode 100644 index 00000000..61476161 --- /dev/null +++ b/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/CombinationRuleChecker.java @@ -0,0 +1,11 @@ +package kr.co.shineware.nlp.komoran.core.model.combinationrules; + +/** + * Decides whether one tagged morpheme is accepted directly after another. + */ +public interface CombinationRuleChecker { + /** + * @return true when {@code morph} tagged {@code tagId} is accepted after {@code prevMorph} tagged {@code prevTagId} + */ + boolean isValidRule(String prevMorph, int prevTagId, String morph, int tagId); +} diff --git a/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/NounEomiCombinationRuleChecker.java b/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/NounEomiCombinationRuleChecker.java new file mode 100644 index 00000000..671a0916 --- /dev/null +++ b/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/NounEomiCombinationRuleChecker.java @@ -0,0 +1,23 @@ +package kr.co.shineware.nlp.komoran.core.model.combinationrules; + +import kr.co.shineware.nlp.komoran.core.model.TagUtil; + +/** + * Rejects an eomi (ending) tag that directly follows a noun-class tag. + */ +public class NounEomiCombinationRuleChecker implements CombinationRuleChecker { + + private final TagUtil tagUtil; + + public NounEomiCombinationRuleChecker(TagUtil tagUtil) { + this.tagUtil = tagUtil; + } + + /** + * Returns false only when {@code prevTagId} is a noun tag and {@code tagId} is an eomi tag; the morpheme texts are not inspected. + */ + @Override + public boolean isValidRule(String prevMorph, int prevTagId, String morph, int tagId) { + return !(this.tagUtil.isNoun(prevTagId) && this.tagUtil.isEomi(tagId)); + } +} diff --git 
a/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/NounJosaCombinationRuleChecker.java b/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/NounJosaCombinationRuleChecker.java new file mode 100644 index 00000000..15480936 --- /dev/null +++ b/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/NounJosaCombinationRuleChecker.java @@ -0,0 +1,105 @@ +package kr.co.shineware.nlp.komoran.core.model.combinationrules; + +import kr.co.shineware.nlp.komoran.constant.SYMBOL; +import kr.co.shineware.nlp.komoran.core.model.MorphUtil; +import kr.co.shineware.nlp.komoran.core.model.TagUtil; + +/** + * Checks a josa (particle) that follows a noun-class morpheme against the final consonant + * (jongsung) of that noun. + */ +public class NounJosaCombinationRuleChecker implements CombinationRuleChecker { + + private final MorphUtil morphUtil; + private final TagUtil tagUtil; + + public NounJosaCombinationRuleChecker(MorphUtil morphUtil, TagUtil tagUtil) { + this.morphUtil = morphUtil; + this.tagUtil = tagUtil; + } + + /** + * Returns whether {@code morph} tagged {@code tagId} may follow {@code prevMorph} tagged + * {@code prevTagId}. Pairs other than noun + josa, and josa tags that have no branch below, + * are always accepted. + * + * NOTE(review): each branch accepts only the forms it lists, so other particles with the same + * tag (e.g. 도/JX, 에/JKB) are rejected after a noun - confirm this allow-list is intended. + */ + @Override + public boolean isValidRule(String prevMorph, int prevTagId, String morph, int tagId) { + // previous morpheme is noun-class and the current one is a josa + if (this.tagUtil.isNoun(prevTagId) && this.tagUtil.isJosa(tagId)) { + + boolean hasJongsung = this.morphUtil.hasJongsung(prevMorph); + + // subject case marker (JKS) + if (this.tagUtil.getId(SYMBOL.JKS) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "이"); + } else { + // a noun without jongsung takes 가, exactly as in the JKC branch + return this.morphUtil.isSameJaso(morph, "가"); + } + } + // complement case marker (JKC) + else if (this.tagUtil.getId(SYMBOL.JKC) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "이"); + } else { + return this.morphUtil.isSameJaso(morph, "가"); + } + } + // object case marker (JKO) + else if (this.tagUtil.getId(SYMBOL.JKO) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "을"); + } else { + return this.morphUtil.isSameJaso(morph, "ㄹ") + || this.morphUtil.isSameJaso(morph, "를"); + } + } + // vocative case marker (JKV) + else if (this.tagUtil.getId(SYMBOL.JKV) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "아"); + } else { + return 
this.morphUtil.isSameJaso(morph, "야"); + } + } + // conjunctive particle (JC) + else if (this.tagUtil.getId(SYMBOL.JC) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "과") + || this.morphUtil.isSameJaso(morph, "이나") + || this.morphUtil.isSameJaso(morph, "이랑"); + } else { + return this.morphUtil.isSameJaso(morph, "와") + || this.morphUtil.isSameJaso(morph, "나") + || this.morphUtil.isSameJaso(morph, "랑"); + } + } + // adverbial case marker (JKB) + else if (this.tagUtil.getId(SYMBOL.JKB) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "과") + || this.morphUtil.isSameJaso(morph, "으로"); + } else { + return this.morphUtil.isSameJaso(morph, "와") + || this.morphUtil.isSameJaso(morph, "로"); + } + } + // auxiliary particle (JX) + else if (this.tagUtil.getId(SYMBOL.JX) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "은") + || this.morphUtil.isSameJaso(morph, "이란"); + } else { + return this.morphUtil.isSameJaso(morph, "는") + || this.morphUtil.isSameJaso(morph, "란"); + } + } + } + return true; + } +} diff --git a/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/VerbEomiCombinationRuleChecker.java b/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/VerbEomiCombinationRuleChecker.java new file mode 100644 index 00000000..e91cdea3 --- /dev/null +++ b/src/main/java/kr/co/shineware/nlp/komoran/core/model/combinationrules/VerbEomiCombinationRuleChecker.java @@ -0,0 +1,46 @@ +package kr.co.shineware.nlp.komoran.core.model.combinationrules; + +import kr.co.shineware.nlp.komoran.constant.SYMBOL; +import kr.co.shineware.nlp.komoran.core.model.MorphUtil; +import kr.co.shineware.nlp.komoran.core.model.TagUtil; + +public class VerbEomiCombinationRuleChecker implements CombinationRuleChecker { + + private final MorphUtil morphUtil; + private final TagUtil tagUtil; + + public VerbEomiCombinationRuleChecker(MorphUtil morphUtil, TagUtil tagUtil) { + this.morphUtil = morphUtil; + this.tagUtil = tagUtil; + } + + @Override + public boolean 
isValidRule(String prevMorph, int prevTagId, String morph, int tagId) { + + // previous morpheme is a verb (VV) + if (this.tagUtil.getId(SYMBOL.VV) == prevTagId) { + boolean hasJongsung = this.morphUtil.hasJongsung(prevMorph); + + // adnominal ending (ETM) - NOTE(review): only the forms below pass, so e.g. 는/ETM is rejected after VV; confirm intended + if (this.tagUtil.getId(SYMBOL.ETM) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "을") + || this.morphUtil.isSameJaso(morph, "은"); + } else { + return this.morphUtil.isSameJaso(morph, "ㄹ") + || this.morphUtil.isSameJaso(morph, "ㄴ"); + } + } + // nominalizing ending (ETN) - NOTE(review): only the forms below pass, so e.g. 기/ETN is rejected after VV; confirm intended + else if (this.tagUtil.getId(SYMBOL.ETN) == tagId) { + if (hasJongsung) { + return this.morphUtil.isSameJaso(morph, "음"); + } else { + return this.morphUtil.isSameJaso(morph, "ㅁ"); + } + } + } + + return true; + } +} diff --git a/src/test/java/kr/co/shineware/nlp/komoran/issue/AnalyzeIssues.java b/src/test/java/kr/co/shineware/nlp/komoran/issue/AnalyzeIssues.java index a77553c2..d8e19210 100644 --- a/src/test/java/kr/co/shineware/nlp/komoran/issue/AnalyzeIssues.java +++ b/src/test/java/kr/co/shineware/nlp/komoran/issue/AnalyzeIssues.java @@ -51,4 +51,50 @@ public void issue77() { Assert.assertEquals("황토/NNG 은/NNG", analyzeResult); Assert.assertEquals("황토/NNG 은/NNG", nBestAnalyzeResult); } + + @Test + //https://github.com/shin285/KOMORAN/issues/88 + public void issue88() { + //TODO : add positive cases (valid noun + josa pairs must still be produced) + assertNotEqualsOfAnalyzeResult("가위이", "가위/NNG 이/JKS"); + assertNotEqualsOfAnalyzeResult("마늘가", "마늘/NNG 가/JKS"); + + assertNotEqualsOfAnalyzeResult("가위이", "가위/NNG 이/JKC"); + assertNotEqualsOfAnalyzeResult("마늘가", "마늘/NNG 가/JKC"); + + assertNotEqualsOfAnalyzeResult("가위을", "가위/NNG 을/JKO"); + assertNotEqualsOfAnalyzeResult("마늘를", "마늘/NNG 를/JKO"); + + assertNotEqualsOfAnalyzeResult("가위아", "가위/NNG 아/JKV"); + assertNotEqualsOfAnalyzeResult("마늘야", "마늘/NNG 야/JKV"); + + assertNotEqualsOfAnalyzeResult("가위과", "가위/NNG 과/JC"); + assertNotEqualsOfAnalyzeResult("가위이나", "가위/NNG 이나/JC"); + assertNotEqualsOfAnalyzeResult("가위이랑", "가위/NNG 이랑/JC"); + assertNotEqualsOfAnalyzeResult("마늘와", "마늘/NNG 
와/JC"); + assertNotEqualsOfAnalyzeResult("마늘나", "마늘/NNG 나/JC"); + assertNotEqualsOfAnalyzeResult("마늘랑", "마늘/NNG 랑/JC"); + + assertNotEqualsOfAnalyzeResult("가위과", "가위/NNG 과/JKB"); + assertNotEqualsOfAnalyzeResult("가위으로", "가위/NNG 으로/JKB"); + assertNotEqualsOfAnalyzeResult("마늘와", "마늘/NNG 와/JKB"); + assertNotEqualsOfAnalyzeResult("마늘로", "마늘/NNG 로/JKB"); + + assertNotEqualsOfAnalyzeResult("가위은", "가위/NNG 은/JX"); + assertNotEqualsOfAnalyzeResult("가위이란", "가위/NNG 이란/JX"); + assertNotEqualsOfAnalyzeResult("마늘는", "마늘/NNG 는/JX"); + assertNotEqualsOfAnalyzeResult("마늘란", "마늘/NNG 란/JX"); + } + + /** + * Asserts that the 1-best and the first 2-best analyses of {@code input} agree and that + * neither equals {@code unexpected}. + */ + private void assertNotEqualsOfAnalyzeResult(String input, String unexpected) { + String analyzeResult = komoran.analyze(input).getPlainText(); + String nBestAnalyzeResult = komoran.analyze(input, 2).get(0).getPlainText(); + Assert.assertEquals(analyzeResult, nBestAnalyzeResult); + Assert.assertNotEquals(unexpected, analyzeResult); + Assert.assertNotEquals(unexpected, nBestAnalyzeResult); + } }