In [1]:
import math
import sys
# Put the parent directory first on the module search path before importing soynlp.
# NOTE(review): presumably so that the checkout in '../' is imported rather than an installed copy - confirm.
sys.path.insert(0, '../')
import soynlp
# Show which soynlp version was actually imported.
print(soynlp.__version__)

0.0.46


# Movie comment data

In [2]:
# Movie comments: one document per line, tab-separated as <idx, comment, rate>.
corpus_path = 'merged_comments.txt' # set your data path

sentences = []
with open(corpus_path, encoding='utf-8') as f:
    for doc in f:
        columns = doc.split('\t')
        # Skip blank or malformed lines that have no comment column;
        # indexing columns[1] on them would raise IndexError.
        if len(columns) < 2:
            continue
        comment = columns[1]
        # Keep only documents whose comment field is non-empty.
        if comment:
            sentences.append(comment)
print('num sentences = %d' % len(sentences))

num sentences = 294493


## Noun Extraction

In [3]:
from soynlp.noun import LRNounExtractor_v2

# Configure the L-R noun extractor (v2) used on the movie comments.
noun_extractor = LRNounExtractor_v2(
    max_left_length=10,
    max_right_length=9,
    min_num_of_features=2,
    max_frequency_when_noun_is_eojeol=10,
    min_eojeol_frequency=2,
    extract_compound=True,
    extract_pos_feature=False,
    verbose=True,
)

# Train on the sentences and extract nouns in a single call.
noun_scores = noun_extractor.train_extract(
    sentences,
    min_noun_score=0.3,
    # minimum frequency of an extracted noun
    min_noun_frequency=2,
    min_eojeol_frequency=1,
    # do not reset the L-R graph (for predicator extraction)
    reset_lrgraph=False,
)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=1260, neg=1173, common=12
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 89170 from 294493 sents. mem=0.165 Gb                     
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. mem=0.296 Gb
[Noun Extractor] batch prediction was completed for 28290 words
[Noun Extractor] checked compounds. discovered 15 compounds
[Noun Extractor] postprocessing detaching_features : 9458 -> 9335
[Noun Extractor] postprocessing ignore_features : 9335 -> 9257
[Noun Extractor] postprocessing ignore_NJ : 9257 -> 9131
[Noun Extractor] 9131 nouns (15 compounds) with min frequency=2
[Noun Extractor] flushing was done. mem=0.332 Gb                    
[Noun Extractor] 68.04 % eojeols are covered


In [4]:
# Look up the extracted entry (frequency and score) of a single noun.
noun_scores['크리스토퍼']

NounScore(frequency=686, score=1.0)

In [5]:
def _noun_weight(noun):
    """Ranking weight of a noun: sqrt(frequency) * score."""
    entry = noun_scores[noun]
    return math.sqrt(entry.frequency) * entry.score

# Show the 30 nouns with the largest weight, highest first.
top_nouns = sorted(noun_scores, key=_noun_weight, reverse=True)[:30]
for noun in top_nouns:
    print('{}: {}'.format(noun, noun_scores[noun]))

영화: NounScore(frequency=61200, score=0.7619601007166377)
정말: NounScore(frequency=18842, score=1.0)
너무: NounScore(frequency=13708, score=1.0)
진짜: NounScore(frequency=14504, score=0.9568345323741008)
최고: NounScore(frequency=12792, score=0.8792405063291139)
감독: NounScore(frequency=6591, score=1.0)
재미: NounScore(frequency=6048, score=0.9528061224489796)
생각: NounScore(frequency=5616, score=0.9793271179570329)
배우들: NounScore(frequency=5065, score=0.9925449000338868)
ㅋㅋ: NounScore(frequency=4880, score=1.0)
감동: NounScore(frequency=4902, score=0.9840525328330206)
평점: NounScore(frequency=6226, score=0.8520084566596194)
마지막: NounScore(frequency=4508, score=1.0)
이런: NounScore(frequency=4861, score=0.904540379605508)
스토리: NounScore(frequency=4991, score=0.8579169175195666)
것: NounScore(frequency=3629, score=1.0)
그래: NounScore(frequency=3579, score=0.9900787861103005)
느낌: NounScore(frequency=3482, score=1.0)
내: NounScore(frequency=3778, score=0.9359138682389131)
대박: NounScore(frequency=3174, score=

## Predicator Extraction

In [6]:
from soynlp.predicator import PredicatorExtractor

# Inputs taken over from the noun extraction step.
noun_pos_features = noun_extractor._pos_features
nouns = set(noun_scores)

# Create the extractor with both eomi and stem extraction switched on.
predicator_extractor = PredicatorExtractor(
    nouns=nouns,
    # used to filter out "noun + josa/adjective" eojeols
    noun_pos_features=noun_pos_features,
    extract_eomi=True,
    extract_stem=True,
    verbose=True,
)

predicator_extractor.train(sentences, min_eojeol_frequency=3)

# Dictionary sizes before extract() is called.
print('num stems (before stem extraction) = %d' % len(predicator_extractor._stems))
print('num eomis (before eomi extraction) = %d' % len(predicator_extractor._eomis))

predicators = predicator_extractor.extract(
    min_predicator_frequency=3,
    reset_lrgraph=True,
    # Eomi extractor
    min_num_of_features=2,
    min_eomi_score=0.3,
    min_eomi_frequency=1,
    # Stem extractor
    min_num_of_unique_R_char=10,
    min_entropy_of_R_char=0.5,
    min_entropy_of_R=1.5,
    min_stem_score=0.7,
    min_stem_frequency=5,
)

# Dictionary sizes after extract() has run.
print('num stems (after stem extraction) = %d' % len(predicator_extractor._stems))
print('num eomis (after eomi extraction) = %d' % len(predicator_extractor._eomis))

[Predicator Extractor] counting eojeols was done. 60208 eojeols, mem=0.332 Gb                    
[Predicator Extractor] complete eojeol counter -> lr graph
[Predicator Extractor] has been trained. mem=0.332 Gb
num stems (before stem extraction) = 1248
num eomis (before eomi extraction) = 1120
[Eomi Extractor] batch prediction was completed for 5500 words
[Eomi Extractor] eomi lemmatization with 714 candidates
[Eomi Extractor] 1444 eomis extracted with min frequency = 1, min score = 0.3
[Predicator Extractor] 1170 eomis have been extracted
[Stem Extractor] Initializing was done with 1248 stems and 2290 eomisis
[Stem Extractor] batch prediction for 4695 candidates
[Stem Extractor] 37 stems, 18 surfacial stems, 13 removals
[Predicator Extractor] 37 stems have been extracted
[Predicator Extractor] lemma candidating was done. 64.430 % eojeols are covered
[Predicator Extractor] 6457 predicators are extracted
num stems (after stem extraction) = 1285
num eomis (after eomi extraction) = 2290


In [7]:
# Surface forms ordered from most to least frequent.
frequency_sorted_predicators = sorted(
    predicators,
    key=lambda word: predicators[word].frequency,
    reverse=True,
)

# Two samples: the 50 most frequent forms and a slice near the low-frequency end.
word_sets = [
    frequency_sorted_predicators[:50],
    frequency_sorted_predicators[-500:-400],
]

separator = '--' * 30
for word_set in word_sets:
    for word in word_set:
        print('{}: {}'.format(word, predicators[word]))
    print(separator)

보고: Predicator(frequency=5899, lemma={('보', '고')})
내가: Predicator(frequency=3718, lemma={('내', '가'), ('낳', '아가')})
봤는데: Predicator(frequency=3560, lemma={('보', '았는데')})
하지만: Predicator(frequency=3507, lemma={('하', '지만')})
이건: Predicator(frequency=3414, lemma={('이', '건')})
있는: Predicator(frequency=2966, lemma={('있', '는'), ('이', 'ㅆ는')})
이렇게: Predicator(frequency=2891, lemma={('이렇', '게')})
없는: Predicator(frequency=2848, lemma={('없', '는')})
보는: Predicator(frequency=2775, lemma={('보', '는')})
없다: Predicator(frequency=2686, lemma={('없', '다')})
재밌게: Predicator(frequency=2573, lemma={('재밌', '게')})
하는: Predicator(frequency=2453, lemma={('하', '는')})
좋은: Predicator(frequency=2429, lemma={('좋', '은'), ('좋으', 'ㄴ')})
이게: Predicator(frequency=2391, lemma={('이', '게')})
내내: Predicator(frequency=2140, lemma={('낳', '아내'), ('내', '내')})
보세요: Predicator(frequency=2105, lemma={('보', '세요')})
보면: Predicator(frequency=2071, lemma={('보', '면')})
같다: Predicator(frequency=1847, lemma={('같', '다')})
같은: Predicator(freq

# Chat text

## Noun Extraction

In [8]:
import pickle
from soynlp.utils import LRGraph

# build noun extractor from {l:{r:count}} dict
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('../tmp/lrgraph_dict_for_nounextraction.pkl', 'rb') as f:
    lrgraph = LRGraph(pickle.load(f))

# LRNounExtractor_v2 is imported in the movie-comment noun extraction cell.
chat_noun_extractor = LRNounExtractor_v2(
    extract_pos_feature=True
)

# No train() call is made here: the prebuilt graph is attached directly.
chat_noun_extractor.lrgraph = lrgraph
# presumably _count_sum is the total eojeol count of the graph - TODO confirm
chat_noun_extractor._num_of_eojeols = lrgraph.to_EojeolCounter()._count_sum

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=1260, neg=1173, common=12


In [9]:
# Extract nouns using the L-R graph that was attached to the extractor.
chat_noun_scores = chat_noun_extractor.extract(
    # do not reset the L-R graph (for predicator extraction)
    reset_lrgraph=False,
    # minimum frequency of an extracted noun
    min_noun_frequency=2,
    min_noun_score=0.3,
)

[Noun Extractor] extract and append pos features
[Noun Extractor] batch prediction for extracting pos feature
[Noun Extractor] batch prediction was completed for 167504 words
[Noun Extractor] features appended. pos=1260 -> 1260, neg=1173 -> 1173, common=12 -> 12
[Noun Extractor] 0 pos features were extracted
[Noun Extractor] batch prediction was completed for 167504 words
[Noun Extractor] checked compounds. discovered 13 compounds
[Noun Extractor] postprocessing detaching_features : 58046 -> 56981
[Noun Extractor] postprocessing ignore_features : 56981 -> 56867
[Noun Extractor] postprocessing ignore_NJ : 56867 -> 55706
[Noun Extractor] 55706 nouns (13 compounds) with min frequency=2
[Noun Extractor] flushing was done. mem=1.782 Gb                    
[Noun Extractor] 58.14 % eojeols are covered


In [10]:
# Spot-check a few words; a word that was not extracted prints as None.
probe_words = '여부 서가앤쿡 맘스터치 신라스테이 설입 '.split()
for word in probe_words:
    noun_score = chat_noun_scores.get(word, None)
    print('{}: {}'.format(word, noun_score))

여부: NounScore(frequency=4831, score=0.7579092159559835)
서가앤쿡: NounScore(frequency=121, score=1.0)
맘스터치: NounScore(frequency=336, score=0.9178082191780822)
신라스테이: NounScore(frequency=11, score=1.0)
설입: NounScore(frequency=71, score=1.0)


In [11]:
# Sort the nouns by descending frequency once and reuse the ordering for
# both slices, rather than sorting the whole dictionary per slice.
frequency_sorted_chat_nouns = sorted(chat_noun_scores, key=lambda x:-chat_noun_scores[x].frequency)

# Two samples: the 50 most frequent nouns and ranks 3000-3049.
word_sets = [
    frequency_sorted_chat_nouns[:50],
    frequency_sorted_chat_nouns[3000:3050]
]
for word_set in word_sets:
    for noun in word_set:
        print('{}: {}'.format(noun, chat_noun_scores[noun]))
    print('--' * 30)

내: NounScore(frequency=253502, score=0.9581029653240607)
그래: NounScore(frequency=219531, score=0.8884650675055709)
오늘: NounScore(frequency=202989, score=0.9519450800915332)
진짜: NounScore(frequency=184587, score=0.8989955357142857)
근데: NounScore(frequency=173788, score=0.8535127055306427)
너무: NounScore(frequency=157721, score=0.9843899330997132)
그냥: NounScore(frequency=147720, score=0.7358490566037735)
오빠: NounScore(frequency=146246, score=0.9886392234405896)
지금: NounScore(frequency=140182, score=0.8701720220707563)
그럼: NounScore(frequency=132727, score=0.8043704474505723)
사랑: NounScore(frequency=121283, score=0.9992462852630823)
내일: NounScore(frequency=101316, score=0.7156619678401926)
많이: NounScore(frequency=100460, score=0.8667058132706987)
응응: NounScore(frequency=85628, score=1.0)
우리: NounScore(frequency=84382, score=0.9788417943257539)
너: NounScore(frequency=78604, score=0.9711176803572552)
미안: NounScore(frequency=73317, score=0.9991863447421951)
헤헤: NounScore(frequency=71407, scor

**TODO**
1. "N is N+J" postprocessing 에서 길이가 2 이하인 단어는 검사하지 않고 pass 하기 때문에, "욕 + 을" 처럼 "명사 + 조사" 로 이루어진 두 글자 어절("욕을")이 명사로 추출됨. 두 글자 명사는 각 글자가 명사, 조사인 경우가 많아서 postprocessing 만으로는 처리하기 어려움

## Predicator Extraction

In [12]:
from soynlp.predicator import PredicatorExtractor

# Inputs taken over from the chat noun extraction step.
chat_noun_pos_features = chat_noun_extractor._pos_features
chat_nouns = set(chat_noun_scores)
chat_lrgraph = LRGraph(chat_noun_extractor.lrgraph._lr)

# Create the extractor with both eomi and stem extraction switched on.
chat_predicator_extractor = PredicatorExtractor(
    nouns=chat_nouns,
    # used to filter out "noun + josa/adjective" eojeols
    noun_pos_features=chat_noun_pos_features,
    extract_eomi=True,
    extract_stem=True,
    verbose=True,
)

# An LRGraph can be passed to train() in place of the sentences.
chat_predicator_extractor.train(chat_lrgraph, min_eojeol_frequency=3)

# Dictionary sizes before extract() is called.
print('num stems (before stem extraction) = %d' % len(chat_predicator_extractor._stems))
print('num eomis (before eomi extraction) = %d' % len(chat_predicator_extractor._eomis))

chat_predicators = chat_predicator_extractor.extract(
    min_predicator_frequency=5,
    reset_lrgraph=True,
    # Eomi extractor
    min_num_of_features=4,
    min_eomi_score=0.3,
    min_eomi_frequency=10,
    # Stem extractor
    min_num_of_unique_R_char=5,
    min_entropy_of_R_char=0.5,
    min_entropy_of_R=1.5,
    min_stem_score=0.7,
    min_stem_frequency=5,
)

# Dictionary sizes after extract() has run.
print('num stems (after stem extraction) = %d' % len(chat_predicator_extractor._stems))
print('num eomis (after eomi extraction) = %d' % len(chat_predicator_extractor._eomis))

num stems (before stem extraction) = 1248
num eomis (before eomi extraction) = 1120
[Eomi Extractor] batch prediction was completed for 103217 words
[Eomi Extractor] eomi lemmatization with 5460 candidates
[Eomi Extractor] 10717 eomis extracted with min frequency = 10, min score = 0.3
[Predicator Extractor] 10201 eomis have been extracted
[Stem Extractor] Initializing was done with 1248 stems and 11321 eomisis
[Stem Extractor] batch prediction for 58670 candidates
[Stem Extractor] 2553 stems, 1585 surfacial stems, 2889 removals
[Predicator Extractor] 2522 stems have been extracted
[Predicator Extractor] lemma candidating was done. 69.797 % eojeols are covered
[Predicator Extractor] 75010 predicators are extracted
num stems (after stem extraction) = 3770
num eomis (after eomi extraction) = 11321


In [13]:
# Surface forms ordered from most to least frequent.
frequency_sorted_chat_predicators = sorted(
    chat_predicators,
    key=lambda word: chat_predicators[word].frequency,
    reverse=True,
)

# Two samples: the 50 most frequent forms and ranks 10400-10499.
word_sets = [
    frequency_sorted_chat_predicators[:50],
    frequency_sorted_chat_predicators[10400:10500],
]

separator = '--' * 30
for word_set in word_sets:
    for word in word_set:
        print('{}: {}'.format(word, chat_predicators[word]))
    print(separator)

내가: Predicator(frequency=251537, lemma={('내', '가'), ('낳', '아가'), ('내그', '아')})
이제: Predicator(frequency=170348, lemma={('이', '제')})
나도: Predicator(frequency=150243, lemma={('나', '도'), ('낳', '도')})
나는: Predicator(frequency=96965, lemma={('나', '는'), ('낳', '는')})
자기: Predicator(frequency=88198, lemma={('자', '기')})
같이: Predicator(frequency=61417, lemma={('같', '이')})
다시: Predicator(frequency=43174, lemma={('닿', '시')})
그렇게: Predicator(frequency=41940, lemma={('그렇', '게')})
하고: Predicator(frequency=40197, lemma={('하', '고')})
나두: Predicator(frequency=40086, lemma={('나', '두'), ('낳', '두')})
너가: Predicator(frequency=38115, lemma={('너그', '아'), ('넣', '가')})
그리고: Predicator(frequency=34275, lemma={('그리', '고')})
빨리: Predicator(frequency=33545, lemma={('빨', '리')})
좋아: Predicator(frequency=30761, lemma={('좋', '아')})
어떻게: Predicator(frequency=29312, lemma={('어떻', '게')})
있어: Predicator(frequency=29049, lemma={('이', 'ㅆ어'), ('있', '어')})
이렇게: Predicator(frequency=26173, lemma={('이렇', '게')})
맞아: Predicator(fr

In [14]:
from soynlp.lemmatizer import _lemma_candidate

# Left-side substring (L) whose right-side continuations (R) are inspected.
l = '먹'

# Stem and eomi collections of the extractor, looked up once before the loop.
known_stems = chat_predicator_extractor._stems
known_eomis = chat_predicator_extractor._eomis

for r, count in chat_predicator_extractor.lrgraph.get_r(l, topk=-1):
    # Try every split point of r: the first i characters of r are appended
    # to l, and only (stem, eomi) candidates that are both known are kept.
    lemmas = set()
    for i in range(len(r)+1):
        for stem, eomi in _lemma_candidate(l+r[:i], r[i:]):
            if stem in known_stems and eomi in known_eomis:
                lemmas.add((stem, eomi))
    # Sort the pairs so the printed order does not depend on set iteration order.
    lemmas_strf = '' if not lemmas else '[%s]' % ', '.join('%s+%s'%lr for lr in sorted(lemmas))

    print('{} - {} ({}) / {}'.format(l, r, count, lemmas_strf))

먹 - 어 (13159) / [먹+어]
먹 - 었어 (10510) / [먹+었어]
먹 - 구 (9331) / [먹+구]
먹 - 으면 (5906) / [먹+으면]
먹 - 어요 (5638) / [먹+어요]
먹 - 었어요 (4201) / [먹+었어요]
먹 - 어야지 (3885) / [먹+어야지]
먹 - 어서 (3845) / [먹+어서]
먹 - 자 (3133) / [먹+자]
먹 - 엉 (2541) / [먹+엉]
먹 - 을 (2467) / [먹+을]
먹 - 어용 (2463) / [먹+어용]
먹 - 지 (2322) / [먹+지]
먹 - 어도 (2227) / [먹+어도]
먹 - 었는데 (1946) / [먹+었는데]
먹 - 을까 (1935) / [먹+을까]
먹 - 엇어 (1825) / [먹+엇어]
먹 - 었엉 (1513) / [먹+었엉]
먹 - 는데 (1502) / [먹+는데]
먹 - 는 (1495) / [먹+는]
먹 - 으면서 (1368) / [먹+으면서]
먹 - 는거 (1356) / [먹+는거]
먹 - 을게 (1342) / [먹+을게]
먹 - 었지 (1215) / [먹+었지]
먹 - 어야징 (1208) / [먹+어야징]
먹 - 어야 (1050) / [먹+어야]
먹 - 게 (1044) / [먹+게]
먹 - 는게 (992) / [먹+는게]
먹 - 으려고 (983) / [먹+으려고]
먹 - 는다 (954) / [먹+는다]
먹 - 었옹 (913) / [먹+었옹]
먹 - 기 (841) / [먹+기]
먹 - 은 (838) / [먹+은]
먹 - 으니까 (806) / [먹+으니까]
먹 - 네 (737) / [먹+네]
먹 - 었다 (707) / [먹+었다]
먹 - 어여 (694) / [먹+어여]
먹 - 으라고 (689) / [먹+으라고]
먹 - 어써 (663) / [먹+어써]
먹 - 었네 (660) / [먹+었네]
먹 - 어라 (655) / [먹+어라]
먹 - 었어용 (636) / [먹+었어용]
먹 - 으려구 (630) / [먹+으려구]
먹 - 오 (614) / [먹+오]
먹 - 는거야

먹 - 는거다 (21) / [먹+는거다]
먹 - 엇나봐 (21) / [먹+엇나봐]
먹 - 지두 (21) / [먹+지두]
먹 - 어뗘 (21) / [먹+어뗘]
먹 - 을게없다 (21) / 
먹 - 으려다 (21) / [먹+으려다]
먹 - 이는 (21) / [먹이+는]
먹 - 어야한다 (21) / [먹+어야한다]
먹 - 구여 (20) / [먹+구여]
먹 - 는거임 (20) / [먹+는거임]
먹 - 는데요 (20) / [먹+는데요]
먹 - 어야할 (20) / [먹+어야할]
먹 - 을둡 (20) / [먹+을둡]
먹 - 어서요 (20) / [먹+어서요]
먹 - 어야집 (20) / [먹+어야집]
먹 - 어야됭 (20) / 
먹 - 었는데요 (20) / [먹+었는데요]
먹 - 는다는데 (20) / [먹+는다는데]
먹 - 는다더니 (20) / [먹+는다더니]
먹 - 어주지 (20) / [먹+어주지]
먹 - 는데용 (20) / [먹+는데용]
먹 - 을려구용 (20) / 
먹 - 어줭 (20) / [먹+어줭]
먹 - 구싶구 (20) / 
먹 - 엇다구 (20) / [먹+엇다구]
먹 - 는거죠 (20) / [먹+는거죠]
먹 - 엇지용 (20) / [먹+엇지용]
먹 - 었거든요 (20) / [먹+었거든요]
먹 - 을지말지 (20) / 
먹 - 었소 (20) / [먹+었소]
먹 - 었네에 (20) / [먹+었네에]
먹 - 누 (20) / [먹+누]
먹 - 엇어여 (19) / [먹+엇어여]
먹 - 었꿍 (19) / 
먹 - 구있오용 (19) / [먹+구있오용]
먹 - 눈다 (19) / [먹+눈다]
먹 - 으라궁 (19) / 
먹 - 자했는데 (19) / [먹+자했는데]
먹 - 겠다는 (19) / [먹+겠다는]
먹 - 었어서 (19) / [먹+었어서]
먹 - 엇잖아 (19) / [먹+엇잖아]
먹 - 구왓 (19) / [먹+구왓]
먹 - 을테다 (19) / 
먹 - 을라햇는데 (19) / 
먹 - 었졉 (19) / [먹+었졉]
먹 - 았어 (19) / [먹+았어]
먹 - 어야댐 (19) 

먹 - 었겠네요 (7) / [먹+었겠네요]
먹 - 어여해 (7) / 
먹 - 자매 (7) / [먹+자매]
먹 - 어버리고 (7) / [먹+어버리고]
먹 - 엇성 (7) / 
먹 - 구있니 (7) / [먹+구있니]
먹 - 을랫 (7) / 
먹 - 는지는 (7) / [먹+는지는]
먹 - 기시른데 (7) / [먹+기시른데]
먹 - 어요오오 (7) / [먹+어요오오]
먹 - 어봐야겟다 (7) / 
먹 - 었어융 (7) / 
먹 - 구찌포 (7) / 
먹 - 었던건데 (7) / [먹+었던건데]
먹 - 구싶넹 (7) / 
먹 - 어야될거같아 (7) / 
먹 - 었셔 (7) / [먹+었셔]
먹 - 어봤거든 (7) / 
먹 - 어야겠담 (7) / 
먹 - 여살리면 (7) / 
먹 - 자햇어 (7) / [먹+자햇어]
먹 - 어또용 (7) / 
먹 - 구가야지 (7) / 
먹 - 었겠군 (7) / [먹+었겠군]
먹 - 었구먼 (7) / [먹+었구먼]
먹 - 자꾸나 (7) / [먹+자꾸나]
먹 - 구잇낭 (7) / 
먹 - 구잇옹 (7) / [먹+구잇옹]
먹 - 으려구했는데 (7) / 
먹 - 어야됨 (7) / [먹+어야됨]
먹 - 깨비 (7) / 
먹 - 을게없는데 (7) / 
먹 - 을겅 (7) / 
먹 - 었대서 (7) / 
먹 - 는고얌 (7) / [먹+는고얌]
먹 - 을깜 (7) / [먹+을깜]
먹 - 기싫어도 (7) / 
먹 - 었다냥 (7) / [먹+었다냥]
먹 - 었더요 (7) / 
먹 - 어야도 (7) / 
먹 - 는줄알고 (7) / [먹+는줄알고]
먹 - 구갈래 (7) / 
먹 - 으게 (7) / 
먹 - 죵 (7) / [먹+죵]
먹 - 을동안 (7) / [먹+을동안]
먹 - 엇데 (7) / [먹+엇데]
먹 - 었댱 (7) / 
먹 - 었댜앗 (7) / 
먹 - 겠다구 (7) / [먹+겠다구]
먹 - 겠다아 (7) / [먹+겠다아]
먹 - 지않는다 (7) / [먹+지않는다]
먹 - 게찌 (7) / [먹+게찌]
먹 - 는다묘 (7) / 
먹 - 어꾸 (7) / 


먹 - 구있는즁 (3) / 
먹 - 어준 (3) / [먹+어준]
먹 - 었숴 (3) / 
먹 - 구이쪄여 (3) / [먹+구이쪄여]
먹 - 겠니 (3) / [먹+겠니]
먹 - 어왜 (3) / 
먹 - 는거아녀 (3) / [먹+는거아녀]
먹 - 꽁 (3) / [먹+꽁]
먹 - 으려고해 (3) / 
먹 - 엇제 (3) / 
먹 - 어야하눈뎅 (3) / 
먹 - 으메 (3) / [먹+으메]
먹 - 으라더라 (3) / 
먹 - 진않아 (3) / [먹+진않아]
먹 - 자요오오 (3) / [먹+자요오오]
먹 - 었숑 (3) / 
먹 - 요 (3) / [먹+요]
먹 - 우려고 (3) / [먹우+려고, 먹+우려고]
먹 - 는거처럼 (3) / [먹+는거처럼]
먹 - 어보는거 (3) / 
먹 - 을께없어 (3) / 
먹 - 구싶어용 (3) / 
먹 - 었에 (3) / 
먹 - 어줄걸 (3) / [먹+어줄걸]
먹 - 기에도 (3) / [먹+기에도]
먹 - 으숑 (3) / 
먹 - 는줄알구 (3) / 
먹 - 어줘야하는데 (3) / 
먹 - 엇어임마 (3) / 
먹 - 눈다고 (3) / 
먹 - 으자 (3) / 
먹 - 는다잉 (3) / 
먹 - 어자갸 (3) / 
먹 - 구싶어져따 (3) / 
먹 - 어두됑 (3) / 
먹 - 골역에서 (3) / 
먹 - 을까욤 (3) / 
먹 - 겠지요 (3) / [먹+겠지요]
먹 - 으랑 (3) / 
먹 - 어야게썽 (3) / 
먹 - 는거얍 (3) / [먹+는거얍]
먹 - 었저 (3) / 
먹 - 었잖옹 (3) / 
먹 - 을라했지 (3) / 
먹 - 어봤낭 (3) / 
먹 - 어치울끄양 (3) / 
먹 - 어랍 (3) / 
먹 - 는이유 (3) / 
먹 - 었는댕 (3) / 
먹 - 어버렸옹 (3) / 
먹 - 어치우는 (3) / 
먹 - 궁하 (3) / 
먹 - 여줘요 (3) / [먹+여줘요, 먹이+어줘요]
먹 - 지않았나 (3) / [먹+지않았나]
먹 - 는댑 (3) / [먹+는댑]
먹 - 쉬돈나 (3) / 
먹 - 을텐대 (3) / 

In [15]:
# Left-side substring (L) whose right-side continuations (R) are inspected.
l = '갈'

# Stem and eomi collections of the extractor, looked up once before the loop.
known_stems = chat_predicator_extractor._stems
known_eomis = chat_predicator_extractor._eomis

for r, count in chat_predicator_extractor.lrgraph.get_r(l, topk=-1):
    # Try every split point of r: the first i characters of r are appended
    # to l, and only (stem, eomi) candidates that are both known are kept.
    lemmas = set()
    for i in range(len(r)+1):
        for stem, eomi in _lemma_candidate(l+r[:i], r[i:]):
            if stem in known_stems and eomi in known_eomis:
                lemmas.add((stem, eomi))
    # Sort the pairs so the printed order does not depend on set iteration order.
    lemmas_strf = '' if not lemmas else '[%s]' % ', '.join('%s+%s'%lr for lr in sorted(lemmas))

    print('{} - {} ({}) / {}'.format(l, r, count, lemmas_strf))

갈 - 게 (6120) / [갈+게, 가+ㄹ게]
갈 - 게요 (1852) / [가+ㄹ게요, 갈+게요]
갈 - 겡 (1219) / [갈+겡, 가+ㄹ겡]
갈 - 게용 (710) / [갈+게용, 가+ㄹ게용]
갈 - 려고 (660) / [가+ㄹ려고, 갈+려고]
갈 - 지 (525) / [갈+지, 가+ㄹ지]
갈 - 라구 (515) / [가+ㄹ라구, 가르+아구, 갈+라구]
갈 - 거 (431) / 
갈 - 려구 (413) / [가+ㄹ려구, 갈+려구]
갈 - 랭 (366) / [갈+랭, 가+ㄹ랭]
갈 - 게여 (301) / [가+ㄹ게여, 갈+게여]
갈 - 거같아 (298) / [가+ㄹ거같아, 갈+거같아]
갈 - 깡 (263) / [갈+깡, 가+ㄹ깡]
갈 - 껭 (257) / [가+ㄹ껭, 갈+껭]
갈 - 라궁 (206) / [갈+라궁, 가+ㄹ라궁, 가르+아궁]
갈 - 테니까 (201) / [가+ㄹ테니까, 갈+테니까]
갈 - 거면 (184) / [갈+거면, 가+ㄹ거면]
갈 - 거얌 (175) / [가+ㄹ거얌, 갈+거얌]
갈 - 려면 (154) / [갈+려면, 가+ㄹ려면]
갈 - 텐데 (153) / [가+ㄹ텐데, 갈+텐데]
갈 - 지도 (152) / [가+ㄹ지도, 갈+지도]
갈 - 땐 (147) / [갈+땐, 가+ㄹ땐]
갈 - 아입구 (143) / [갈아입+구]
갈 - 거양 (140) / [가+ㄹ거양, 갈+거양]
갈 - 거임 (124) / [갈+거임, 가+ㄹ거임]
갈 - 려궁 (121) / [가+ㄹ려궁, 갈+려궁]
갈 - 라면 (121) / [갈+라면, 가+ㄹ라면, 가르+아면]
갈 - 거지 (120) / 
갈 - 아 (117) / [가+ㄹ아, 갈+아]
갈 - 려구요 (111) / [갈+려구요, 가+ㄹ려구요]
갈 - 거같은데 (107) / [가+ㄹ거같은데, 갈+거같은데]
갈 - 아타고 (106) / 
갈 - 아타 (103) / 
갈 - 아타서 (102) / 
갈 - 테니 (102) / [갈+테니, 가+ㄹ테니]
갈 - 거니까 (102) / [가+ㄹ거니까, 갈+거니까]
갈 - 아서 