In [6]:
import sqlite3
import pandas as pd
from soynlp.noun import LRNounExtractor_v2
from collections import defaultdict
import re
import numpy as np
import math
from string import punctuation
from soynlp.word import WordExtractor
from soynlp.utils import DoublespaceLineCorpus

# Text-normalisation patterns (stop-character handling).

# Matches any single ASCII punctuation character from string.punctuation.
pattern1 = re.compile(
    r'[{}]'.format(re.escape(punctuation))
)

# Matches every character that is neither a complete Hangul syllable (가-힣)
# nor a plain space: symbols, lone jamo, digits, Latin letters, ...
pattern2 = re.compile(
    r'[^가-힣 ]'
)

# Matches a run of two or more whitespace characters, so that callers can
# substitute a single space for it.
pattern3 = re.compile(
    r'\s{2,}'
)

class Extracter:
    """Mine candidate new words from the ``'head'`` text column of a DataFrame.

    The instance wraps a soynlp ``LRNounExtractor_v2`` and ``WordExtractor``
    and offers one method per pipeline stage: text cleaning, noun extraction,
    dictionary filtering, per-word corpus building and feature extraction.

    Attributes:
        df: the DataFrame passed to the constructor; ``cleaning`` rewrites its
            ``'head'`` column in place.
        noun_extractor: shared ``LRNounExtractor_v2`` instance.
        word_extractor: shared ``WordExtractor`` instance.
    """

    # SQLite file holding the dictionary table ``kr`` (columns used: word, part).
    # Override on an instance or subclass to point at another location.
    dict_db_path = 'kr_korean.db'

    def __init__(self, df):
        """Store ``df`` and build the soynlp extractors.

        Args:
            df: DataFrame with a string column named ``'head'``.
        """
        self.df = df
        self.noun_extractor = LRNounExtractor_v2(verbose=True)
        # Minimum frequency is 0.01 % of the number of rows, rounded down.
        self.word_extractor = WordExtractor(min_frequency=math.floor(len(self.df)*0.0001))

    @staticmethod
    def _clean_text(text):
        """Remove punctuation, drop non-Hangul characters, collapse whitespace runs."""
        return pattern3.sub(' ', pattern2.sub('', pattern1.sub('', text)))

    def _read_dict_words(self, query):
        """Run ``query`` on the dictionary database and always close the connection."""
        conn = sqlite3.connect(self.dict_db_path)
        try:
            return pd.read_sql(query, conn)
        finally:
            conn.close()

    def cleaning(self):
        """Normalise every value of ``self.df['head']`` in place.

        Returns:
            The same DataFrame object held in ``self.df``.
        """
        self.df['head'] = self.df['head'].map(self._clean_text)
        return self.df

    def extract_nouns(self):
        """Run the noun extractor over ten consecutive slices of ``self.df``.

        Returns:
            dict mapping each extracted noun longer than one character to the
            value returned for it by ``train_extract``. When a noun appears in
            several slices, the value from the last slice wins.
        """
        n_rows = len(self.df)
        # The threshold is derived from the full frame, not from the slice size.
        min_freq = math.floor(n_rows * 0.0001)
        # Eleven evenly spaced fractions -> ten [start, stop) row ranges.
        bounds = [math.ceil(n_rows * frac) for frac in np.linspace(0, 1, 11)]

        words = {}
        for start, stop in zip(bounds[:-1], bounds[1:]):
            chunk = self.df.iloc[start:stop]
            extracted = self.noun_extractor.train_extract(chunk['head'], min_noun_frequency=min_freq)
            for noun, score in extracted.items():
                if len(noun) > 1:
                    words[noun] = score
        return words

    def search_dict(self, nouns):
        """Keep the nouns that do not occur in the dictionary text.

        Args:
            nouns: iterable of sequences whose first element is the word.

        Returns:
            DataFrame built from the items whose word was not found.
        """
        dict_words = self._read_dict_words('SELECT word FROM kr')
        # NOTE(review): this is a substring test against all dictionary words
        # joined by spaces, so a candidate that is merely *part* of a
        # dictionary entry also counts as "found". Kept as-is because exact
        # matching would change which words are returned - confirm intent.
        dict_text = ' '.join(dict_words['word'])
        return pd.DataFrame([item for item in nouns if item[0] not in dict_text])

    # Build the per-word training corpus used for the later feature extraction.
    def extract_sent(self, words):
        """Collect, for every candidate word, all headlines that contain it.

        Args:
            words: container whose item ``0`` iterates over the candidate
                words (the DataFrame returned by ``search_dict``).

        Returns:
            defaultdict mapping each word to its matching ``self.df['head']``
            values joined by two spaces.
        """
        sent = defaultdict(lambda: 0)
        # An empty frame has no column 0; there is nothing to collect.
        if len(words) == 0:
            return sent
        # Read from the arguments/instance rather than from notebook globals.
        heads = list(self.df['head'])
        for w in words[0]:
            sent[w] = '  '.join(s for s in heads if w in s)
        return sent

    def extract_statistic_value(self, sent):
        """Train the word extractor on each word's corpus and keep that word's scores.

        Args:
            sent: mapping word -> text, as produced by ``extract_sent``.

        Returns:
            defaultdict mapping word -> the entry ``extract()`` reports for it.
            Words for which the lookup fails are skipped after printing the
            error.
        """
        statistic = defaultdict(lambda: 0)
        for k, v in sent.items():
            # NOTE(review): the same extractor is trained on every iteration;
            # the notebook log shows a growing word count, which suggests the
            # training data accumulates across words - confirm this is wanted.
            self.word_extractor.train([v])
            try:
                statistic[k] = self.word_extractor.extract()[k]
            except Exception as e:
                # Best effort: typically the word is absent from the result.
                print(e)
        return statistic

    def extract_r_rat(self, sent, statistic):
        """Compute, per word, what follows it on the right-hand side.

        Args:
            sent: mapping word -> text, as produced by ``extract_sent``.
            statistic: mapping whose keys are the words to process.

        Returns:
            defaultdict mapping word -> ``{'rpprat': ..., 'rwsrat': ...}`` where
            ``rpprat`` is the share of right-side parts found in the
            postposition text and ``rwsrat`` the share of empty right-side
            parts. Words with no right-side parts, or whose extraction fails,
            are omitted.
        """
        # Single-quoted SQL string literal; double quotes denote identifiers.
        post_pos = self._read_dict_words("SELECT word FROM kr WHERE part='조사'")
        post_pos['word'] = post_pos['word'].map(self._clean_text)
        post_pos = post_pos.drop_duplicates(keep='first')
        # NOTE(review): membership below is a substring test against the
        # concatenated postpositions, not an exact match - confirm intent.
        post_pos = ''.join(post_pos['word'])

        r_rat = defaultdict(lambda: 0)
        for k in statistic.keys():
            try:
                self.noun_extractor.train_extract([sent[k]])
                # Each item is indexed as (right-side part, count).
                r_parts = list(self.noun_extractor.lrgraph.get_r(k, topk=-1))
                count = sum(item[1] for item in r_parts)
                if count == 0:
                    # Nothing follows this word; no ratio can be computed.
                    continue
                pprat = wsrat = 0
                for item in r_parts:
                    if item[0] == '':
                        wsrat = item[1]
                    elif item[0] in post_pos:
                        pprat += item[1]
                r_rat[k] = {'rpprat': pprat/count, 'rwsrat': wsrat/count}
            except Exception as e:
                # Best effort: report the failure and continue with the next word.
                print(e)
        return r_rat

In [7]:
# Load the headlines, then release the database handle immediately.
conn = sqlite3.connect('Humor.db')
try:
    df = pd.read_sql('SELECT head FROM head', conn)
finally:
    conn.close()
df = df.drop_duplicates(keep='first')
ext = Extracter(df)
# cleaning() normalises the 'head' column and returns the extractor's frame.
df = ext.cleaning()

# Candidate nouns sorted by their extractor value (descending), minus the
# ones found in the dictionary.
new_words = ext.search_dict(sorted(ext.extract_nouns().items(), key=lambda item: item[1], reverse=True))
sent = ext.extract_sent(new_words)

# Build the features.
statistic = ext.extract_statistic_value(sent)
# rpprat: ratio at which a postposition follows the noun on its right side,
# rwsrat: ratio at which white space follows the noun on its right side.
r_rat = ext.extract_r_rat(sent, statistic)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 153629 from 127907 sents. mem=0.836 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=556166, mem=0.842 Gb
[Noun Extractor] batch prediction was completed for 40185 words
[Noun Extractor] checked compounds. discovered 27173 compounds
[Noun Extractor] postprocessing detaching_features : 426 -> 423
[Noun Extractor] postprocessing ignore_features : 423 -> 397
[Noun Extractor] postprocessing ignore_NJ : 397 -> 395
[Noun Extractor] 395 nouns (27173 compounds) with min frequency=127
[Noun Extractor] flushing was done. mem=0.879 Gb                    
[Noun Extractor] 31.14 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 156261 from 127907 sents. mem=0.933 Gb                    
[Noun Extractor] complete eojeol counter -> lr grap

all branching entropies was computed # words = 21929
all accessor variety was computed # words = 21929
training was done. used memory 1.118 Gb1.118 Gb
all cohesion probabilities was computed. # words = 170
all branching entropies was computed # words = 25059
all accessor variety was computed # words = 25059
training was done. used memory 1.118 Gb1.118 Gb
all cohesion probabilities was computed. # words = 179
all branching entropies was computed # words = 26184
all accessor variety was computed # words = 26184
training was done. used memory 1.118 Gb1.118 Gb
all cohesion probabilities was computed. # words = 205
all branching entropies was computed # words = 28439
all accessor variety was computed # words = 28439
training was done. used memory 1.118 Gb1.118 Gb
all cohesion probabilities was computed. # words = 205
all branching entropies was computed # words = 28968
all accessor variety was computed # words = 28968
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilitie

all branching entropies was computed # words = 62052
all accessor variety was computed # words = 62052
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 744
all branching entropies was computed # words = 62144
all accessor variety was computed # words = 62144
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 749
all branching entropies was computed # words = 62195
all accessor variety was computed # words = 62195
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 756
all branching entropies was computed # words = 62272
all accessor variety was computed # words = 62272
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 759
all branching entropies was computed # words = 62331
all accessor variety was computed # words = 62331
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilitie

all branching entropies was computed # words = 70253
all accessor variety was computed # words = 70253
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 919
all branching entropies was computed # words = 70391
all accessor variety was computed # words = 70391
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 922
all branching entropies was computed # words = 70421
all accessor variety was computed # words = 70421
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 925
all branching entropies was computed # words = 70456
all accessor variety was computed # words = 70456
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 928
all branching entropies was computed # words = 70524
all accessor variety was computed # words = 70524
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilitie

training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1089
all branching entropies was computed # words = 76805
all accessor variety was computed # words = 76805
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1092
all branching entropies was computed # words = 76860
all accessor variety was computed # words = 76860
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1094
all branching entropies was computed # words = 76881
all accessor variety was computed # words = 76881
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1097
all branching entropies was computed # words = 77212
all accessor variety was computed # words = 77212
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1106
all branching entropies was computed # words = 77262
all accessor va

all accessor variety was computed # words = 81366
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1237
all branching entropies was computed # words = 81385
all accessor variety was computed # words = 81385
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1241
all branching entropies was computed # words = 81524
all accessor variety was computed # words = 81524
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1245
all branching entropies was computed # words = 81549
all accessor variety was computed # words = 81549
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1246
all branching entropies was computed # words = 81591
all accessor variety was computed # words = 81591
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1251
all branching entr

all branching entropies was computed # words = 84646
all accessor variety was computed # words = 84646
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1379
all branching entropies was computed # words = 84697
all accessor variety was computed # words = 84697
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1384
all branching entropies was computed # words = 84811
all accessor variety was computed # words = 84811
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1384
all branching entropies was computed # words = 84822
all accessor variety was computed # words = 84822
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1387
all branching entropies was computed # words = 84844
all accessor variety was computed # words = 84844
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabil

training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1500
all branching entropies was computed # words = 86844
all accessor variety was computed # words = 86844
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1506
all branching entropies was computed # words = 87008
all accessor variety was computed # words = 87008
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1508
all branching entropies was computed # words = 87175
all accessor variety was computed # words = 87175
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1510
all branching entropies was computed # words = 87208
all accessor variety was computed # words = 87208
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1513
all branching entropies was computed # words = 87259
all accessor va

all accessor variety was computed # words = 90363
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1637
all branching entropies was computed # words = 90414
all accessor variety was computed # words = 90414
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1644
all branching entropies was computed # words = 90480
all accessor variety was computed # words = 90480
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1645
all branching entropies was computed # words = 90490
all accessor variety was computed # words = 90490
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1647
all branching entropies was computed # words = 90525
all accessor variety was computed # words = 90525
training was done. used memory 1.128 Gb1.128 Gb
all cohesion probabilities was computed. # words = 1649
all branching entr

all branching entropies was computed # words = 93457
all accessor variety was computed # words = 93457
training was done. used memory 1.138 Gb1.138 Gb
all cohesion probabilities was computed. # words = 1766
all branching entropies was computed # words = 93573
all accessor variety was computed # words = 93573
training was done. used memory 1.138 Gb1.138 Gb
all cohesion probabilities was computed. # words = 1769
all branching entropies was computed # words = 93594
all accessor variety was computed # words = 93594
training was done. used memory 1.138 Gb1.138 Gb
all cohesion probabilities was computed. # words = 1769
all branching entropies was computed # words = 93631
all accessor variety was computed # words = 93631
training was done. used memory 1.138 Gb1.138 Gb
all cohesion probabilities was computed. # words = 1773
all branching entropies was computed # words = 93688
all accessor variety was computed # words = 93688
training was done. used memory 1.138 Gb1.138 Gb
all cohesion probabil

training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2021
all branching entropies was computed # words = 102531
all accessor variety was computed # words = 102531
training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2028
all branching entropies was computed # words = 102671
all accessor variety was computed # words = 102671
training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2029
all branching entropies was computed # words = 102690
all accessor variety was computed # words = 102690
training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2034
all branching entropies was computed # words = 102782
all accessor variety was computed # words = 102782
training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2036
all branching entropies was computed # words = 102798
all ac

training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2152
all branching entropies was computed # words = 105341
all accessor variety was computed # words = 105341
training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2153
all branching entropies was computed # words = 105379
all accessor variety was computed # words = 105379
training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2156
all branching entropies was computed # words = 105417
all accessor variety was computed # words = 105417
training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2159
all branching entropies was computed # words = 105443
all accessor variety was computed # words = 105443
training was done. used memory 1.207 Gb1.207 Gb
all cohesion probabilities was computed. # words = 2162
all branching entropies was computed # words = 105624
all ac

[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 25911 from 1 sents. mem=1.095 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=51956, mem=1.095 Gb
[Noun Extractor] batch prediction was completed for 9025 words
[Noun Extractor] checked compounds. discovered 2780 compounds
[Noun Extractor] postprocessing detaching_features : 4849 -> 4624
[Noun Extractor] postprocessing ignore_features : 4624 -> 4574
[Noun Extractor] postprocessing ignore_NJ : 4574 -> 4568
[Noun Extractor] 4568 nouns (2780 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 61.93 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 7731 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=17637, mem=1.093 Gb
[Noun Extractor] batch prediction was completed 

[Noun Extractor] checked compounds. discovered 50 compounds
[Noun Extractor] postprocessing detaching_features : 241 -> 233
[Noun Extractor] postprocessing ignore_features : 233 -> 222
[Noun Extractor] postprocessing ignore_NJ : 222 -> 221
[Noun Extractor] 221 nouns (50 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 46.65 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 5208 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=10282, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1793 words
[Noun Extractor] checked compounds. discovered 177 compounds
[Noun Extractor] postprocessing detaching_features : 710 -> 678
[Noun Extractor] postprocessing ignore_features : 678 -> 653
[Noun Extractor] postprocessing ignore_NJ : 653 -> 650
[Noun Extractor] 650 nouns (177 compounds) with m

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 46.83 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 10006 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=21637, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 3389 words
[Noun Extractor] checked compounds. discovered 594 compounds
[Noun Extractor] postprocessing detaching_features : 1626 -> 1442
[Noun Extractor] postprocessing ignore_features : 1442 -> 1408
[Noun Extractor] postprocessing ignore_NJ : 1408 -> 1402
[Noun Extractor] 1402 nouns (594 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 59.01 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2290 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph


[Noun Extractor] checked compounds. discovered 44 compounds
[Noun Extractor] postprocessing detaching_features : 284 -> 265
[Noun Extractor] postprocessing ignore_features : 265 -> 255
[Noun Extractor] postprocessing ignore_NJ : 255 -> 254
[Noun Extractor] 254 nouns (44 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 45.91 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2019 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=3325, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 644 words
[Noun Extractor] checked compounds. discovered 21 compounds
[Noun Extractor] postprocessing detaching_features : 232 -> 232
[Noun Extractor] postprocessing ignore_features : 232 -> 223
[Noun Extractor] postprocessing ignore_NJ : 223 -> 222
[Noun Extractor] 222 nouns (21 compounds) with min f

[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 4693 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=8084, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1476 words
[Noun Extractor] checked compounds. discovered 82 compounds
[Noun Extractor] postprocessing detaching_features : 512 -> 506
[Noun Extractor] postprocessing ignore_features : 506 -> 491
[Noun Extractor] postprocessing ignore_NJ : 491 -> 490
[Noun Extractor] 490 nouns (82 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 47.56 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1660 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=2777, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 503 words


[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 54.94 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 5937 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=11362, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1844 words
[Noun Extractor] checked compounds. discovered 98 compounds
[Noun Extractor] postprocessing detaching_features : 561 -> 555
[Noun Extractor] postprocessing ignore_features : 555 -> 533
[Noun Extractor] postprocessing ignore_NJ : 533 -> 533
[Noun Extractor] 533 nouns (98 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 29.32 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1080 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extr

[Noun Extractor] checked compounds. discovered 16 compounds
[Noun Extractor] postprocessing detaching_features : 143 -> 143
[Noun Extractor] postprocessing ignore_features : 143 -> 141
[Noun Extractor] postprocessing ignore_NJ : 141 -> 141
[Noun Extractor] 141 nouns (16 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 46.58 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 6533 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=10913, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 2429 words
[Noun Extractor] checked compounds. discovered 199 compounds
[Noun Extractor] postprocessing detaching_features : 784 -> 775
[Noun Extractor] postprocessing ignore_features : 775 -> 749
[Noun Extractor] postprocessing ignore_NJ : 749 -> 747
[Noun Extractor] 747 nouns (199 compounds) with m

[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 825 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1603, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 253 words
[Noun Extractor] checked compounds. discovered 8 compounds
[Noun Extractor] postprocessing detaching_features : 88 -> 87
[Noun Extractor] postprocessing ignore_features : 87 -> 79
[Noun Extractor] postprocessing ignore_NJ : 79 -> 79
[Noun Extractor] 79 nouns (8 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 54.27 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2238 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=3922, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 783 words
[Noun Extra

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 47.43 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1138 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=2065, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 386 words
[Noun Extractor] checked compounds. discovered 15 compounds
[Noun Extractor] postprocessing detaching_features : 113 -> 113
[Noun Extractor] postprocessing ignore_features : 113 -> 107
[Noun Extractor] postprocessing ignore_NJ : 107 -> 107
[Noun Extractor] 107 nouns (15 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 45.67 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1799 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extrac

[Noun Extractor] checked compounds. discovered 158 compounds
[Noun Extractor] postprocessing detaching_features : 647 -> 631
[Noun Extractor] postprocessing ignore_features : 631 -> 618
[Noun Extractor] postprocessing ignore_NJ : 618 -> 618
[Noun Extractor] 618 nouns (158 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 53.99 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2920 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=5984, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 910 words
[Noun Extractor] checked compounds. discovered 33 compounds
[Noun Extractor] postprocessing detaching_features : 327 -> 324
[Noun Extractor] postprocessing ignore_features : 324 -> 314
[Noun Extractor] postprocessing ignore_NJ : 314 -> 312
[Noun Extractor] 312 nouns (33 compounds) with min

[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1435 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=2444, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 461 words
[Noun Extractor] checked compounds. discovered 8 compounds
[Noun Extractor] postprocessing detaching_features : 116 -> 116
[Noun Extractor] postprocessing ignore_features : 116 -> 109
[Noun Extractor] postprocessing ignore_NJ : 109 -> 109
[Noun Extractor] 109 nouns (8 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 40.67 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 918 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1529, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 292 words
[Nou

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 49.36 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 7054 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=16438, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 2954 words
[Noun Extractor] checked compounds. discovered 191 compounds
[Noun Extractor] postprocessing detaching_features : 1006 -> 981
[Noun Extractor] postprocessing ignore_features : 981 -> 949
[Noun Extractor] postprocessing ignore_NJ : 949 -> 946
[Noun Extractor] 946 nouns (191 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 48.13 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 915 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Ex

[Noun Extractor] checked compounds. discovered 213 compounds
[Noun Extractor] postprocessing detaching_features : 748 -> 731
[Noun Extractor] postprocessing ignore_features : 731 -> 709
[Noun Extractor] postprocessing ignore_NJ : 709 -> 708
[Noun Extractor] 708 nouns (213 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 49.99 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 4387 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=8380, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1348 words
[Noun Extractor] checked compounds. discovered 113 compounds
[Noun Extractor] postprocessing detaching_features : 420 -> 403
[Noun Extractor] postprocessing ignore_features : 403 -> 380
[Noun Extractor] postprocessing ignore_NJ : 380 -> 377
[Noun Extractor] 377 nouns (113 compounds) with 

[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2058 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=3747, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 695 words
[Noun Extractor] checked compounds. discovered 41 compounds
[Noun Extractor] postprocessing detaching_features : 226 -> 222
[Noun Extractor] postprocessing ignore_features : 222 -> 215
[Noun Extractor] postprocessing ignore_NJ : 215 -> 215
[Noun Extractor] 215 nouns (41 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 47.45 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 684 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1136, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 244 words
[N

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 22.01 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 978 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1658, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 295 words
[Noun Extractor] checked compounds. discovered 14 compounds
[Noun Extractor] postprocessing detaching_features : 105 -> 105
[Noun Extractor] postprocessing ignore_features : 105 -> 102
[Noun Extractor] postprocessing ignore_NJ : 102 -> 102
[Noun Extractor] 102 nouns (14 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 45.42 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 4415 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extract

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 54.83 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 5038 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=10506, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1403 words
[Noun Extractor] checked compounds. discovered 57 compounds
[Noun Extractor] postprocessing detaching_features : 485 -> 482
[Noun Extractor] postprocessing ignore_features : 482 -> 470
[Noun Extractor] postprocessing ignore_NJ : 470 -> 470
[Noun Extractor] 470 nouns (57 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 46.92 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2316 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extr

[Noun Extractor] checked compounds. discovered 4 compounds
[Noun Extractor] postprocessing detaching_features : 49 -> 48
[Noun Extractor] postprocessing ignore_features : 48 -> 43
[Noun Extractor] postprocessing ignore_NJ : 43 -> 43
[Noun Extractor] 43 nouns (4 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 37.69 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1357 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=2475, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 442 words
[Noun Extractor] checked compounds. discovered 26 compounds
[Noun Extractor] postprocessing detaching_features : 159 -> 159
[Noun Extractor] postprocessing ignore_features : 159 -> 151
[Noun Extractor] postprocessing ignore_NJ : 151 -> 151
[Noun Extractor] 151 nouns (26 compounds) with min frequency=

[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1771, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 323 words
[Noun Extractor] checked compounds. discovered 15 compounds
[Noun Extractor] postprocessing detaching_features : 106 -> 103
[Noun Extractor] postprocessing ignore_features : 103 -> 98
[Noun Extractor] postprocessing ignore_NJ : 98 -> 98
[Noun Extractor] 98 nouns (15 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 45.62 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1157 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1788, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 356 words
[Noun Extractor] checked compounds. discovered 4 compounds
[Noun Extractor] postprocessing detaching_features : 95 -> 

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 47.17 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 952 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1457, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 302 words
[Noun Extractor] checked compounds. discovered 2 compounds
[Noun Extractor] postprocessing detaching_features : 55 -> 54
[Noun Extractor] postprocessing ignore_features : 54 -> 51
[Noun Extractor] postprocessing ignore_NJ : 51 -> 51
[Noun Extractor] 51 nouns (2 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 33.22 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1609 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has b

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 42.69 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1976 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=3610, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 601 words
[Noun Extractor] checked compounds. discovered 12 compounds
[Noun Extractor] postprocessing detaching_features : 180 -> 177
[Noun Extractor] postprocessing ignore_features : 177 -> 166
[Noun Extractor] postprocessing ignore_NJ : 166 -> 165
[Noun Extractor] 165 nouns (12 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 42.41 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 540 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extract

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 37.89 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 998 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1518, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 265 words
[Noun Extractor] checked compounds. discovered 4 compounds
[Noun Extractor] postprocessing detaching_features : 70 -> 70
[Noun Extractor] postprocessing ignore_features : 70 -> 67
[Noun Extractor] postprocessing ignore_NJ : 67 -> 67
[Noun Extractor] 67 nouns (4 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 37.42 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 736 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has be

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 42.05 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2830 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=5334, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 890 words
[Noun Extractor] checked compounds. discovered 42 compounds
[Noun Extractor] postprocessing detaching_features : 271 -> 269
[Noun Extractor] postprocessing ignore_features : 269 -> 253
[Noun Extractor] postprocessing ignore_NJ : 253 -> 253
[Noun Extractor] 253 nouns (42 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 43.68 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1679 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extrac

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 38.78 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1260 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1996, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 410 words
[Noun Extractor] checked compounds. discovered 7 compounds
[Noun Extractor] postprocessing detaching_features : 111 -> 110
[Noun Extractor] postprocessing ignore_features : 110 -> 105
[Noun Extractor] postprocessing ignore_NJ : 105 -> 104
[Noun Extractor] 104 nouns (7 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 38.68 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 664 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 42.36 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 10041 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=19218, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 3310 words
[Noun Extractor] checked compounds. discovered 275 compounds
[Noun Extractor] postprocessing detaching_features : 1148 -> 1099
[Noun Extractor] postprocessing ignore_features : 1099 -> 1064
[Noun Extractor] postprocessing ignore_NJ : 1064 -> 1064
[Noun Extractor] 1064 nouns (275 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 46.94 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 895 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 35.79 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1999 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=3698, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 709 words
[Noun Extractor] checked compounds. discovered 19 compounds
[Noun Extractor] postprocessing detaching_features : 203 -> 202
[Noun Extractor] postprocessing ignore_features : 202 -> 198
[Noun Extractor] postprocessing ignore_NJ : 198 -> 198
[Noun Extractor] 198 nouns (19 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 46.57 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1724 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extrac

[Noun Extractor] checked compounds. discovered 9 compounds
[Noun Extractor] postprocessing detaching_features : 136 -> 134
[Noun Extractor] postprocessing ignore_features : 134 -> 124
[Noun Extractor] postprocessing ignore_NJ : 124 -> 124
[Noun Extractor] 124 nouns (9 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 41.22 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 3826 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=7434, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1447 words
[Noun Extractor] checked compounds. discovered 84 compounds
[Noun Extractor] postprocessing detaching_features : 532 -> 531
[Noun Extractor] postprocessing ignore_features : 531 -> 513
[Noun Extractor] postprocessing ignore_NJ : 513 -> 513
[Noun Extractor] 513 nouns (84 compounds) with min fr

[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=941, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 206 words
[Noun Extractor] checked compounds. discovered 12 compounds
[Noun Extractor] postprocessing detaching_features : 59 -> 58
[Noun Extractor] postprocessing ignore_features : 58 -> 52
[Noun Extractor] postprocessing ignore_NJ : 52 -> 52
[Noun Extractor] 52 nouns (12 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 45.38 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 853 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1327, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 273 words
[Noun Extractor] checked compounds. discovered 5 compounds
[Noun Extractor] postprocessing detaching_features : 66 -> 65
[N

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 36.20 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 4321 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=8200, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1695 words
[Noun Extractor] checked compounds. discovered 68 compounds
[Noun Extractor] postprocessing detaching_features : 454 -> 454
[Noun Extractor] postprocessing ignore_features : 454 -> 440
[Noun Extractor] postprocessing ignore_NJ : 440 -> 440
[Noun Extractor] 440 nouns (68 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 38.87 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1987 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extra

[Noun Extractor] checked compounds. discovered 8 compounds
[Noun Extractor] postprocessing detaching_features : 88 -> 88
[Noun Extractor] postprocessing ignore_features : 88 -> 87
[Noun Extractor] postprocessing ignore_NJ : 87 -> 87
[Noun Extractor] 87 nouns (8 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 40.06 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 720 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1267, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 180 words
[Noun Extractor] checked compounds. discovered 1 compounds
[Noun Extractor] postprocessing detaching_features : 47 -> 47
[Noun Extractor] postprocessing ignore_features : 47 -> 41
[Noun Extractor] postprocessing ignore_NJ : 41 -> 41
[Noun Extractor] 41 nouns (1 compounds) with min frequency=1
[Noun Ex

[EojeolCounter] n eojeol = 812 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1278, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 238 words
[Noun Extractor] checked compounds. discovered 3 compounds
[Noun Extractor] postprocessing detaching_features : 52 -> 52
[Noun Extractor] postprocessing ignore_features : 52 -> 50
[Noun Extractor] postprocessing ignore_NJ : 50 -> 50
[Noun Extractor] 50 nouns (3 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 34.82 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 3350 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=5381, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1034 words
[Noun Extractor] checked compounds. discover

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 50.44 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1083 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1695, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 330 words
[Noun Extractor] checked compounds. discovered 9 compounds
[Noun Extractor] postprocessing detaching_features : 99 -> 99
[Noun Extractor] postprocessing ignore_features : 99 -> 96
[Noun Extractor] postprocessing ignore_NJ : 96 -> 96
[Noun Extractor] 96 nouns (9 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 42.89 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1108 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has 

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 40.73 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 3106 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=5327, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1012 words
[Noun Extractor] checked compounds. discovered 24 compounds
[Noun Extractor] postprocessing detaching_features : 267 -> 267
[Noun Extractor] postprocessing ignore_features : 267 -> 255
[Noun Extractor] postprocessing ignore_NJ : 255 -> 255
[Noun Extractor] 255 nouns (24 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 39.93 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 682 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extrac

[Noun Extractor] checked compounds. discovered 97 compounds
[Noun Extractor] postprocessing detaching_features : 399 -> 377
[Noun Extractor] postprocessing ignore_features : 377 -> 355
[Noun Extractor] postprocessing ignore_NJ : 355 -> 354
[Noun Extractor] 354 nouns (97 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 48.89 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1850 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=3215, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 698 words
[Noun Extractor] checked compounds. discovered 8 compounds
[Noun Extractor] postprocessing detaching_features : 149 -> 149
[Noun Extractor] postprocessing ignore_features : 149 -> 140
[Noun Extractor] postprocessing ignore_NJ : 140 -> 140
[Noun Extractor] 140 nouns (8 compounds) with min fre

[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=7728, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1245 words
[Noun Extractor] checked compounds. discovered 86 compounds
[Noun Extractor] postprocessing detaching_features : 470 -> 453
[Noun Extractor] postprocessing ignore_features : 453 -> 435
[Noun Extractor] postprocessing ignore_NJ : 435 -> 434
[Noun Extractor] 434 nouns (86 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 46.49 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2399 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=3832, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 778 words
[Noun Extractor] checked compounds. discovered 42 compounds
[Noun Extractor] postprocessing detaching_features : 

[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 43.90 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 5164 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=8252, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 1807 words
[Noun Extractor] checked compounds. discovered 35 compounds
[Noun Extractor] postprocessing detaching_features : 474 -> 473
[Noun Extractor] postprocessing ignore_features : 473 -> 458
[Noun Extractor] postprocessing ignore_NJ : 458 -> 458
[Noun Extractor] 458 nouns (35 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 24.44 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 2522 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extra

[Noun Extractor] postprocessing ignore_features : 125 -> 116
[Noun Extractor] postprocessing ignore_NJ : 116 -> 116
[Noun Extractor] 116 nouns (17 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 42.79 % eojeols are covered
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 1133 from 1 sents. mem=1.093 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=1965, mem=1.093 Gb
[Noun Extractor] batch prediction was completed for 357 words
[Noun Extractor] checked compounds. discovered 15 compounds
[Noun Extractor] postprocessing detaching_features : 98 -> 98
[Noun Extractor] postprocessing ignore_features : 98 -> 96
[Noun Extractor] postprocessing ignore_NJ : 96 -> 96
[Noun Extractor] 96 nouns (15 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.093 Gb                    
[Noun Extractor] 44.94 % eojeols are covered


In [8]:
r_rat

defaultdict(<function __main__.Extracter.extract_r_rat.<locals>.<lambda>()>,
            {'신천지': {'rpprat': 0.22842105263157894,
              'rwsrat': 0.6547368421052632},
             '확진자': {'rpprat': 0.17532325224632916,
              'rwsrat': 0.7565198334429104},
             '알아보자': {'rpprat': 0.007107133456172677,
              'rwsrat': 0.9648153022725279},
             '문재앙': {'rpprat': 0.20966010733452595,
              'rwsrat': 0.7220035778175313},
             '요즘': {'rpprat': 0.05871949062610541,
              'rwsrat': 0.8906968517863459},
             '재업': {'rpprat': 0.009609224855861628,
              'rwsrat': 0.5675848814862268},
             '너네': {'rpprat': 0.1800162910670649,
              'rwsrat': 0.7303828400760249},
             '아이돌': {'rpprat': 0.15474947807933195,
              'rwsrat': 0.7210334029227558},
             '얘들아': {'rpprat': 0.004133654839820875,
              'rwsrat': 0.9724423010678608},
             '스압': {'rpprat': 0.00490402129746392,