# 한글 형태소 분석기 - SOYNLP

In [5]:
pip install soynlp

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Display the installed soynlp version so the environment used for the
# results below is recorded in the notebook.
import soynlp
soynlp.__version__

'0.0.493'

In [9]:
# Download the sample news text from the soynlp tutorials folder into the
# working directory.
# `import urllib` alone does not load the `request` submodule, so
# `urllib.request` is only reachable if some other module happened to import
# it first; import the submodule explicitly so the cell works on a fresh kernel.
import urllib.request

urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt",
    filename="2016-10-20.txt",
)

('2016-10-20.txt', <http.client.HTTPMessage at 0x1884a7b8280>)

- SOYNLP에서 사용될 단어 만들기

In [18]:
# Split the training data into multiple documents
from soynlp import DoublespaceLineCorpus

# Reads the file downloaded in the previous cell.
corpus = DoublespaceLineCorpus("2016-10-20.txt")
len(corpus)  # number of documents


30091

In [19]:
# Compute (train) the word score table from the entire corpus
from soynlp.word import WordExtractor

# Default-configured extractor; train() consumes the whole corpus and the
# recorded log reports the memory it used.
word_extractor = WordExtractor()
word_extractor.train(corpus)
# extract() yields the score table; later cells iterate it with .items() and
# read the `cohesion_forward` attribute of each value.
word_score_table = word_extractor.extract()

training was done. used memory 0.760 Gb
all cohesion probabilities was computed. # words = 223348
all branching entropies was computed # words = 361598
all accessor variety was computed # words = 361598


In [20]:
from soynlp.tokenizer import LTokenizer

# Reduce each word's score entry to its forward cohesion value; this plain
# word -> number mapping is what the tokenizer is given as its scores.
scores = {word:score.cohesion_forward for word, score in word_score_table.items()}
l_tokenizer = LTokenizer(scores=scores)
# With flatten=False the recorded output is one (left, right) pair per
# space-separated token of the input sentence.
l_tokenizer.tokenize("국제사회와 우리의 노력들로 범죄를 척결하자", flatten=False)

[('국제사회', '와'), ('우리', '의'), ('노력', '들로'), ('범죄', '를'), ('척결', '하자')]

In [21]:
import os

import joblib

# The dump targets the relative folder 'data/'. On a fresh checkout that
# folder may not exist and the write would fail, so create it first
# (a no-op when it is already there).
os.makedirs('data', exist_ok=True)

# Persist the word -> forward-cohesion mapping for reuse outside this notebook.
joblib.dump(scores, 'data/scores.pkl')

['data/scores.pkl']

- 명사 추출

In [22]:
from soynlp.noun import LRNounExtractor_v2

# verbose=True makes the extractor print the "[Noun Extractor] ..." progress
# log recorded below this cell.
noun_extractor = LRNounExtractor_v2(verbose=True)
# Train on the corpus and extract nouns in a single call; the result is used
# as a mapping (noun -> indexable score record) in the next cell.
nouns = noun_extractor.train_extract(corpus)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 403896 from 30091 sents. mem=0.829 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=4434442, mem=1.461 Gb
[Noun Extractor] batch prediction was completed for 119705 words
[Noun Extractor] checked compounds. discovered 70639 compounds
[Noun Extractor] postprocessing detaching_features : 109312 -> 92205
[Noun Extractor] postprocessing ignore_features : 92205 -> 91999
[Noun Extractor] postprocessing ignore_NJ : 91999 -> 90643
[Noun Extractor] 90643 nouns (70639 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.581 Gb                    
[Noun Extractor] 76.63 % eojeols are covered


In [23]:
# Keep only nouns longer than one character, storing the first field of each
# extractor result as the value.
# NOTE(review): soynlp appears to document the v2 result as
# NounScore(frequency, score); if so, score[0] is the frequency rather than the
# noun score that the variable name suggests -- confirm which one is intended.
noun_scores = {noun:score[0] for noun, score in nouns.items() if len(noun) > 1}
# Persist the mapping under data/ (uses the `joblib` import from the earlier
# dump cell).
joblib.dump(noun_scores, 'data/noun_scores.pkl')

['data/noun_scores.pkl']