### KR-WordRank

https://lovit.github.io/nlp/2018/04/16/krwordrank/

In [None]:
!pip install krwordrank

In [None]:
!pip install kiwipiepy

In [None]:
import pandas as pd

# Load the article table from the CSV file in the working directory.
naver_df = pd.read_csv(filepath_or_buffer='naver.csv')

# Show the first five rows (the default preview size) as the cell output.
naver_df.head(n=5)

In [None]:
from kiwipiepy import Kiwi

# Analyzer shared by every call that does not pass its own; created lazily.
_SHARED_KIWI = None


def _get_shared_kiwi():
    """Return the shared Kiwi analyzer, creating it on first use."""
    global _SHARED_KIWI
    if _SHARED_KIWI is None:
        _SHARED_KIWI = Kiwi()
    return _SHARED_KIWI


def text2sentences_kiwi(text, kiwi=None):
    """Split *text* into sentences with Kiwi and return them as strings.

    Args:
        text: The text to split into sentences.
        kiwi: Optional analyzer object exposing ``split_into_sents``. When
            omitted, one shared ``Kiwi()`` instance is created on the first
            call and reused afterwards, so the analyzer is not rebuilt for
            every article processed in a loop.

    Returns:
        A list holding the ``.text`` of each sentence returned by
        ``split_into_sents``, in the order they were returned.
    """
    if kiwi is None:
        kiwi = _get_shared_kiwi()
    return [sentence.text for sentence in kiwi.split_into_sents(text)]

In [None]:
from krwordrank.hangle import normalize
from krwordrank.word import KRWordRank

# Per-article results: one (possibly empty) list is appended to each of these
# for every row visited, so both stay aligned with the iteration order.
words = []
ranks = []

# Settings passed to every extract() call.
beta = 0.85    # decaying factor beta of PageRank
max_iter = 10

for i, row in naver_df.iterrows():

    # Split the article into sentences, then normalize each sentence.
    sentences = [
        normalize(sentence, english=True, number=True)
        for sentence in text2sentences_kiwi(row['Article'])
    ]

    wordrank_extractor = KRWordRank(
        min_count = 5,    # minimum occurrence count of a word (when building the graph)
        max_length = 10,  # maximum word length
        verbose = True
    )

    # These stay empty when extraction fails for this article.
    temp_words = []
    temp_ranks = []

    try:
        keywords, rank, graph = wordrank_extractor.extract(sentences, beta, max_iter)
    except ValueError:
        # The printed message attributes the ValueError to the article having
        # no repeated words; the article is recorded with empty results.
        print('idx: {}번째 기사는 반복되는 단어가 없습니다.\n'.format(i))
    else:
        print('idx - ', i)
        # Keep only the five highest-scoring keywords.
        top_five = sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:5]
        for word, r in top_five:
            print('%8s:\t%.4f' % (word, r))
        temp_words.extend(keyword for keyword, _ in top_five)
        temp_ranks.extend(score for _, score in top_five)
        print('')

    words.append(temp_words)
    ranks.append(temp_ranks)

In [None]:
import operator

# Count in how many top-keyword slots each keyword appears across all articles.
# `words` is the list of per-article keyword lists; the loop variable has its
# own name so that `words` itself is not overwritten by the iteration.
count_dict = {}

for article_words in words:
    for word in article_words:
        count_dict[word] = count_dict.get(word, 0) + 1

# (keyword, count) pairs ordered from most to least frequent.
sorted_words = sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
import pickle

# Serialize `words` to words.pickle; the file is opened for binary writing.
with open('words.pickle', mode='wb') as f1:
    pickle.dump(words, file=f1)

In [None]:
# Serialize `ranks` to ranks.pickle; the file is opened for binary writing.
with open('ranks.pickle', mode='wb') as f2:
    pickle.dump(ranks, file=f2)