In [1]:
import sys
sys.path.append('./soy/')
import soy


TRAIN_COHESION = True
TOKENIZE = True

corpus_fname = './data/sample_naver_movie/merged_comments.txt'
tokenized_corpus_fname = './data/sample_naver_movie/merged_comments_tokenized.txt'
cohesion_fname = './data/sample_naver_movie/navermovie_cohesion'
maxscore_fname = './data/sample_naver_movie/navermovie_maxscore.pkl'


TRAIN_WORD2VEC = True
word2vec_fname = './data/sample_naver_movie/movie_review_word2vec_model.pkl'


TRAIN_DOC2VEC = True
doc2vec_fname = './data/sample_naver_movie/movie_review_doc2vec_model.pkl'

id2movie_fname = './data/sample_naver_movie/navermovie_info_idx2moviename.pkl'
id2actor_fname = './data/sample_naver_movie/navermovie_info_idx2actorname.pkl'

In [2]:
def get_text(fname):
    with open(fname, encoding='utf-8') as f:
        docs = [doc.strip().split('\t') for doc in f]
    
    idx, texts, scores = zip(*docs)
    return idx, texts, scores

idx, docs, scores = get_text(corpus_fname)
docs[:5]

('명불허전',
 '왠지 고사 피의중간고사보다 재미가없을듯해요 만약보게된다면실망할듯',
 '티아라 사랑해 ㅜ',
 '황정음 윤시윤 지붕킥 인연 김수로 티아라지연 공부의신 인연 너무너무재미있어요',
 '기대 완전')

In [3]:
len(docs)

3280685

In [4]:
import pickle
from soy.nlp.extractors import CohesionProbability

cohesion = CohesionProbability()

if TRAIN_COHESION:
    cohesion.train(docs)
    cohesion.save(cohesion_fname)
    
    scores = cohesion.get_all_cohesion_probabilities()
    scores = {word:s[0] for word, s in scores.items() if s[2] >= 10 and s[0] > 0.001}
    with open(maxscore_fname, 'wb') as f:
        pickle.dump(scores, f)
        
else:
    cohesion.load(cohesion_fname)
    with open(maxscore_fname, 'rb') as f:
        scores = pickle.load(f)

In [5]:
from soy.nlp.tokenizer import MaxScoreTokenizer

tokenizer = MaxScoreTokenizer(scores=scores)

for doc in docs[:3]:
    print(doc)
    print(tokenizer.tokenize(doc), '\n')

if TOKENIZE:
    with open(tokenized_corpus_fname, 'w', encoding='utf-8') as f:
        for idx_, text, score in zip(idx, docs, scores):
            tokens = ' '.join(tokenizer.tokenize(text)).strip()
            f.write('%s\t%s\t%s\n' % (idx_, tokens, score))
            
    print('tokenizing was done')

명불허전
['명불허전'] 

왠지 고사 피의중간고사보다 재미가없을듯해요 만약보게된다면실망할듯
['왠지', '고사', '피의', '중간', '고사', '보다', '재미', '가', '없을', '듯해요', '만약', '보게', '된다', '면', '실망', '할듯'] 

티아라 사랑해 ㅜ
['티아라', '사랑', '해', 'ㅜ'] 

tokenizing was done


In [6]:
import os
    
class CommentWord2Vec:
    
    def __init__(self, fname):
        self.fname = fname
        if not os.path.isfile(fname):
            print('File not found: %s' % fname)
        
    def __iter__(self):
        with open(self.fname, encoding='utf-8') as f:
            for doc in f:
                movie_idx, text, score = doc.split('\t')
                yield text.split()
                
                
word2vec_corpus = CommentWord2Vec(tokenized_corpus_fname)

for num_doc, doc in enumerate(word2vec_corpus):
    if num_doc > 5: break
    print(doc)

['명불허전']
['왠지', '고사', '피의', '중간', '고사', '보다', '재미', '가', '없을', '듯해요', '만약', '보게', '된다', '면', '실망', '할듯']
['티아라', '사랑', '해', 'ㅜ']
['황정', '음', '윤시윤', '지붕킥', '인연', '김수', '로', '티아라', '지연', '공부', '의신', '인연', '너무', '너무', '재미', '있어', '요']
['기대', '완전']
['기대', '지연', '나온다']


## word2vec

Gensim의 Word2Vec을 이용합니다. 미리 만들어둔 word2vec_corpus를 Word2Vec의 argument로 입력합니다. default parameters를 이용하여 Word2Vec을 학습힙니다. 

Word2Vec의 arguments 중에서 중요한 것들은 아래와 같습니다. 

- size: 단어의 임베딩 공간의 크기
- alpha: learning rate
- window: 한 단어의 좌/우의 문맥 크기
- min_count: 모델이 학습할 단어의 최소 출현 빈도수
- max_vocab_size: None이 아닌 숫자를 입력하면 빈도수 기준으로 상위 max_vocab_size 개수만큼의 단어만 학습
- sg: 1이면 skipgram 이용
- negative: negative sampling에서 negative sample의 개수

In [7]:
word2vec_corpus

<__main__.CommentWord2Vec at 0x10e5c95c0>

In [8]:
from gensim.models import Word2Vec
import pickle

if TRAIN_WORD2VEC:
    word2vec_model = Word2Vec(word2vec_corpus)
    with open(word2vec_fname, 'wb') as f:
        pickle.dump(word2vec_model, f)
        
else:
    with open(word2vec_fname, 'rb') as f:
        word2vec_model = pickle.load(f)

학습된 Word2Vec모델의 .most_similar(단어, topn) 함수는 입력된 단어에 대하여 가장 비슷한 topn개의 다른 단어들과 유사도를 출력합니다. 

아래의 에제에서 '영화'와 가장 비슷한 단어는 '애니'이며, 유사도는 0.7270입니다

In [9]:
if TRAIN_WORD2VEC:
    word2vec_model = Word2Vec(word2vec_corpus)
    with open(word2vec_fname, 'wb') as f:
        pickle.dump(word2vec_model, f)
        
else:
    with open(word2vec_fname, 'rb') as f:
        word2vec_model = pickle.load(f)

In [10]:
word2vec_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x10e5be320>

In [11]:
word2vec_model.wv.most_similar("영화", topn=10)

[('작품', 0.6003116369247437),
 ('블록버스터', 0.5519367456436157),
 ('결과', 0.5387158393859863),
 ('장르', 0.5341578125953674),
 ('소재', 0.48169851303100586),
 ('나에게', 0.4721364378929138),
 ('시리즈', 0.46323785185813904),
 ('대작', 0.4631398916244507),
 ('건', 0.4575556516647339),
 ('주제', 0.4552241563796997)]

In [12]:
word2vec_model.most_similar('하정우', topn=10)

[('성동일', 0.7917931079864502),
 ('임달화', 0.7671147584915161),
 ('송강호', 0.7656506896018982),
 ('김윤석', 0.7547394037246704),
 ('이민기', 0.746239423751831),
 ('조윤희', 0.730881929397583),
 ('한예리', 0.712838888168335),
 ('김해숙', 0.705826461315155),
 ('유해진', 0.700925350189209),
 ('전지현', 0.6980483531951904)]