In [1]:
import sys
sys.path.append('./soy/')
import soy


TRAIN_COHESION = True
TOKENIZE = True

corpus_fname = './data/sample_naver_movie/merged_comments.txt'
tokenized_corpus_fname = './data/sample_naver_movie/merged_comments_tokenized.txt'
cohesion_fname = './data/sample_naver_movie/navermovie_cohesion'
maxscore_fname = './data/sample_naver_movie/navermovie_maxscore.pkl'


TRAIN_DOC2VEC = True
doc2vec_fname = './data/sample_naver_movie/movie_review_doc2vec_model.pkl'

id2movie_fname = './data/sample_naver_movie/navermovie_info_idx2moviename.pkl'
id2actor_fname = './data/sample_naver_movie/navermovie_info_idx2actorname.pkl'

## Doc2vec

Doc2Vec을 학습하기 위해서는 각각 문서의 label이 저장되어야 합니다. 이를 위하여 TaggedDocument라는 클래스가 이용됩니다. TaggedDocument는 단어들을 words에, 레이블 정보를 tags에 리스트 형태로 입력합니다

In [2]:
import os
import pickle
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


class CommentDoc2Vec:
    
    def __init__(self, fname):
        self.fname = fname
        if not os.path.isfile(fname):
            print('File not found: %s' % fname)
        
    def __iter__(self):
        with open(self.fname, encoding='utf-8') as f:
            for doc in f:
                movie_idx, text, score = doc.split('\t')
                yield TaggedDocument(words=text.split(), tags=['MOVIE_%s' % movie_idx])

                
doc2vec_corpus = CommentDoc2Vec(tokenized_corpus_fname)


if TRAIN_DOC2VEC:
    doc2vec_model = Doc2Vec(doc2vec_corpus)
    with open(doc2vec_fname, 'wb') as f:
        pickle.dump(doc2vec_model, f)

else:
    with open(doc2vec_fname, 'rb') as f:
        doc2vec_model = pickle.load(f)

In [3]:
doc2vec_model.most_similar('영화', topn=10)

[('작품', 0.5626058578491211),
 ('블록버스터', 0.5355560183525085),
 ('명화', 0.5216415524482727),
 ('수작', 0.5114963054656982),
 ('블럭버스터', 0.5023318529129028),
 ('영환', 0.4961312413215637),
 ('대작', 0.4729803502559662),
 ('엉화', 0.4661056399345398),
 ('사례', 0.45896366238594055),
 ('재료를', 0.44881749153137207)]

In [4]:
len(doc2vec_model.docvecs)

14

In [5]:
for idx, doctag in sorted(doc2vec_model.docvecs.doctags.items(), key=lambda x:x[1].offset):
    print(idx, doctag)

MOVIE_72523 Doctag(offset=0, word_count=95807, doc_count=10187)
MOVIE_59845 Doctag(offset=1, word_count=145728, doc_count=13095)
MOVIE_109753 Doctag(offset=2, word_count=203777, doc_count=10361)
MOVIE_45321 Doctag(offset=3, word_count=296181, doc_count=26915)
MOVIE_45290 Doctag(offset=4, word_count=780093, doc_count=48273)
MOVIE_47385 Doctag(offset=5, word_count=415014, doc_count=41653)
MOVIE_78726 Doctag(offset=6, word_count=327881, doc_count=20615)
MOVIE_89218 Doctag(offset=7, word_count=390257, doc_count=20080)
MOVIE_47528 Doctag(offset=8, word_count=140321, doc_count=13397)
MOVIE_75397 Doctag(offset=9, word_count=237917, doc_count=16444)
MOVIE_48246 Doctag(offset=10, word_count=237510, doc_count=21912)
MOVIE_140695 Doctag(offset=11, word_count=320033, doc_count=23998)
MOVIE_72054 Doctag(offset=12, word_count=380546, doc_count=25748)
MOVIE_102817 Doctag(offset=13, word_count=26550, doc_count=1864)


In [6]:
doc2vec_model.docvecs.most_similar('MOVIE_59845')

[('MOVIE_140695', 0.33166900277137756),
 ('MOVIE_48246', 0.3011544346809387),
 ('MOVIE_89218', 0.29501283168792725),
 ('MOVIE_78726', 0.2935372292995453),
 ('MOVIE_109753', 0.29165226221084595),
 ('MOVIE_102817', 0.25933074951171875),
 ('MOVIE_75397', 0.22235943377017975),
 ('MOVIE_45321', 0.2216598093509674),
 ('MOVIE_45290', 0.2202378362417221),
 ('MOVIE_72054', 0.1946040391921997)]

In [7]:
doc2vec_model.docvecs.most_similar(1)

[('MOVIE_140695', 0.33166900277137756),
 ('MOVIE_48246', 0.3011544346809387),
 ('MOVIE_89218', 0.29501283168792725),
 ('MOVIE_78726', 0.2935372292995453),
 ('MOVIE_109753', 0.29165226221084595),
 ('MOVIE_102817', 0.25933074951171875),
 ('MOVIE_75397', 0.22235943377017975),
 ('MOVIE_45321', 0.2216598093509674),
 ('MOVIE_45290', 0.2202378362417221),
 ('MOVIE_72054', 0.1946040391921997)]

In [8]:
list(doc2vec_model.docvecs.doctags)

['MOVIE_78726',
 'MOVIE_47385',
 'MOVIE_75397',
 'MOVIE_72054',
 'MOVIE_48246',
 'MOVIE_45290',
 'MOVIE_109753',
 'MOVIE_72523',
 'MOVIE_47528',
 'MOVIE_89218',
 'MOVIE_102817',
 'MOVIE_59845',
 'MOVIE_140695',
 'MOVIE_45321']

In [9]:
list(doc2vec_model.docvecs.doctags.items())[:5]

[('MOVIE_78726', Doctag(offset=6, word_count=327881, doc_count=20615)),
 ('MOVIE_47385', Doctag(offset=5, word_count=415014, doc_count=41653)),
 ('MOVIE_75397', Doctag(offset=9, word_count=237917, doc_count=16444)),
 ('MOVIE_72054', Doctag(offset=12, word_count=380546, doc_count=25748)),
 ('MOVIE_48246', Doctag(offset=10, word_count=237510, doc_count=21912))]

### Doc2Vec 해석하기

In [10]:
import pickle

with open(id2movie_fname, 'rb') as f:
    idx2movie = pickle.load(f)
    
movie2idx = {name:movie_idx for movie_idx, name in idx2movie.items()}

In [11]:
movie2idx

{'측정하다': '43660',
 '후퇴': '27313',
 '정화': '121154',
 '할머니의 시계': '137600',
 '이브와 파이어 호스': '50036',
 '가상성': '87455',
 '메가포스군단': '23977',
 '가쵸 딜로': '27452',
 '다라비 다이어리': '104785',
 '엘레나와 남자들': '94843',
 '블랙메일링 산타': '77028',
 '더 밸리': '113708',
 '드그래시 하이': '76184',
 '극장판 헌터X헌터 : 더 라스트 미션': '119985',
 '톰 쿨': '109495',
 '핸드메이드 위드 러브 프롬 프랑스': '130480',
 '가장 위험한 해': '10552',
 '헤일로 레전드': '80645',
 '하우 투 스타트 유어 오운 컨트리': '78633',
 '모모큔 소드': '127717',
 '그랜드파스 매지컬 토이즈': '120724',
 '기동타격대': '14555',
 '룬': '80199',
 '사랑의 계절': '21339',
 '백전보산하': '73983',
 '메뉴팩쳐링 스토크': '94965',
 '시선의 기억': '99368',
 '천상의 시계': '30880',
 '당신이 고용주라면 시각장애인을 고용하시겠습니까?': '53572',
 '롱 스토리 쇼트': '85273',
 '요나 - 베기테일 무비': '36026',
 '하트 앤 본즈': '87055',
 '해탄적일천': '16064',
 '세여자': '89351',
 '신부일기': '21758',
 '자니 이거': '43244',
 '어느 사진가의 기억': '130032',
 '나폴리, 나폴리, 나폴리': '70625',
 '더 퍼플 어니언': '121687',
 '안나의 랑데부': '29473',
 '데드 맨': '19100',
 '고잉 다운 인 라-라 랜드': '93709',
 '걸즈 걸즈 걸즈': '82007',
 '사뿐히 밟으소서': '54669',
 '축가': '95890',
 '나무소년': '

In [12]:
doc2vec_model.docvecs.doctags

{'MOVIE_102817': Doctag(offset=13, word_count=26550, doc_count=1864),
 'MOVIE_109753': Doctag(offset=2, word_count=203777, doc_count=10361),
 'MOVIE_140695': Doctag(offset=11, word_count=320033, doc_count=23998),
 'MOVIE_45290': Doctag(offset=4, word_count=780093, doc_count=48273),
 'MOVIE_45321': Doctag(offset=3, word_count=296181, doc_count=26915),
 'MOVIE_47385': Doctag(offset=5, word_count=415014, doc_count=41653),
 'MOVIE_47528': Doctag(offset=8, word_count=140321, doc_count=13397),
 'MOVIE_48246': Doctag(offset=10, word_count=237510, doc_count=21912),
 'MOVIE_59845': Doctag(offset=1, word_count=145728, doc_count=13095),
 'MOVIE_72054': Doctag(offset=12, word_count=380546, doc_count=25748),
 'MOVIE_72523': Doctag(offset=0, word_count=95807, doc_count=10187),
 'MOVIE_75397': Doctag(offset=9, word_count=237917, doc_count=16444),
 'MOVIE_78726': Doctag(offset=6, word_count=327881, doc_count=20615),
 'MOVIE_89218': Doctag(offset=7, word_count=390257, doc_count=20080)}

In [13]:
for similar in doc2vec_model.docvecs.most_similar('MOVIE_45290'):
    print(similar)

('MOVIE_72054', 0.5183938145637512)
('MOVIE_47528', 0.38170790672302246)
('MOVIE_89218', 0.2922521233558655)
('MOVIE_75397', 0.259696364402771)
('MOVIE_109753', 0.24901366233825684)
('MOVIE_59845', 0.2202378213405609)
('MOVIE_48246', 0.18558764457702637)
('MOVIE_140695', 0.18136835098266602)
('MOVIE_47385', 0.11665219813585281)
('MOVIE_72523', 0.11536259204149246)


In [14]:
for doctag in doc2vec_model.docvecs.doctags:
    movie_idx = doctag.split('_')[1]
    name = idx2movie.get(movie_idx, -1)
    print(movie_idx, '\t', name)

78726 	 도둑들
47385 	 국가대표
75397 	 미스터 고
72054 	 다크 나이트 라이즈
48246 	 7광구
45290 	 인터스텔라
109753 	 해무
72523 	 고사 두 번째 이야기: 교생실습
47528 	 해리 포터와 죽음의 성물 - 2부
89218 	 베를린
102817 	 해적: 바다로 간 산적
59845 	 박쥐
140695 	 럭키
45321 	 해운대


In [15]:
def id2name(similar):
    idx = similar[0].split('_')[1]
    return (idx2movie.get(idx, 'unknown'), idx, similar[1])


print('인터스텔라\n')

for similar in doc2vec_model.docvecs.most_similar('MOVIE_45290'):
    print(id2name(similar))

인터스텔라

('다크 나이트 라이즈', '72054', 0.5183938145637512)
('해리 포터와 죽음의 성물 - 2부', '47528', 0.38170790672302246)
('베를린', '89218', 0.2922521233558655)
('미스터 고', '75397', 0.259696364402771)
('해무', '109753', 0.24901366233825684)
('박쥐', '59845', 0.2202378213405609)
('7광구', '48246', 0.18558764457702637)
('럭키', '140695', 0.18136835098266602)
('국가대표', '47385', 0.11665219813585281)
('고사 두 번째 이야기: 교생실습', '72523', 0.11536259204149246)


In [16]:
print('국가대표\n')
for similar in doc2vec_model.docvecs.most_similar('MOVIE_47385'):
    print(id2name(similar))

국가대표

('해운대', '45321', 0.7804709672927856)
('미스터 고', '75397', 0.5387098789215088)
('7광구', '48246', 0.3739200830459595)
('해적: 바다로 간 산적', '102817', 0.37021905183792114)
('고사 두 번째 이야기: 교생실습', '72523', 0.32833945751190186)
('해리 포터와 죽음의 성물 - 2부', '47528', 0.2908831238746643)
('럭키', '140695', 0.2793704867362976)
('베를린', '89218', 0.20493870973587036)
('해무', '109753', 0.17311882972717285)
('박쥐', '59845', 0.16962012648582458)


## t-SNE

In [17]:
doc2vec_model.docvecs.doctag_syn0.shape, type(doc2vec_model.docvecs.doctag_syn0)

((14, 100), numpy.ndarray)

In [21]:
index2tag = [doc2vec_model.docvecs.index_to_doctag(i) for i in range(172)]
print(index2tag[:5])

['MOVIE_72523', 'MOVIE_59845', 'MOVIE_109753', 'MOVIE_45321', 'MOVIE_45290']


In [23]:
tag2index = {tag:index for index, tag in enumerate(index2tag)}
tag2index

{14: 14,
 15: 15,
 16: 16,
 17: 17,
 18: 18,
 19: 19,
 20: 20,
 21: 21,
 'MOVIE_102817': 13,
 23: 23,
 24: 24,
 25: 25,
 26: 26,
 27: 27,
 28: 28,
 29: 29,
 30: 30,
 31: 31,
 32: 32,
 33: 33,
 34: 34,
 35: 35,
 36: 36,
 37: 37,
 38: 38,
 39: 39,
 40: 40,
 41: 41,
 42: 42,
 43: 43,
 44: 44,
 45: 45,
 46: 46,
 47: 47,
 48: 48,
 49: 49,
 50: 50,
 51: 51,
 52: 52,
 53: 53,
 54: 54,
 'MOVIE_59845': 1,
 56: 56,
 57: 57,
 58: 58,
 59: 59,
 60: 60,
 61: 61,
 62: 62,
 63: 63,
 64: 64,
 65: 65,
 66: 66,
 67: 67,
 68: 68,
 69: 69,
 70: 70,
 71: 71,
 72: 72,
 73: 73,
 74: 74,
 75: 75,
 76: 76,
 77: 77,
 78: 78,
 79: 79,
 80: 80,
 81: 81,
 82: 82,
 83: 83,
 84: 84,
 85: 85,
 86: 86,
 87: 87,
 88: 88,
 89: 89,
 90: 90,
 91: 91,
 92: 92,
 93: 93,
 94: 94,
 95: 95,
 96: 96,
 97: 97,
 98: 98,
 99: 99,
 100: 100,
 101: 101,
 'MOVIE_72054': 12,
 'MOVIE_140695': 11,
 104: 104,
 105: 105,
 106: 106,
 103: 103,
 108: 108,
 109: 109,
 110: 110,
 111: 111,
 112: 112,
 113: 113,
 114: 114,
 115: 115,
 116: 116

In [22]:
index2name = [idx2movie.get(tag.split('_')[1], -1) for tag in index2tag]
print(index2name[:5])

AttributeError: 'int' object has no attribute 'split'

In [None]:
%%time 

from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)
y_tsne = tsne.fit_transform(doc2vec_model.docvecs.doctag_syn0)

y_tsne.shape

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

list(matplotlib.font_manager.get_fontconfig_fonts())[:5]

krfont = {'family' : 'nanumgothic', 'weight' : 'bold', 'size'   : 10}
matplotlib.rc('font', **krfont)

In [None]:
plt.figure(figsize=(15,10))
plt.scatter(y_tsne[:,0], y_tsne[:,1], color='yellow')

similars = [tag for tag, _ in doc2vec_model.docvecs.most_similar('MOVIE_134963')] + ['MOVIE_134963']

for tag in similars:
    index = tag2index.get(tag, -1)
    if index == -1:
        continue
    name = index2name[index]
    plt.annotate(name, (y_tsne[index, 0], y_tsne[index, 1]))

plt.show()

In [None]:
plt.figure(figsize=(15,10))
plt.scatter(y_tsne[:,0], y_tsne[:,1], color='yellow')

similars = [tag for tag, _ in doc2vec_model.docvecs.most_similar('MOVIE_93728')] + ['MOVIE_93728']

for tag in similars:
    index = tag2index.get(tag, -1)
    if index == -1:
        continue
    name = index2name[index]
    plt.annotate(name, (y_tsne[index, 0], y_tsne[index, 1]))

plt.show()