In [2]:
data_path = '../data/processed/processed_review_movieid.txt'
# Read the corpus inside a context manager so the file handle is closed
# deterministically instead of being left open for the garbage collector.
with open(data_path, 'r', encoding='utf8') as corpus_file:
    # Each element keeps its trailing newline, exactly as read from disk.
    corpus = list(corpus_file)
# Preview the first three raw lines.
corpus[:3]

['종합 평점은 4점 드립니다.␞92575\n',
 '원작이 칭송받는 이유는 웹툰 계 자체의 질적 저하가 심각하기 때문.  원작이나 영화나 별로인건 마찬가지.␞92575\n',
 '나름의  감동도 있고 안타까운 마음에 가슴도 먹먹  배우들의 연기가 good 김수현 최고~␞92575\n']

In [5]:
from konlpy.tag import Okt, Komoran, Hannanum, Kkma
from eunjeon import Mecab

def get_tokenizer(tokenizer_name):
    """Return a morphological analyzer instance for ``tokenizer_name``.

    Recognised names are ``'komoran'``, ``'okt'``, ``'mecab'``, ``'hannanum'``
    and ``'kkma'``. Any other value falls back to ``Mecab``.

    Only the requested analyzer is constructed, so a lookup no longer pays
    the start-up cost of all five analyzers on every call.

    Args:
        tokenizer_name: Name of the analyzer to build.

    Returns:
        A freshly constructed analyzer instance.
    """
    # Map names to classes (not instances): instantiation is deferred until
    # the requested class has been selected.
    tokenizer_classes = {
        'komoran': Komoran,
        'okt': Okt,
        'mecab': Mecab,
        'hannanum': Hannanum,
        'kkma': Kkma,
    }
    try:
        tokenizer_class = tokenizer_classes[tokenizer_name]
    except (KeyError, TypeError):
        # Unknown (KeyError) or unhashable (TypeError) name: keep the
        # original behaviour of falling back to Mecab.
        tokenizer_class = Mecab
    return tokenizer_class()

In [18]:
from gensim.models.doc2vec import TaggedDocument

class Doc2VecInput:
    """Iterable that streams a review file as ``TaggedDocument`` objects.

    Each usable line has the form ``<sentence><U+241E><movie id>``. The
    sentence is split into morphemes by the configured tokenizer and the
    document is tagged ``'MOVIE_<movie id>'``.

    The file is reopened on every ``__iter__`` call, so the object can be
    iterated more than once.
    """

    def __init__(self, fname, tokenizer_name='mecab'):
        """Store the corpus path and build the tokenizer.

        Args:
            fname: Path of the UTF-8 corpus file.
            tokenizer_name: Name passed to ``get_tokenizer``.
        """
        self.fname = fname
        self.tokenizer = get_tokenizer(tokenizer_name)

    def __iter__(self):
        """Yield one ``TaggedDocument`` per well-formed line.

        Lines that do not split into exactly two fields, or that the
        tokenizer fails on, are skipped (best effort).
        """
        with open(self.fname, encoding='utf-8') as f:
            for line in f:
                try:
                    sentence, movie_id = line.strip().split('\u241E')
                    tokens = self.tokenizer.morphs(sentence)
                except Exception:
                    # Skip malformed lines and tokenizer failures. Catching
                    # Exception (not a bare except) lets KeyboardInterrupt
                    # and GeneratorExit propagate.
                    continue
                # Yield outside the try block: a GeneratorExit raised here
                # when the consumer stops early must not be swallowed, or
                # Python raises "generator ignored GeneratorExit".
                yield TaggedDocument(words=tokens, tags=['MOVIE_%s' % movie_id])

In [19]:
from gensim.models import Doc2Vec

# Review corpus to train on, and the destination path for the trained model.
corpus_fname = '../data/processed/processed_review_movieid.txt'
output_fname = '../data/sentence-embeddings/doc2vec/doc2vec.model'
# Streaming view of the corpus using the default tokenizer ('mecab').
# NOTE(review): this rebinds `corpus`, which an earlier cell used for a list of raw lines.
corpus = Doc2VecInput(corpus_fname)

In [20]:
# Train on the streamed corpus. Per the gensim documentation, dm=1 selects the
# distributed-memory (PV-DM) algorithm and vector_size is the embedding dimension.
# NOTE(review): no seed is passed, so repeated runs may yield different vectors.
model = Doc2Vec(corpus, dm=1, vector_size=100)

In [23]:
# Persist the trained model to output_fname.
# NOTE(review): presumably the target directory must already exist — confirm.
model.save(output_fname)

In [24]:
# Document tags whose learned vectors are most similar to the tag 'MOVIE_92575'.
model.docvecs.most_similar('MOVIE_92575')

[('MOVIE_38921', 0.7221782207489014),
 ('MOVIE_61115', 0.7028977870941162),
 ('MOVIE_99752', 0.6768426895141602),
 ('MOVIE_50097', 0.6743828058242798),
 ('MOVIE_48238', 0.6741533279418945),
 ('MOVIE_11234', 0.6688286066055298),
 ('MOVIE_17156', 0.6674063205718994),
 ('MOVIE_101441', 0.6626647710800171),
 ('MOVIE_123082', 0.6596935987472534),
 ('MOVIE_120057', 0.6567031145095825)]

In [36]:
import requests
import random

class Doc2VecEvaluator:
    """Query a trained Doc2Vec model and resolve movie tags to titles.

    Document tags are expected to look like ``'MOVIE_<id>'``; titles are
    fetched over HTTP from movie.naver.com, one request per tag.
    """

    def __init__(self, model_fname="data/doc2vec.vecs", use_notebook=False):
        """Load the model and index its document tags.

        Args:
            model_fname: Path of a model saved with ``Doc2Vec.save``.
            use_notebook: Stored on the instance; not read by any method
                of this class.
        """
        self.model = Doc2Vec.load(model_fname)
        # Tag -> position in the model's doctag enumeration order.
        self.doc2idx = {el: idx for idx, el in enumerate(self.model.docvecs.doctags.keys())}
        self.use_notebook = use_notebook

    def most_similar(self, movie_id, topn=10):
        """Print the title and similarity score of the ``topn`` nearest movies.

        Args:
            movie_id: Bare movie id (without the ``'MOVIE_'`` prefix).
            topn: Number of neighbours to print.
        """
        similar_movies = self.model.docvecs.most_similar('MOVIE_' + str(movie_id), topn=topn)
        # Use a distinct loop name so the `movie_id` argument is not shadowed.
        for similar_tag, score in similar_movies:
            print(self.get_movie_title(similar_tag), score)

    def get_titles_in_corpus(self, n_sample=5):
        """Return ``{tag: title}`` for ``n_sample`` randomly chosen document tags.

        Raises:
            ValueError: If ``n_sample`` exceeds the number of tags
                (standard ``random.sample`` behaviour).
        """
        # random.sample needs a sequence: passing a dict keys view is
        # deprecated since Python 3.9 and raises TypeError on 3.11+.
        doc_tags = list(self.model.docvecs.doctags.keys())
        movie_ids = random.sample(doc_tags, n_sample)
        return {movie_id: self.get_movie_title(movie_id) for movie_id in movie_ids}

    def get_movie_title(self, movie_id):
        """Fetch the title for a ``'MOVIE_<id>'`` tag, or ``""`` if none is found.

        Network failures (including the timeout) propagate as ``requests``
        exceptions, as connection errors did before.
        """
        url = 'http://movie.naver.com/movie/point/af/list.nhn?st=mcode&target=after&sword=%s' % movie_id.split("_")[1]
        # A timeout prevents one unresponsive request from hanging the notebook.
        resp = requests.get(url, timeout=10)
        # NOTE(review): `html` is not imported in the visible cells; the
        # fromstring/xpath usage suggests `from lxml import html` — confirm
        # and add it to the import cell so a fresh kernel can run this.
        root = html.fromstring(resp.text)
        try:
            title = root.xpath('//div[@class="choice_movie_info"]//h5//a/text()')[0]
        except IndexError:
            # The XPath matched nothing: the page carries no title for this id.
            title = ""
        return title

In [37]:
# Build the evaluator from the model file written to output_fname.
model_eval = Doc2VecEvaluator(output_fname)

In [40]:
# Sample three random document tags from the model and look up their titles.
model_eval.get_titles_in_corpus(n_sample=3)

{'MOVIE_13962': '특공외인부대',
 'MOVIE_74522': '소와 함께 여행하는 법',
 'MOVIE_134207': '아메리칸 섹스파티'}

In [42]:
# Print titles and similarity scores of the 5 movies nearest to movie 13962.
model_eval.most_similar(13962, topn=5)

이야기 도시 0.9439024925231934
유닛 7 0.9423443675041199
바람아 말하라 0.940601110458374
7층 0.940226674079895
하늘 나라 엄마 별이 0.9396483302116394
