In [1]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec, Word2Vec
import os
import collections
import smart_open
import random
import subprocess as sp

In [2]:
def read_corpus(fname, sample=1, offset=1, delimiter='\t', drop=False, dropth=0.1):
    """Stream a ``tag<delimiter>sentence`` file as TaggedDocument objects.

    Args:
        fname: Path of the UTF-8 text file to read, one document per line.
        sample: Keep only lines whose ``(index + offset) % sample == 0``.
            Coerced to an int of at least 1 (1 keeps every line).
        offset: Shift applied to the zero-based line index before the
            modulo test. Coerced to an int of at least 0.
        delimiter: Separator between the tag and the sentence. Only the
            first occurrence is used, so the sentence itself may contain it.
        drop: When true, randomly skip lines before the sampling test.
        dropth: A line is skipped when ``random.random() <= dropth``.

    Yields:
        TaggedDocument whose words are the whitespace-split sentence and
        whose tags are the single-element list ``[tag]``.

    Lines that do not contain the delimiter are printed together with their
    index and skipped.
    """
    sample = max(int(sample), 1)
    offset = max(int(offset), 0)
    # Explicit encoding: the default depends on the platform locale.
    with open(fname, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # The random draw happens for every line, before the sampling test.
            if drop and random.random() <= dropth:
                continue

            if (i + offset) % sample != 0:
                continue

            try:
                # maxsplit=1 keeps sentences that contain the delimiter.
                tag, sent = line.split(delimiter, 1)
            except ValueError:
                # Malformed line (no delimiter): report it and move on.
                print(i, line)
                continue

            yield TaggedDocument(sent.strip().split(), [tag])

In [3]:
# Doc2Vec hyper-parameters, one per line so each can be tuned on its own.
model = Doc2Vec(
    size=50,       # appears as "d50" in the model summary printed further down
    window=10,     # "w10" in the summary
    negative=10,   # "n10" in the summary
    dm_concat=1,   # "dm/c" in the summary
    dbow_words=1,
    min_count=5,   # "mc5" in the summary
    sample=1e-5,   # "s1e-05" in the summary
    workers=6,     # "t6" in the summary
)

In [4]:
# Corpus file consumed by read_corpus (one "tag<TAB>sentence" record per line).
file_tokenized = 'dump/doc0909.csv'
# sample=1 with offset=0 keeps every line. read_corpus is a generator
# function, so train_gr can be iterated only once.
train_gr = read_corpus(file_tokenized, sample=1, offset=0)

In [5]:
# Build the vocabulary from the corpus stream; this consumes the one-shot
# train_gr generator.
model.build_vocab(train_gr)

In [6]:
# Show the model's summary string and the document count stored on the model.
print(model)
print(model.corpus_count)

Doc2Vec(dm/c,d50,n10,w10,mc5,s1e-05,t6)
12869583


In [26]:
%%time
for epoch in range(10):
    model.min_alpha = model.alpha
    tr = read_corpus(file_tokenized, sample=1, offset=0, drop=True, dropth=0.5)
    model.train(tr, total_examples=model.corpus_count, epochs=1)
    model.alpha *= 0.99

CPU times: user 1h 26min 17s, sys: 3min 39s, total: 1h 29min 56s
Wall time: 51min 21s


In [49]:
# Write the trained model to a file in the current working directory.
model.save('title2comment.model')

In [28]:
# Sanity-check the word vectors: print the three nearest neighbours of a few
# probe words. One loop replaces six copy-pasted calls.
probe_words = ['金城武', '肥宅', '妹妹', '豆導', '安安', '陳水扁']
for word in probe_words:
    # Query the word-vector store directly; calling most_similar on the model
    # object itself is a deprecated shortcut for the same lookup.
    print('@', model.wv.most_similar([word], topn=3))



@ [('吳彥祖', 0.8425476551055908), ('城武', 0.7958974838256836), ('彭于晏', 0.740955650806427)]
@ [('臭酸宅', 0.8624711632728577), ('臭宅', 0.8433394432067871), ('魯肥宅', 0.8200834393501282)]
@ [('姊姊', 0.8106493949890137), ('姐姐', 0.7968341112136841), ('弟弟', 0.7908051013946533)]
@ [('鈕承澤', 0.8874450922012329), ('麥可貝', 0.8463934659957886), ('豆導會', 0.8296818733215332)]
@ [('好久不見', 0.7861751317977905), ('午安', 0.7624630928039551), ('您好', 0.7485469579696655)]
@ [('阿扁', 0.8524560928344727), ('馬英九', 0.8288743495941162), ('謝長廷', 0.7842897176742554)]


In [48]:
# Infer a vector for an unseen, pre-tokenized query and show the two closest
# training documents together with their raw corpus lines.
# Alternative query: sent = '館長 和 朱雪璋 pk 誰 贏'
sent = '好吃 的 牛肉麵'
inferred_vector = model.infer_vector(sent.split(), steps=100)
most_similar = model.docvecs.most_similar([inferred_vector], topn=2)
for tag, score in most_similar:
    # Argument list with no shell: the tag is passed to grep verbatim and is
    # never interpreted by a shell. The pattern matches lines that start with
    # the tag followed by a tab.
    cmd = [
        'grep', '-P', '^{tag}\t'.format(tag=tag),
        '/var/local/marginalbear/dump/doc0909.csv',
    ]
    print(tag, score)
    print(sp.run(cmd, stdout=sp.PIPE).stdout.decode('utf-8').strip())


175124 0.9503573775291443
175124	老 甕 牛肉麵
175124	超 難吃
165029 0.8852249383926392
165029	剛剛 一碗 半 筋 半 肉牛 麵
165029	我 也 正在 牛肉麵
165029	我 也 吃 牛肉麵


In [41]:
# Two pre-tokenized phrasings of the same question, split into token lists.
sent1 = '異形 讓 豆導 來 導 會 怎樣 ？'.split()
sent2 = '豆導 拍 異形 會 怎樣 ？'.split()
# Similarity score between the two token lists; being the last expression, it
# is shown as the cell output.
model.docvecs.similarity_unseen_docs(model, sent1, sent2, steps=50)

0.97330552857231734