In [1]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec, Word2Vec
import os
import collections
import smart_open
import random
import subprocess as sp

In [2]:
def read_corpus(fname, sample=1, offset=1, delimiter='\t', drop=False, dropth=0.1,
                encoding='utf-8'):
    """Lazily yield one ``TaggedDocument`` per selected line of a delimited text file.

    Each line is expected to look like ``<tag><delimiter><whitespace-separated tokens>``.
    Only the FIRST delimiter separates the tag from the sentence, so a sentence
    that itself contains the delimiter is kept instead of being rejected.

    Args:
        fname: Path of the corpus file.
        sample: Keep only lines whose ``(line_index + offset)`` is a multiple of
            this value. Coerced with ``int()`` and clamped to at least 1
            (1 keeps every line).
        offset: Shift added to the 0-based line index before the ``sample``
            test. Coerced with ``int()`` and clamped to at least 0.
        delimiter: Separator between the tag and the sentence.
        drop: When true, randomly skip lines before the ``sample`` test.
        dropth: A line is skipped when ``random.random() <= dropth``
            (only consulted if ``drop`` is true).
        encoding: Text encoding used to open ``fname``. Explicit so the result
            does not depend on the platform's locale default.

    Yields:
        ``TaggedDocument(tokens, [tag])`` where ``tokens`` is the stripped
        sentence split on whitespace.

    Lines that do not contain the delimiter are printed together with their
    index and skipped.
    """
    sample = max(int(sample), 1)
    offset = max(int(offset), 0)
    with open(fname, 'r', encoding=encoding) as f:
        for i, line in enumerate(f):
            # Random dropout happens before the deterministic sub-sampling.
            if drop and random.random() <= dropth:
                continue

            if (i + offset) % sample != 0:
                continue

            try:
                # maxsplit=1: everything after the first delimiter is the sentence.
                tag, sent = line.split(delimiter, 1)
            except ValueError:
                # Malformed line (no delimiter): report it and move on.
                print(i, line)
                continue

            # Yield outside the try block so that errors raised by the consumer
            # (or thrown into the generator) are never swallowed here.
            yield TaggedDocument(sent.strip().split(), [tag])

In [3]:
# Build a fresh, untrained Doc2Vec model. Keyword names follow the gensim
# release this notebook was run with (it uses `size`, not a newer spelling).
# NOTE(review): parameter meanings below are per the gensim docs — confirm
# against the installed version.
model = Doc2Vec(
    size=50,        # vector dimensionality
    window=10,      # context window
    negative=10,    # negative-sampling count
    dm_concat=1,    # concatenate context vectors
    dbow_words=1,   # word-vector training flag for DBOW
    min_count=5,    # ignore words rarer than this
    sample=1e-5,    # down-sampling threshold for frequent words
    workers=6,      # worker threads
)

In [5]:
# Path of the corpus file, relative to the notebook's working directory.
# NOTE(review): the lookup cell further down greps
# /var/local/marginalbear/dump/doc0909.csv — presumably the same file; confirm.
file_tokenized = 'dump/doc0909.csv'
# read_corpus is a generator function, so train_gr can be iterated only once.
# sample=1, offset=0 selects every line; random dropping is left off (default).
train_gr = read_corpus(file_tokenized, sample=1, offset=0)

In [5]:
# Build the model vocabulary from the corpus stream. train_gr is a one-shot
# generator, so it is exhausted after this call and must be re-created with
# read_corpus before it can be iterated again.
model.build_vocab(train_gr)

In [6]:
# Show the model summary and the corpus_count value that the training cell
# later passes to train() as total_examples.
print(model)
print(model.corpus_count)

Doc2Vec(dm/c,d50,n10,w10,mc5,s1e-05,t6)
12869583


In [3]:
# Replace the in-memory model with one loaded from disk.
# NOTE(review): this absolute path differs from the relative
# 'title2comment.model' that the save cell writes — confirm which file is the
# intended checkpoint. This cell also rebinds `model`, discarding whatever the
# constructor/build_vocab cells produced.
model = Doc2Vec.load('/var/local/marginalbear/doc2vec/title2comment.model')

In [6]:
%%time
# Ten training passes with a manually managed learning rate.
for epoch in range(10):
    # Pin min_alpha to the current alpha before each pass — presumably so the
    # rate stays constant within the pass; confirm against the gensim version.
    model.min_alpha = model.alpha
    # A new generator is required every pass because read_corpus is one-shot.
    # drop=True, dropth=0.5: each line is skipped when random.random() <= 0.5,
    # so every pass sees a different random subset of the corpus.
    tr = read_corpus(file_tokenized, sample=1, offset=0, drop=True, dropth=0.5)
    # NOTE(review): total_examples is the full corpus_count although the
    # generator above skips lines at random — confirm the mismatch is acceptable.
    model.train(tr, total_examples=model.corpus_count, epochs=1)
    # Decay the learning rate by 1% for the next pass.
    model.alpha *= 0.99

CPU times: user 1h 20min 33s, sys: 5min 42s, total: 1h 26min 15s
Wall time: 48min 37s


In [61]:
# Persist the trained model to 'title2comment.model', relative to the
# notebook's working directory.
# NOTE(review): the load cell reads
# /var/local/marginalbear/doc2vec/title2comment.model — confirm both refer to
# the same file.
model.save('title2comment.model')

In [64]:
# Qualitative sanity check: for each probe word, print its three nearest
# neighbours as returned by model.most_similar, each line prefixed with '@'.
for probe in ('金城武', '肥宅', '妹妹', '豆導', '安安', '陳水扁'):
    print('@', model.most_similar([probe], topn=3))



@ [('吳彥祖', 0.8209210634231567), ('郭富城', 0.8141673803329468), ('城武', 0.7772936224937439)]
@ [('宅宅', 0.7863831520080566), ('魯宅', 0.7454927563667297), ('臭宅', 0.7242928147315979)]
@ [('姐姐', 0.8099322319030762), ('弟弟', 0.8085435032844543), ('姊姊', 0.7342876195907593)]
@ [('鈕承澤', 0.8596559762954712), ('豆導會', 0.8199543356895447), ('麥可貝', 0.8190062642097473)]
@ [('好久不見', 0.8013579249382019), ('您好', 0.7441970705986023), ('午安', 0.7305626273155212)]
@ [('馬英九', 0.7975667119026184), ('阿扁', 0.7783585786819458), ('李登輝', 0.7701988816261292)]


In [63]:
# Query sentence: pre-tokenized, tokens separated by spaces.
# `sent` must be assigned in this cell; otherwise the cell raises NameError
# on a fresh kernel. Alternative example query:
# sent = '館長 和 朱雪璋 pk 誰 贏'
sent = '姐姐 我 喜歡 你'

# Infer a vector for the unseen sentence (100 inference steps), then fetch the
# two most similar training-document tags.
inferred_vector = model.infer_vector(sent.split(), steps=100)
most_similar = model.docvecs.most_similar([inferred_vector], topn=2)
for tag, score in most_similar:
    # Print every corpus line whose first column equals this tag.
    # The command is an argument list run without a shell, so the tag value is
    # never interpreted by the shell. '\t' is a literal tab in the pattern.
    # NOTE(review): the tag is still interpreted as a regex by grep -P; tags
    # containing regex metacharacters would need escaping.
    cmd = ['grep', '-P', '^{tag}\t'.format(tag=tag),
           '/var/local/marginalbear/dump/doc0909.csv']
    print(tag, score)
    print(sp.run(cmd, stdout=sp.PIPE).stdout.decode('utf-8').strip())


481100 0.7081737518310547
481100	哪部
481100	你 沒有 姐姐 懂
481100	誰 喇
481100	咪 智障
481100	有沒有 醬 八卦
158913 0.7035507559776306
158913	人渣 到處 劈 腿 滾
158913	姐姐 姐姐 姐姐 姐姐
158913	妹妹 qq
158913	這 一 回合


In [62]:
# Two pre-tokenized sentences (split on whitespace into token lists),
# presumably two phrasings of the same question.
sent1 = '異形 讓 豆導 來 導 會 怎樣 ？'.split()
sent2 = '豆導 拍 異形 會 怎樣 ？'.split()
# Similarity between the two unseen token lists, with steps=50 passed for
# inference. Left as the bare last expression so the notebook displays the
# value as the cell output.
model.docvecs.similarity_unseen_docs(model, sent1, sent2, steps=50)

0.98728217607809743