In [1]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec, Word2Vec
import os
import collections
import smart_open
import random
import subprocess as sp

In [2]:
def read_corpus(fname, sample=1, offset=1, delimiter='\t', drop=False, dropth=0.1,
                encoding='utf-8'):
    """Lazily yield one ``TaggedDocument`` per selected line of a corpus file.

    Each line is expected to look like ``<tag><delimiter><sentence>``; the
    sentence is split on whitespace into tokens and the tag becomes the
    document's single tag.

    Args:
        fname: Path of the corpus file to read.
        sample: Keep only lines whose ``(index + offset) % sample == 0``.
            Coerced to an int and clamped to at least 1 (1 keeps every line).
        offset: Shift applied to the 0-based line index before the modulo
            test. Coerced to an int and clamped to at least 0.
        delimiter: Separator between the tag and the sentence.
        drop: When true, randomly skip lines before the ``sample`` test.
        dropth: A line is skipped when ``random.random() <= dropth``
            (only consulted when ``drop`` is true).
        encoding: Text encoding used to open the file. Set explicitly so the
            result does not depend on the platform's locale default.

    Yields:
        TaggedDocument: ``(tokens, [tag])`` for every selected, well-formed line.

    Lines that do not contain ``delimiter`` are printed together with their
    line index and skipped.
    """
    sample = max(int(sample), 1)
    offset = max(int(offset), 0)
    with open(fname, 'r', encoding=encoding) as f:
        for i, line in enumerate(f):
            # The random draw happens for every line (before the sampling
            # test), so the drop decision is independent of `sample`/`offset`.
            if drop and random.random() <= dropth:
                continue

            if (i + offset) % sample != 0:
                continue

            try:
                # Split only on the first delimiter: a sentence that happens
                # to contain the delimiter again is still a valid record and
                # is tokenized on whitespace below anyway.
                tag, sent = line.split(delimiter, 1)
            except ValueError:
                # No delimiter on this line: report it and move on.
                print(i, line)
                continue

            # Yield outside the try block so that exceptions raised by the
            # consumer (or thrown into the generator) are never swallowed.
            yield TaggedDocument(sent.strip().split(), [tag])

In [3]:
# Untrained Doc2Vec model; the vocabulary is built and training is run in later cells.
# One hyper-parameter per line; the codes in brackets are how each value shows up
# in the printed model repr further down (e.g. "d50").
model = Doc2Vec(
    size=50,        # [d50]
    window=10,      # [w10]
    min_count=5,    # [mc5]
    sample=1e-5,    # [s1e-05]
    negative=10,    # [n10]
    dm_concat=1,    # repr reports the mode as "dm/c"
    # NOTE(review): dbow_words is documented by gensim as a DBOW-mode option;
    # the repr reports "dm/c", so confirm this flag has any effect here.
    dbow_words=1,
    workers=6,      # [t6]
)

In [4]:
# Corpus file: one `tag<TAB>sentence` record per line (the format read_corpus parses).
file_tokenized = 'dump/doc0909.csv'

# sample=1 keeps every line and random dropping is off by default, so this
# generator streams the whole corpus exactly once.
train_gr = read_corpus(fname=file_tokenized, offset=0, sample=1)

In [5]:
# Build the model vocabulary from the corpus stream. train_gr is a one-shot
# generator (read_corpus uses `yield`), so it is exhausted after this call and
# has to be re-created with read_corpus() before the corpus can be read again.
model.build_vocab(train_gr)

In [6]:
# Show the model summary and, on the next line, the number of documents
# counted while building the vocabulary.
print(model, model.corpus_count, sep='\n')

Doc2Vec(dm/c,d50,n10,w10,mc5,s1e-05,t6)
12869583


In [None]:
%%time
# Manual epoch loop: each pass trains on a fresh random subsample of the corpus,
# then the learning rate is decayed for the next pass.
N_EPOCHS = 10
DROP_RATE = 0.5      # probability that read_corpus skips any given line
ALPHA_DECAY = 0.99   # multiplicative learning-rate decay applied after each pass

# Expected number of documents that survive the random drop. Derived from
# DROP_RATE so it cannot drift out of sync with the value given to read_corpus
# (for DROP_RATE = 0.5 this equals model.corpus_count // 2).
examples_per_epoch = int(model.corpus_count * (1 - DROP_RATE))

for epoch in range(N_EPOCHS):
    # Pin min_alpha to the current alpha so this single-epoch train() call
    # runs with one learning rate from start to end.
    model.min_alpha = model.alpha
    # read_corpus returns a one-shot generator, so build a new one per pass;
    # drop=True makes every pass see a different random subset of lines.
    tr = read_corpus(file_tokenized, sample=1, offset=0, drop=True, dropth=DROP_RATE)
    model.train(tr, total_examples=examples_per_epoch, epochs=1)
    model.alpha *= ALPHA_DECAY

In [None]:
# Persist the trained model; the relative path resolves against the notebook's
# current working directory.
model.save('title2comment.model')

In [None]:
# Sanity-check the learned word vectors: for each probe word, print the
# entries the model reports as most similar to it.
probe_words = ['金城武', '肥宅', '妹妹', '豆導', '安安', '陳水扁']
for probe in probe_words:
    print('@', model.most_similar([probe]))



In [None]:
# Infer a vector for an unseen, pre-tokenized sentence and list the training
# documents whose vectors are most similar to it.
sent = '館長 和 朱雪璋 PK 誰 贏'
inferred_vector = model.infer_vector(sent.split(), alpha=0.001, min_alpha=0.0001)
# print(inferred_vector)
most_similar = model.docvecs.most_similar([inferred_vector])
for tag, score in most_similar:
    # Look the tag up in the tokenized corpus to show the matching document text.
    # The command is passed as an argument list (no shell), so characters in
    # `tag` can never be interpreted by a shell. The '\t' here is a literal
    # tab character, exactly as the original quoted shell string passed it.
    # NOTE(review): `tag` is still interpolated unescaped into a -P regex.
    # NOTE(review): this path differs from the 'dump/doc0909.csv' file used for
    # training above — confirm the tags in both files refer to the same documents.
    cmd = ['grep', '-P', '^{tag}\t'.format(tag=tag), '/var/local/okbot/tokenized0605.csv']
    print(tag, score)
    print(sp.run(cmd, stdout=sp.PIPE).stdout.decode('utf-8').strip())


In [None]:
# Scratch kept for reference (not executed): comparing stored document vectors by tag.
# sent = '有 無 金城武 是 八卦 最好 釣 的 名人 的 八卦 ?'
# model.docvecs.similarity(2, 3)
# indexies = model.docvecs.indexed_doctags(['420194', '421657', '437505', '472723', '433785', '484674', '523031'])[0]
# ref, test = indexies[0], indexies[1:]
# [model.docvecs.similarity(ref, i) for i in test]

# Compare two unseen, pre-tokenized sentences. The call is the cell's last
# expression, so the notebook displays the returned similarity.
sent1, sent2 = (
    text.split()
    for text in ('異形 讓 豆導 來 導 會 怎樣 ？', '豆導 拍 異形 會 怎樣 ？')
)
model.docvecs.similarity_unseen_docs(model, sent1, sent2, alpha=0.001, min_alpha=0.0001)