# Doc2Vec example

Adapted from the gensim Doc2Vec IMDB notebook: https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [None]:
import json
from collections import namedtuple

import gensim

# Lightweight record for one training document: its token list plus the
# tags attached to it.
Document = namedtuple('Document', 'words tags')

alldocs = []  # will hold all docs in original order
# Name the encoding explicitly: JSON text is UTF-8, and relying on the
# platform's locale default can fail or mis-decode non-ASCII tweets.
with open('../data/c_twitter.json', encoding='utf-8') as f_in:
    for line in f_in:
        # Skip blank lines (e.g. a trailing newline at end of file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        c_twt = json.loads(line)
        # c_twt: {'weeknum': str, 'c_text': str, 'tags': [str]}
        words = c_twt['c_text'].split()
        # The first tag is the document's position in alldocs (equal to the
        # file line number when the file has no blank lines), followed by
        # the record's own tags.
        tags = [len(alldocs)] + c_twt['tags']
        alldocs.append(Document(words, tags))


In [None]:
import multiprocessing

from gensim.models import Doc2Vec
import gensim.models.doc2vec


# One training worker per CPU core reported by the OS.
cores = multiprocessing.cpu_count()
# Fail early when gensim reports FAST_VERSION <= -1
# (presumably meaning its optimized routines are unavailable - confirm in gensim docs).
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

# Hyper-parameters shared by all three model variants below.
shared_params = dict(
    size=100,
    negative=5,
    hs=0,
    min_count=2,
    workers=cores,
    max_vocab_size=10000,
)

simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **shared_params),
    # PV-DBOW
    Doc2Vec(dm=0, **shared_params),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, window=10, **shared_params),
]

# The vocabulary is built once, on the first model only.
# currently running out of memory
reference_model = simple_models[0]
reference_model.build_vocab(alldocs)
print(reference_model)

# The remaining models are reset from the first one instead of calling
# build_vocab again.
for model in simple_models[1:]:
    model.reset_from(reference_model)
    print(model)


In [None]:
# Number of outer rounds; each round trains every model once on all docs.
passes = 3

for epoch in range(passes):
    for model in simple_models:
        # NOTE(review): newer gensim releases appear to require explicit
        # total_examples/epochs arguments here - confirm against the
        # installed version.
        model.train(alldocs)
        # Print the ten entries most similar to 'hiv' after each round to
        # eyeball how the model is progressing.
        nearest = model.most_similar('hiv', topn=10)
        print(epoch, nearest)

In [None]:
# TODO: visualize/cluster users
# TODO: visualize/cluster hashtags
# TODO: visualize/cluster documents