# Doc2Vec example

Adapted from the gensim Doc2Vec IMDB notebook: https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [1]:
import json
from collections import namedtuple

import gensim

# Lightweight record for one document: its tokens plus its tags.
# Instances are used as the Doc2Vec training input later in this file.
Document = namedtuple('Document', 'words tags')

alldocs = []  # will hold all docs in original order
# JSON text is UTF-8; be explicit so decoding does not depend on the locale.
with open('../data/c_twitter.json', encoding='utf-8') as f_in:
    for line in f_in:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        c_twt = json.loads(line)
        # Fields read from each record: 'c_text' (whitespace-separated text),
        # 'index' and 'tags' (a list).
        # NOTE(review): an earlier note here listed 'weeknum' instead of
        # 'index' -- confirm the actual schema of c_twitter.json.
        words = c_twt['c_text'].split()
        tags = [c_twt['index']] + c_twt['tags']
        alldocs.append(Document(words, tags))

# A separate list over the same documents, so it can be reordered without
# disturbing the original order kept in alldocs.
shuffle_docs = list(alldocs)


In [None]:
import multiprocessing

from gensim.models import Doc2Vec
import gensim.models.doc2vec


cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

# Hyper-parameters shared by all three model variants below.
common_params = dict(size=100, negative=5, hs=0, min_count=2, workers=cores,
                     max_vocab_size=10000)

simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **common_params),
    # PV-DBOW
    Doc2Vec(dm=0, **common_params),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, window=10, **common_params),
]

# currently running out of memory
# Only the first model scans the corpus; each remaining model is then set up
# from it via reset_from.
vocab_owner = simple_models[0]
vocab_owner.build_vocab(alldocs)
print(vocab_owner)
for model in simple_models[1:]:
    model.reset_from(vocab_owner)
    print(model)


Doc2Vec(dm/c,d100,n5,w5,mc2,t6)
Doc2Vec(dbow,d100,n5,mc2,t6)
Doc2Vec(dm/m,d100,n5,w10,mc2,t6)


In [None]:
from random import shuffle

# Number of full passes over the corpus made by the loop below.
passes = 10

for epoch in range(passes):
    # Present the documents in a new random order on every pass.
    shuffle(shuffle_docs)
    for model in simple_models:
        # NOTE(review): some gensim releases require explicit
        # total_examples/epochs arguments to train() -- confirm for the
        # installed version.
        model.train(shuffle_docs)
        print(epoch, model.most_similar('hiv', topn=10))
        # n_similarity compares two *collections* of words. A bare string
        # would be iterated character by character ('h', 'i', 'v'), so each
        # word is wrapped in a one-element list.
        same_word_score = model.n_similarity(['hiv'], ['Hiv'])
        unrelated_score = model.n_similarity(['hiv'], ['carrots'])
        print("---similarity score (should increase)",
              same_word_score - unrelated_score)

In [None]:
# do visualization/clustering of users
# do visualization/clustering of hashtags
# do visualization/clustering of documents