# Doc2Vec example

adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [1]:
import json
from collections import namedtuple

import gensim

Document = namedtuple('Document', 'words tags')

alldocs = []  # will hold all docs in original order
with open('../data/c_twitter.json') as f_in:
    for line_no, line in enumerate(f_in):
        c_twt = json.loads(line)
        # c_twt: {'weeknum': str, 'c_text': str, 'tags': [str]}
        words = c_twt['c_text'].split()
        tags = [line_no] + c_twt['tags']
        alldocs.append(Document(words, tags))


In [2]:
import multiprocessing

from gensim.models import Doc2Vec
import gensim.models.doc2vec


cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000),
    # PV-DBOW 
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000),
]

# currently running out of memory
simple_models[0].build_vocab(alldocs)
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)


Doc2Vec(dm/c,d100,n5,w5,mc2,t6)
Doc2Vec(dbow,d100,n5,mc2,t6)
Doc2Vec(dm/m,d100,n5,w10,mc2,t6)


In [3]:
passes = 3

for epoch in range(passes):
    for model in simple_models:
        model.train(alldocs)
        print(epoch, model.most_similar('hiv', topn=10))

0 [('Girlfriend', 0.38768646121025085), ('Identify', 0.38360071182250977), ('amfAR', 0.33817023038864136), ('govt', 0.32246994972229004), ('literally', 0.29881635308265686), ('Way', 0.2954923212528229), ('Im', 0.29524320363998413), ("'Whine'", 0.29112979769706726), ('clinics', 0.2860417068004608), ('half', 0.2801520824432373)]
0 [('Girlfriend', 0.38768646121025085), ('Identify', 0.38360071182250977), ('amfAR', 0.33817023038864136), ('govt', 0.32246994972229004), ('literally', 0.29881635308265686), ('Way', 0.2954923212528229), ('Im', 0.29524320363998413), ("'Whine'", 0.29112979769706726), ('clinics', 0.2860417068004608), ('half', 0.2801520824432373)]
0 [('rabies', 0.7555388808250427), ('syphilis', 0.7541437745094299), ('HIV', 0.747816264629364), ('gonorrhea', 0.7270178198814392), ('pneumonia', 0.7034633159637451), ('chlamydia', 0.6978621482849121), ('meningitis', 0.6813951134681702), ('coldflu', 0.6686193346977234), ('Hiv', 0.661539614200592), ('flu-', 0.658343493938446)]
1 [('Girlfrien

In [4]:
# do visualization/clustering of users
# do visualization/clustering of hashtags
# do visualization/clustering of documents