In [1]:
from pymongo.mongo_client import MongoClient
import gensim

In [2]:
client = MongoClient()
collection = client.arxiv.articles

In [12]:
docs = []
for item in collection.find():
    category = item['arxiv_primary_category']['term']
#     words = gensim.utils.simple_preprocess(category + ' ' + item['summary'])
    content = ' '.join([item['title'], item['author'], item['summary']])
    words = gensim.utils.simple_preprocess(content)
    tags = [item['id'], category, item['author']] 
    docs.append(gensim.models.doc2vec.TaggedDocument(words, tags))

In [24]:
model = gensim.models.doc2vec.Doc2Vec(docs, workers=4)

In [28]:
model.save('../models/doc2vec_22k_cat_author_id')

In [25]:
model.docvecs.doctags

{'Joshua Zahl': Doctag(index=37145, word_count=172, doc_count=1),
 'http://arxiv.org/abs/1305.3252v2': Doctag(index=39180, word_count=177, doc_count=1),
 'Alexander S. Sorin': Doctag(index=34036, word_count=152, doc_count=1),
 'Arjen van Vliet': Doctag(index=19083, word_count=131, doc_count=1),
 'Eustrat Zhupa': Doctag(index=15019, word_count=74, doc_count=1),
 'Marcos Amaku': Doctag(index=26486, word_count=180, doc_count=1),
 'Roberto Imbuzeiro Oliveira': Doctag(index=33698, word_count=156, doc_count=1),
 'http://arxiv.org/abs/1510.06679v1': Doctag(index=2877, word_count=118, doc_count=1),
 'http://arxiv.org/abs/1404.6576v2': Doctag(index=21242, word_count=201, doc_count=1),
 'Ngai-Ching Wong': Doctag(index=21428, word_count=76, doc_count=1),
 'Elena R. Loubenets': Doctag(index=31108, word_count=310, doc_count=2),
 'Günter M. Ziegler': Doctag(index=1765, word_count=300, doc_count=3),
 'Bennett D. Marshall': Doctag(index=4846, word_count=135, doc_count=1),
 'http://arxiv.org/abs/1407.4

In [27]:
model.docvecs.most_similar(positive=['Joshua Zahl'])

[('http://arxiv.org/abs/1407.5705v2', 0.40248221158981323),
 ('Rom Pinchasi', 0.3270086348056793),
 ('http://arxiv.org/abs/1212.6469v1', 0.3254002332687378),
 ('Sergey Sazonov', 0.30794307589530945),
 ('Valentino Tosatti', 0.3076821565628052),
 ('http://arxiv.org/abs/1506.08262v2', 0.3061351180076599),
 ('http://arxiv.org/abs/1401.4797v4', 0.3058221936225891),
 ('http://arxiv.org/abs/1401.4372v1', 0.30545973777770996),
 ('Jair Taylor', 0.3029690980911255),
 ('http://arxiv.org/abs/1409.4400v2', 0.3021334409713745)]

In [30]:
vector_vec = model.infer_vector('vector')

In [31]:
model.most_similar(positive=[vector_vec])

[('classifies', 0.22874610126018524),
 ('meanwhile', 0.22553092241287231),
 ('showcase', 0.22311025857925415),
 ('intervening', 0.21904277801513672),
 ('leveraged', 0.1990986168384552),
 ('annuity', 0.19818773865699768),
 ('solve', 0.1974664032459259),
 ('ungauged', 0.19345450401306152),
 ('announced', 0.19281189143657684),
 ('leffler', 0.1927436888217926)]

In [29]:
ai_related = model.docvecs.most_similar(positive=['q-fin.CP'], topn=200)
ai_related

[('Tomaso Aste', 0.686176061630249),
 ('stat.OT', 0.6859711408615112),
 ('cs.GR', 0.685437023639679),
 ('stat.CO', 0.6844007968902588),
 ('q-fin.RM', 0.6785334348678589),
 ('q-fin.TR', 0.6756480932235718),
 ('q-fin.PM', 0.6703044772148132),
 ('http://arxiv.org/abs/1401.1457v2', 0.6671109199523926),
 ('cs.CE', 0.6623086929321289),
 ('Ali H. Sayed', 0.65728759765625),
 ('q-fin.GN', 0.6509248614311218),
 ('cs.MS', 0.6414000988006592),
 ('cs.RO', 0.6395535469055176),
 ('cs.PL', 0.6346538662910461),
 ('cs.OH', 0.6334890127182007),
 ('cs.MM', 0.6281592845916748),
 ('http://arxiv.org/abs/1412.5332v2', 0.6265527009963989),
 ('physics.data-an', 0.6242257356643677),
 ('stat.AP', 0.6237746477127075),
 ('Mikhail Rozhkov', 0.6106544733047485),
 ('Francois Septier', 0.6095578670501709),
 ('http://arxiv.org/abs/1504.05806v1', 0.6066292524337769),
 ('stat.ML', 0.6044270992279053),
 ('http://arxiv.org/abs/1505.02070v1', 0.6010339260101318),
 ('q-bio.QM', 0.6008787155151367),
 ('http://arxiv.org/abs/151

In [52]:
ai_related_not_in_cs = [(cat,sim) for cat,sim in ai_related if not cat.startswith('cs')]
ai_related_not_in_cs

[('stat.ML', 0.8434848785400391),
 ('stat.ME', 0.7796870470046997),
 ('q-fin.PM', 0.7789666652679443),
 ('stat.CO', 0.7604587078094482),
 ('math.OC', 0.7492505311965942),
 ('q-fin.RM', 0.7398270964622498),
 ('q-bio.MN', 0.728412389755249),
 ('stat.AP', 0.7281472682952881),
 ('math.ST', 0.7264813780784607),
 ('q-bio.QM', 0.7107497453689575),
 ('q-bio.NC', 0.643532395362854),
 ('q-fin.CP', 0.6434652805328369),
 ('physics.soc-ph', 0.6190789937973022),
 ('q-bio.PE', 0.6080213189125061),
 ('q-bio.GN', 0.6000803112983704),
 ('q-fin.PR', 0.5744031071662903),
 ('stat.OT', 0.5644927024841309),
 ('physics.data-an', 0.5632250905036926),
 ('q-fin.GN', 0.5381014943122864),
 ('physics.ed-ph', 0.5378150343894958),
 ('q-fin.EC', 0.5174562335014343),
 ('q-fin.TR', 0.47576916217803955),
 ('math.NA', 0.47322243452072144),
 ('math.HO', 0.46594637632369995),
 ('physics.comp-ph', 0.4648459255695343),
 ('q-fin.ST', 0.4260466694831848),
 ('q-fin.MF', 0.41514381766319275),
 ('q-bio.OT', 0.39239686727523804),
 

In [48]:
def most_similar_category(cat, in_other_cat=None):
    relateds = model.docvecs.most_similar(cat, topn=200)
    if in_other_cat is not None:
        return [(cat,sim) for cat,sim in relateds if cat.startswith(in_other_cat)]
    return relateds