In [1]:
#!pip install gensim

In [3]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [4]:
papers_df = pd.read_pickle('papers.pkl')

In [5]:
papers_df.shape

(24256, 8)

In [5]:
papers_df.head(10)

Unnamed: 0,arxiv_id,arxiv_primary_category,authors,pdf_url,published,summary,title,paper_id
0,http://arxiv.org/abs/cs/9308101v1,AI,M. L. Ginsberg,http://arxiv.org/pdf/cs/9308101v1,1993-08-01T00:00:00Z,Because of their occasional need to return to ...,Dynamic Backtracking,1_AI
1,http://arxiv.org/abs/cs/9308102v1,AI,M. P. Wellman,http://arxiv.org/pdf/cs/9308102v1,1993-08-01T00:00:00Z,Market price systems constitute a well-underst...,A Market-Oriented Programming Environment and ...,2_AI
2,http://arxiv.org/abs/cs/9309101v1,AI,"I. P. Gent, T. Walsh",http://arxiv.org/pdf/cs/9309101v1,1993-09-01T00:00:00Z,We describe an extensive study of search in GS...,An Empirical Analysis of Search in GSAT,3_AI
3,http://arxiv.org/abs/cs/9311101v1,AI,"F. Bergadano, D. Gunetti, U. Trinchero",http://arxiv.org/pdf/cs/9311101v1,1993-11-01T00:00:00Z,As real logic programmers normally use cut (!)...,The Difficulties of Learning Logic Programs wi...,4_AI
4,http://arxiv.org/abs/cs/9311102v1,AI,"J. C. Schlimmer, L. A. Hermens",http://arxiv.org/pdf/cs/9311102v1,1993-11-01T00:00:00Z,To support the goal of allowing users to recor...,Software Agents: Completing Patterns and Const...,5_AI
5,http://arxiv.org/abs/cs/9312101v1,AI,"M. Buchheit, F. M. Donini, A. Schaerf",http://arxiv.org/pdf/cs/9312101v1,1993-12-01T00:00:00Z,Terminological knowledge representation system...,Decidable Reasoning in Terminological Knowledg...,6_AI
6,http://arxiv.org/abs/cs/9401101v1,AI,N. Nilsson,http://arxiv.org/pdf/cs/9401101v1,1994-01-01T00:00:00Z,A formalism is presented for computing and org...,Teleo-Reactive Programs for Agent Control,7_AI
7,http://arxiv.org/abs/cs/9402101v1,AI,C. X. Ling,http://arxiv.org/pdf/cs/9402101v1,1994-02-01T00:00:00Z,Learning the past tense of English verbs - a s...,Learning the Past Tense of English Verbs: The ...,8_AI
8,http://arxiv.org/abs/cs/9402102v1,AI,"D. J. Cook, L. B. Holder",http://arxiv.org/pdf/cs/9402102v1,1994-02-01T00:00:00Z,The ability to identify interesting and repeti...,Substructure Discovery Using Minimum Descripti...,9_AI
9,http://arxiv.org/abs/cs/9402103v1,AI,"M. Koppel, R. Feldman, A. M. Segre",http://arxiv.org/pdf/cs/9402103v1,1994-02-01T00:00:00Z,The theory revision problem is the problem of ...,Bias-Driven Revision of Logical Domain Theories,10_AI


In [6]:
summary_data = papers_df['summary'].tolist()

In [7]:
print(len(summary_data))

24256


In [8]:
#List of summary data from all papers
#summary_data

In [8]:
#preprocess text
processed_data = [TaggedDocument(words=word_tokenize(text.lower()), tags=[str(item)]) for item, text in enumerate(summary_data)]

In [9]:
#Initialize model with params :
#    dm = 1 .. to use distributed memory training algo
#.   min_count=1 .. ignore words lower than this frequency   
#    window = 25 length of window size 
#    workers = 5 .. threads for training
model = Doc2Vec(dm =1,window=25,alpha=0.025, min_alpha=0.00025,min_count=1,workers=5)
model.build_vocab(processed_data)

In [10]:
#train model
for i in range(100):
    print("Training epoch : ", i+1)
    model.train(processed_data, total_examples=model.corpus_count,epochs=1)

model.save('nlpmodel')    

Training epoch :  1
Training epoch :  2
Training epoch :  3
Training epoch :  4
Training epoch :  5
Training epoch :  6
Training epoch :  7
Training epoch :  8
Training epoch :  9
Training epoch :  10
Training epoch :  11
Training epoch :  12
Training epoch :  13
Training epoch :  14
Training epoch :  15
Training epoch :  16
Training epoch :  17
Training epoch :  18
Training epoch :  19
Training epoch :  20
Training epoch :  21
Training epoch :  22
Training epoch :  23
Training epoch :  24
Training epoch :  25
Training epoch :  26
Training epoch :  27
Training epoch :  28
Training epoch :  29
Training epoch :  30
Training epoch :  31
Training epoch :  32
Training epoch :  33
Training epoch :  34
Training epoch :  35
Training epoch :  36
Training epoch :  37
Training epoch :  38
Training epoch :  39
Training epoch :  40
Training epoch :  41
Training epoch :  42
Training epoch :  43
Training epoch :  44
Training epoch :  45
Training epoch :  46
Training epoch :  47
Training epoch :  48
T

In [11]:
rec_model = Doc2Vec.load('nlpmodel')
test_text = 'Because of their occasional need to return to shallow points in a search\ntree, existing backtracking methods can sometimes erase meaningful progress\ntoward solving a search problem. In this paper, we present a method by which\nbacktrack points can be moved deeper in the search space, thereby avoiding this\ndifficulty. The technique developed is a variant of dependency-directed\nbacktracking that uses only polynomial space while still providing useful\ncontrol information and retaining the completeness guarantees provided by\nearlier approaches.'
test_title = 'Dynamic Backtracking'
test_category = 'AI'
test_vector = rec_model.infer_vector([test_text.lower()])
similar_papers = rec_model.docvecs.most_similar([test_vector], topn = 10)
print(similar_papers)

[('2297', 0.3084923028945923), ('6470', 0.3019839823246002), ('11096', 0.3008095920085907), ('8206', 0.2832956910133362), ('1081', 0.28075459599494934), ('14552', 0.2792961597442627), ('1380', 0.2749897241592407), ('6358', 0.2732400596141815), ('2922', 0.2723822593688965), ('2674', 0.2711024284362793)]


In [12]:
title_data = papers_df['title'].tolist()
print (title_data)



In [13]:
category_data = papers_df['arxiv_primary_category'].tolist()
print(category_data)

['AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI', 'AI

In [15]:
print("Recommended papers are : ")
for i in similar_papers:
    paperid = int(i[0])
    print(paperid, category_data [paperid-1], title_data[paperid-1])

Recommended papers are : 
2297 AI Plan Recognition in Stories and in Life
6470 NE Guided macro-mutation in a graded energy based genetic algorithm for
  protein structure prediction
11096 SE Efficiently Manifesting Asynchronous Programming Errors in Android Apps
8206 SE IceCube's Development Environment
1081 AI Decision-Theoretic Planning with non-Markovian Rewards
14552 DB Annex: Radon - Rapid Discovery of Topological Relations
1380 AI Methods for computing state similarity in Markov Decision Processes
6358 CV Comparisons of wavelet functions in QRS signal to noise ratio
  enhancement and detection accuracy
2922 AI An argumentation system for reasoning with conflict-minimal
  paraconsistent ALC
2674 AI A Computational Model of Two Cognitive Transitions Underlying Cultural
  Evolution
