In [73]:
from arxiv_record import arxivRecord

In [74]:
import json

with open('arxiv_database.json', 'r') as json_file:
    records = json.load(json_file)

In [52]:
subject_dict = { 'Mathematics - Algebraic Geometry' : 'AG',
 'Mathematics - Algebraic Topology' : 'AT',
 'Mathematics - Analysis of PDEs' : 'AP',
 'Mathematics - Category Theory' : 'CT',
 'Mathematics - Classical Analysis and ODEs' : 'CA',
 'Mathematics - Combinatorics' : 'CO',
 'Mathematics - Commutative Algebra' : 'AC',
 'Mathematics - Complex Variables' : 'CV',
 'Mathematics - Differential Geometry' : 'DG',
 'Mathematics - Dynamical Systems' : 'DS',
 'Mathematics - Functional Analysis' : 'FA',
 'Mathematics - General Mathematics' : 'GM',
 'Mathematics - General Topology' : 'GN',
 'Mathematics - Geometric Topology' : 'GT',
 'Mathematics - Group Theory' : 'GR',
 'Mathematics - History and Overview' : 'HO',
 'Mathematics - K-Theory and Homology' : 'KT',
 'Mathematics - Logic' : 'LO',
 'Mathematics - Metric Geometry' : 'MG',
 'Mathematics - Number Theory' : 'NP',
 'Mathematics - Numerical Analysis' : 'NA',
 'Mathematics - Operator Algebras' : 'OA',
 'Mathematics - Optimization and Control' : 'OC',
 'Mathematics - Probability' : 'PR',
 'Mathematics - Quantum Algebra' : 'QA',
 'Mathematics - Representation Theory' : 'RT',
 'Mathematics - Rings and Algebras' : 'RA',
 'Mathematics - Spectral Theory' : 'SP',
 'Mathematics - Statistics Theory' : 'ST',
 'Mathematics - Symplectic Geometry' : 'SG'}

In [75]:
# change the look of the subjects
for r in records:
    for k in range(len(r['subjects'])):
        r['subjects'][k] = subject_dict[r['subjects'][k]]

In [76]:
# we keep only records which have assigned a math subject and we choose the first one as the main one 
records = [r for r in records if len(r['subjects']) > 0]

for r in records:
    r['subject'] = r['subjects'][0]

In [92]:
set([r['date'][:4] for r in records])

{'2007', '2008', '2009', '2010'}

In [80]:
sa = records[0]['abstract']
all_abstracts = ' '.join([r['abstract'] for r in records])

In [81]:
# we find the most popular words occuring in the abstracts
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

default_stopwords = set(stopwords.words('english'))

words_in_ab = [word.lower() for word in word_tokenize(all_abstracts) if len(word) > 1]
words_in_ab = [word for word in words_in_ab if not word in default_stopwords]
afq = FreqDist(words_in_ab)
most_common_words = [p[0] for p in afq.most_common(200)]

In [82]:
# we prepare the feature set of the records 

def abstract_text_feature(a):
    result = dict()
    for w in most_common_words:
        result[w] = int(w in a)
    return result

feature_set = [(abstract_text_feature(r['abstract']), r['subject']) for r in records]

In [83]:
from random import shuffle

# amount of data we use to train
train_amount = int(len(feature_set)*0.9)

shuffle(feature_set)
train_set = feature_set[:train_amount]
test_set = feature_set[train_amount:]

In [84]:
import nltk

# we train a NaiveBayes classifier to predict the category of the paper 
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [85]:
nltk.classify.accuracy(classifier, test_set)

0.4229940119760479

In [86]:
classifier.show_most_informative_features(20)

Most Informative Features
                category = 1                  CT : CA     =    524.8 : 1.0
                  random = 1                  PR : DG     =    343.1 : 1.0
                  groups = 1                  GR : NA     =    316.5 : 1.0
              projective = 1                  AG : AP     =    237.1 : 1.0
               curvature = 1                  DG : NP     =    206.2 : 1.0
            distribution = 1                  ST : RA     =    204.9 : 1.0
                 quantum = 1                  QA : CV     =    201.7 : 1.0
             topological = 1                  GN : ST     =    198.3 : 1.0
             probability = 1                  PR : AG     =    178.7 : 1.0
                manifold = 1                  SG : AC     =    175.3 : 1.0
               manifolds = 1                  SG : AC     =    170.7 : 1.0
              asymptotic = 1                  ST : AT     =    169.4 : 1.0
                algebras = 1                  RA : PR     =    166.7 : 1.0