In [2]:
import numpy as np 
import pandas as pd
import dask.bag as db
import plotly.express as px

In [3]:
import json
import string
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import matplotlib.pyplot as plt

In [4]:
lines=db.read_text("arxiv-metadata-oai-snapshot.json") 

In [4]:
lines.take(2)

('{"id":"0704.0001","submitter":"Pavel Nadolsky","authors":"C. Bal\\\\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan","title":"Calculation of prompt diphoton production cross sections at Tevatron and\\n  LHC energies","comments":"37 pages, 15 figures; published version","journal-ref":"Phys.Rev.D76:013009,2007","doi":"10.1103/PhysRevD.76.013009","report-no":"ANL-HEP-PR-07-12","categories":"hep-ph","license":null,"abstract":"  A fully differential calculation in perturbative quantum chromodynamics is\\npresented for the production of massive photon pairs at hadron colliders. All\\nnext-to-leading order perturbative contributions from quark-antiquark,\\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\\nall-orders resummation of initial-state gluon radiation valid at\\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\\nspecified in which the calculation is most reliable. Good agreement is\\ndemonstrated with data from the Fermilab Tevatro

In [5]:
import json

records=lines.map(lambda x:json.loads(x))

records.take(2)

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [6]:
print("Type of First Record After JSON LOADS ",type(lines.take(1)[0]))
print("Type of First Record After JSON LOADS ",type(records.take(1)[0]))

Type of First Record After JSON LOADS  <class 'str'>
Type of First Record After JSON LOADS  <class 'dict'>


In [7]:
records_count=records.count()
records_count

<dask.bag.core.Item at 0x152192c5c70>

In [8]:
print("Number of Records in ArXiv Data is ",records_count.compute())

KeyboardInterrupt: 

In [None]:
ai_category_list=['stat.ML','cs.LG','cs.AI']

In [11]:
ai_docs = (records.filter(lambda x:any(ele in x['categories'] for ele in ai_category_list)==True))

In [12]:
print("Total Papers published in AI&ML ",ai_docs.count().compute())

Total Papers published in AI&ML  91496


In [16]:
extract_latest_version=lambda x:x['versions'][-1]["created"] ## Here -1 indicates the last element in the versions. 
extract_latest_version_year=lambda x:x['versions'][-1]["created"].split(" ")[3]

In [17]:
ai_docs_by_year=ai_docs.map(extract_latest_version_year).frequencies().to_dataframe(columns=['submission_year','num_submissions']).compute()

In [18]:
get_metadata = lambda x: {'id': x['id'],
                  'title': x['title'],
                  'category':x['categories'],
                  'abstract':x['abstract'],
                 'version':x['versions'][-1]['created']}

ai_papers=ai_docs.map(get_metadata).to_dataframe().compute()

In [32]:
ai_papers.to_json("ai_papers.json")

In [17]:
metadata = pd.read_json("ai_papers.json")

In [10]:
metadata.head()

Unnamed: 0,id,title,category,abstract,version
0,704.0047,Intelligent location of simultaneously active ...,cs.NE cs.AI,The intelligent acoustic emission locator is...,"Sun, 1 Apr 2007 13:06:50 GMT"
1,704.005,Intelligent location of simultaneously active ...,cs.NE cs.AI,Part I describes an intelligent acoustic emi...,"Sun, 1 Apr 2007 18:53:13 GMT"
2,704.0304,The World as Evolving Information,cs.IT cs.AI math.IT q-bio.PE,This paper discusses the benefits of describ...,"Wed, 13 Oct 2010 19:49:16 GMT"
3,704.0671,Learning from compressed observations,cs.IT cs.LG math.IT,The problem of statistical learning is to co...,"Thu, 5 Apr 2007 02:57:15 GMT"
4,704.0954,Sensor Networks with Random Links: Topology De...,cs.IT cs.LG math.IT,"In a sensor network, in practice, the commun...","Fri, 6 Apr 2007 21:58:52 GMT"


In [18]:
# remove duplicate records which contain different flags
metadata = metadata.drop(columns = 'category').groupby(by = ['id', 'title', 'abstract'],as_index = False).max()
# remove abstracts from withdrawn records
metadata = metadata[metadata['abstract'].str.contains('paper has been withdrawn') == False]
# lower abstract and remove numbers, punctuation, and special characters
#metadata['abstract'] = [a.strip() for a in metadata['abstract']]
metadata['abstract'] = [a.lower().strip() for a in metadata['abstract']]
metadata['abstract'] = metadata['abstract'].str.replace('\n', ' ', regex = False).str.replace(r'\s\s+', ' ', regex = True)
metadata['abstract'] = metadata['abstract'].str.replace('([.,!?()])', r' \1 ')

In [12]:
metadata['abstract']

0        the intelligent acoustic emission locator is d...
1        part i describes an intelligent acoustic emiss...
2        this paper discusses the benefits of describin...
3        the problem of statistical learning is to cons...
4        in a sensor network ,  in practice ,  the comm...
                               ...                        
91491    we outline the rationale and preliminary resul...
91492    in this article we give several new results on...
91493    in quantum physics ,  a measurement is represe...
91494    probability-like parameters appearing in some ...
91495    graphical models of probabilistic dependencies...
Name: abstract, Length: 91484, dtype: object

In [19]:
### PREPROCESSING FUNCTIONS ###
# lemmatizing function
def lemmatize_text(text_string, lemmatizer):
    word_list = nltk.word_tokenize(text_string)
    # Lemmatize list of words and join
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
    return lemmatized_output
### PREPROCESSING FOR EMBEDDING AND SKLEARN VECTORIZERS ###
metadata['abstract_no_punct'] = metadata['abstract'].str.replace('-', '', regex = False).str.replace(r'\n', ' ', regex = False).str.replace(r'[^a-z ]+', ' ', regex = True).str.replace(r'\s\s+', ' ', regex = True)
# lemmatize
lemmatizer = WordNetLemmatizer()
metadata['abstract_lemmatized'] = [lemmatize_text(ab, lemmatizer) for ab in metadata['abstract']]
# generate formatted stop words + single letters and spelled numbers (expand as necesary)
stopwords_nltk = set([re.sub( r'[^a-z ]+', '',s) for s in stopwords.words('english')] + list(string.ascii_lowercase) + ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten'])
# exclude stopwords
metadata['abstract_lemmatized_no_stopwords'] = metadata['abstract_lemmatized'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords_nltk]))

In [19]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
metadata.head()

Unnamed: 0,id,title,abstract,version,abstract_no_punct,abstract_lemmatized,abstract_lemmatized_no_stopwords
0,704.0047,Intelligent location of simultaneously active ...,the intelligent acoustic emission locator is d...,"Sun, 1 Apr 2007 13:06:50 GMT",the intelligent acoustic emission locator is d...,the intelligent acoustic emission locator is d...,intelligent acoustic emission locator describe...
1,704.005,Intelligent location of simultaneously active ...,part i describes an intelligent acoustic emiss...,"Sun, 1 Apr 2007 18:53:13 GMT",part i describes an intelligent acoustic emiss...,part i describes an intelligent acoustic emiss...,part describes intelligent acoustic emission l...
2,704.0304,The World as Evolving Information,this paper discusses the benefits of describin...,"Wed, 13 Oct 2010 19:49:16 GMT",this paper discusses the benefits of describin...,this paper discus the benefit of describing th...,paper discus benefit describing world informat...
3,704.0671,Learning from compressed observations,the problem of statistical learning is to cons...,"Thu, 5 Apr 2007 02:57:15 GMT",the problem of statistical learning is to cons...,the problem of statistical learning is to cons...,problem statistical learning construct predict...
4,704.0954,Sensor Networks with Random Links: Topology De...,"in a sensor network , in practice , the comm...","Fri, 6 Apr 2007 21:58:52 GMT",in a sensor network in practice the communicat...,"in a sensor network , in practice , the commun...","sensor network , practice , communication amon..."


In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

In [30]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(metadata['abstract'])

In [20]:
keywords = metadata['abstract_lemmatized_no_stopwords'].tolist()

from nltk.tokenize import word_tokenize
keywords = [word_tokenize(keyword.lower()) for keyword in keywords]

In [21]:
def no_commas(doc):
    no_commas = [t for t in doc if t!=',']
    return(no_commas)

keywords = [no_commas(kw) for kw in keywords]
processed_keywords = keywords

In [22]:
processed_keywords[0]

['intelligent',
 'acoustic',
 'emission',
 'locator',
 'described',
 'part',
 'part',
 'ii',
 'discus',
 'blind',
 'source',
 'separation',
 'time',
 'delay',
 'estimation',
 'location',
 'simultaneously',
 'active',
 'continuous',
 'acoustic',
 'emission',
 'source',
 '.',
 'location',
 'acoustic',
 'emission',
 'complicated',
 'aircraft',
 'frame',
 'structure',
 'difficult',
 'problem',
 'non-destructive',
 'testing',
 '.',
 'article',
 'describes',
 'intelligent',
 'acoustic',
 'emission',
 'source',
 'locator',
 '.',
 'intelligent',
 'locator',
 'comprises',
 'sensor',
 'antenna',
 'general',
 'regression',
 'neural',
 'network',
 'solves',
 'location',
 'problem',
 'based',
 'learning',
 'example',
 '.',
 'locator',
 'performance',
 'wa',
 'tested',
 'different',
 'test',
 'specimen',
 '.',
 'test',
 'shown',
 'accuracy',
 'location',
 'depends',
 'sound',
 'velocity',
 'attenuation',
 'specimen',
 'dimension',
 'tested',
 'area',
 'property',
 'stored',
 'data',
 '.',
 'location