# Introduction

Data Source: https://www.kaggle.com/datasets/Cornell-University/arxiv

## Imports

In [1]:
import numpy as np
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import requests

In [2]:
# Location of the arXiv metadata snapshot; it is read line by line and each
# line is decoded with json.loads in the cells that follow.
file_path = 'Input/arxiv-metadata-oai-snapshot.json'

## Load Data from Json

In [3]:
def get_research_paper(data_file):
    """
    Lazily yield the raw lines of a text file, one line at a time.

    The file is streamed rather than loaded whole, so callers can iterate
    over very large snapshots without holding them in memory.

    Parameters
    ----------
    data_file : str or path-like
        Path of the file to read (one JSON record per line is expected by
        the callers, which decode each line with ``json.loads``).

    Yields
    ------
    str
        Each line of the file, including its trailing newline if present.
    """
    # Explicit encoding so the result does not depend on the platform's
    # locale default (e.g. cp1252 on Windows), which can fail or mis-decode
    # non-ASCII author names and titles.
    with open(data_file, 'r', encoding='utf-8') as f:
        yield from f

In [4]:
# Generator over the raw lines of the snapshot; nothing is read until it is iterated.
test_paper = get_research_paper(file_path)

In [5]:
# Peek at the first record only, to see which fields a paper carries.
each_paper_data = next(test_paper, None)
if each_paper_data is not None:
    dict_paper_data = json.loads(each_paper_data)
    print(dict_paper_data)
    # Other values worth inspecting here:
    #     dict_paper_data.get('categories')
    #     pd.to_datetime(dict_paper_data.get('update_date')).year

{'id': '0704.0001', 'submitter': 'Pavel Nadolsky', 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan", 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies', 'comments': '37 pages, 15 figures; published version', 'journal-ref': 'Phys.Rev.D76:013009,2007', 'doi': '10.1103/PhysRevD.76.013009', 'report-no': 'ANL-HEP-PR-07-12', 'categories': 'hep-ph', 'license': None, 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab

Only the following fields are needed: title, categories, abstract, id, journal-ref, authors_parsed.

The full list of categories is available at https://arxiv.org/category_taxonomy

## Get Category Codes

In [6]:
# Scrape the arXiv taxonomy page into {category code: description}.
# A timeout keeps the cell from hanging forever, and raise_for_status()
# turns an HTTP error page into an exception instead of an empty mapping.
r = requests.get('https://arxiv.org/category_taxonomy', timeout=30)
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser" warning).
soup = bs(r.content, 'html.parser')

dict_categories_descriptions = {}
for each_cat in soup.find_all(attrs={"class" : "column is-one-fifth"}):
    heading = each_cat.h4
    if heading is None:
        # Column without an <h4> heading: not a category entry.
        continue
    # Headings are split on ' (' into the code and the parenthesised
    # description, e.g. 'cs.AI (Artificial Intelligence)'.
    _text = heading.text
    _parts = _text.split(' (')
    if len(_parts) < 2:
        # Heading without a parenthesised description: skip it.
        continue
    _key = _parts[0]
    _value = _parts[1].replace(')', '')
    dict_categories_descriptions[_key] = _value



In [7]:
# Start from an empty frame; its columns are filled from the scraped category mapping.
df_categories_metadata = pd.DataFrame()

In [8]:
# One row per scraped category: its code in `cat_code`, its description in `cat_desc`.
df_categories_metadata['cat_code'] = list(dict_categories_descriptions)
df_categories_metadata['cat_desc'] = [*dict_categories_descriptions.values()]

In [9]:
# Category-code prefix -> name of the parent subject area.
# Codes that match none of these prefixes are reported as 'Physics' by
# get_parent_categories, because Physics uses several different prefixes.
cat_map = {
    'cs.': 'Computer Science',
    'econ.': 'Economics',
    'eess.': 'Electrical and Electronics',
    'math.': 'Mathematics',
    'q-bio.': 'Quantitative Biology',
    'q-fin.': 'Quantitative Finance',
    'stat.': 'Statistics',
}

In [10]:
def get_parent_categories(x):
    """
    Return the parent subject area for the category code ``x``.

    The code is tested against each prefix in ``cat_map`` in insertion
    order and the first match wins. A code that matches no prefix is
    reported as 'Physics', since Physics categories use several
    different prefixes.
    """
    matching_parents = (
        parent for prefix, parent in cat_map.items() if x.startswith(prefix)
    )
    return next(matching_parents, 'Physics')

In [11]:
# Tag every category code with its parent subject area, then display the table.
df_categories_metadata['parent_cat']  = df_categories_metadata['cat_code'].apply(get_parent_categories)
df_categories_metadata

Unnamed: 0,cat_code,cat_desc,parent_cat
0,cs.AI,Artificial Intelligence,Computer Science
1,cs.AR,Hardware Architecture,Computer Science
2,cs.CC,Computational Complexity,Computer Science
3,cs.CE,"Computational Engineering, Finance, and Science",Computer Science
4,cs.CG,Computational Geometry,Computer Science
...,...,...,...
150,stat.CO,Computation,Statistics
151,stat.ME,Methodology,Statistics
152,stat.ML,Machine Learning,Statistics
153,stat.OT,Other Statistics,Statistics


In [12]:
# Save the category lookup table as CSV, leaving out the row index.
df_categories_metadata.to_csv('Outputs/arxiv_cat_codes.csv', index=False)

In [13]:
# Collect the whitespace-stripped codes of every Computer Science ('cs.') category.
_cat_codes = df_categories_metadata['cat_code']
_is_cs = _cat_codes.str.startswith('cs.')
all_cs_categories = _cat_codes[_is_cs].str.strip().tolist()
print(all_cs_categories)

['cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB', 'cs.DC', 'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF', 'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY']


## Corpus Date Range Analysis 

In [14]:
# Full pass over the snapshot, keeping only each record's `update_date` value.
list_dates = [
    json.loads(_data)['update_date']
    for _data in get_research_paper(file_path)
]

In [15]:
class DatasetInfo:
    """
    Corpus-level summary computed from ``list_dates`` when the class is defined.

    The values are plain class attributes, read as e.g. ``DatasetInfo.max_date``.
    """
    # min()/max() compare the raw date values as collected; presumably these
    # are ISO 'YYYY-MM-DD' strings, for which string order equals
    # chronological order -- TODO confirm.
    max_date = pd.to_datetime(max(list_dates))
    min_date = pd.to_datetime(min(list_dates))
    # Number of records that contributed a date.
    total_length = len(list_dates)

## DataClass for ResearchPaper Objects

In [16]:
from dataclasses import dataclass
from dataclasses import field

@dataclass
class ResearchPaper:
    """
    One arXiv paper, reduced to the fields kept for analysis.

    The annotations are descriptive only; the dataclass does not validate
    or convert the values it is given.
    """
    # arXiv identifier such as '0704.0001' -- a string, not an integer.
    res_paper_id: str
    # Category codes of the paper; the loader passes a set built by
    # splitting the record's space-separated 'categories' field.
    category: set
    title: str
    abstract: str
    # Value of the record's 'authors_parsed' field.
    list_authors: list
    # Parsed 'update_date' of the record, and its year for quick filtering.
    updated_date: pd.Timestamp
    updated_year: int

In [17]:
# Year of the most recent update in the corpus; the expression below shows
# the first year of the 5-year window used when filtering papers.
max_year = DatasetInfo.max_date.year
max_year - 5

2018

In [18]:
# Build the list of Computer Science papers updated in the last 5 years
# of the corpus.

# Loop invariants: compute the year window and the CS category set once
# instead of once per record.
max_year = DatasetInfo.max_date.year
start_year = max_year - 5
cs_category_set = frozenset(all_cs_categories)

research_papers = []
for each_paper_data in get_research_paper(file_path):
    dict_paper_data = json.loads(each_paper_data)

    # 'categories' is a space-separated string of codes; a missing or null
    # value is treated as "no categories" rather than crashing the whole run.
    categories = dict_paper_data.get('categories') or ''
    list_categories = set(categories.split(' '))

    # Cheap set test first, so the date is only parsed for CS papers.
    if cs_category_set.isdisjoint(list_categories):
        continue

    updated_date = pd.to_datetime(dict_paper_data.get('update_date'))
    # Written as `not (>=)` so that an unparseable date (NaT, whose year is
    # NaN) is still excluded, exactly as a plain `>=` filter would do.
    if not updated_date.year >= start_year:
        continue

    # No try/except here: the dataclass performs no validation, so building
    # it from keyword arguments cannot fail, and a blanket `except: pass`
    # would only hide genuine bugs.
    rp_obj = ResearchPaper(
        res_paper_id=dict_paper_data.get('id'),
        category=list_categories,
        title=dict_paper_data.get('title'),
        abstract=dict_paper_data.get('abstract'),
        list_authors=dict_paper_data.get('authors_parsed'),
        updated_date=updated_date,
        updated_year=updated_date.year,
    )
    research_papers.append(rp_obj)

In [19]:
# from operator import attrgetter
# max(research_papers, key=attrgetter('updated_date'))
# min(research_papers, key=attrgetter('updated_date'))

In [20]:
# Turn the list of ResearchPaper objects into a table, one row per paper.
df = pd.DataFrame(research_papers)

In [21]:
# Print the column dtypes, non-null counts and memory usage of the table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357620 entries, 0 to 357619
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   res_paper_id  357620 non-null  object        
 1   category      357620 non-null  object        
 2   title         357620 non-null  object        
 3   abstract      357620 non-null  object        
 4   list_authors  357620 non-null  object        
 5   updated_date  357620 non-null  datetime64[ns]
 6   updated_year  357620 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 19.1+ MB


In [23]:
# Pickle the filtered DataFrame to disk.
df.to_pickle('Outputs/cs_papers_2018_2023.pkl')