In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset and the
# CSV files written later in this notebook live on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/Colab Notebooks/Part2

/content/drive/MyDrive/Colab Notebooks/Part2


In [None]:
import numpy as np
import pandas as pd
import os, json, gc, re, random

In [None]:
data_file = 'arxiv-metadata-oai-snapshot.json'


def get_metadata(path=None):
    """Lazily yield the raw lines of the metadata snapshot, one record per line.

    A generator is used so the file is streamed instead of being loaded into
    memory at once; callers decode each yielded line with ``json.loads``.

    Args:
        path: Optional path of the file to read. Defaults to the module-level
            ``data_file`` so existing ``get_metadata()`` calls are unchanged.

    Yields:
        str: Each non-blank line of the file, trailing newline included.
    """
    source = data_file if path is None else path
    # Explicit UTF-8 so decoding does not depend on the platform default encoding.
    with open(source, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at end of file); they
            # would otherwise make the callers' json.loads() raise.
            if line.strip():
                yield line

# Schema check: pull only the first record from the stream and print each of
# its fields on its own line.
metadata = get_metadata()
paper = next(metadata, None)
if paper is not None:
    for k, v in json.loads(paper).items():
        print(f'{k}: {v} \n')

id: 0704.0001 

submitter: Pavel Nadolsky 

authors: C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan 

title: Calculation of prompt diphoton production cross sections at Tevatron and
  LHC energies 

comments: 37 pages, 15 figures; published version 

journal-ref: Phys.Rev.D76:013009,2007 

doi: 10.1103/PhysRevD.76.013009 

report-no: ANL-HEP-PR-07-12 

categories: hep-ph 

license: None 

abstract:   A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
m

In [None]:
# Lookup table from arXiv category identifier (as found in the `categories`
# field of a metadata record) to a human-readable name.
# In this notebook only the keys are consumed: they form the whitelist of
# categories kept by the extraction loop.
# The names are NOT unique (e.g. 'cs.LG' and 'stat.ML' are both
# 'Machine Learning', 'math.MP' and 'math-ph' are both 'Mathematical Physics'),
# so this mapping must not be inverted.
category_map = {'astro-ph': 'Astrophysics',
                'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
                'astro-ph.EP': 'Earth and Planetary Astrophysics',
                'astro-ph.GA': 'Astrophysics of Galaxies',
                'astro-ph.HE': 'High Energy Astrophysical Phenomena',
                'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
                'astro-ph.SR': 'Solar and Stellar Astrophysics',
                'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
                'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
                'cond-mat.mtrl-sci': 'Materials Science',
                'cond-mat.other': 'Other Condensed Matter',
                'cond-mat.quant-gas': 'Quantum Gases',
                'cond-mat.soft': 'Soft Condensed Matter',
                'cond-mat.stat-mech': 'Statistical Mechanics',
                'cond-mat.str-el': 'Strongly Correlated Electrons',
                'cond-mat.supr-con': 'Superconductivity',
                'cs.AI': 'Artificial Intelligence',
                'cs.AR': 'Hardware Architecture',
                'cs.CC': 'Computational Complexity',
                'cs.CE': 'Computational Engineering, Finance, and Science',
                'cs.CG': 'Computational Geometry',
                'cs.CL': 'Computation and Language',
                'cs.CR': 'Cryptography and Security',
                'cs.CV': 'Computer Vision and Pattern Recognition',
                'cs.CY': 'Computers and Society',
                'cs.DB': 'Databases',
                'cs.DC': 'Distributed, Parallel, and Cluster Computing',
                'cs.DL': 'Digital Libraries',
                'cs.DM': 'Discrete Mathematics',
                'cs.DS': 'Data Structures and Algorithms',
                'cs.ET': 'Emerging Technologies',
                'cs.FL': 'Formal Languages and Automata Theory',
                'cs.GL': 'General Literature',
                'cs.GR': 'Graphics',
                'cs.GT': 'Computer Science and Game Theory',
                'cs.HC': 'Human-Computer Interaction',
                'cs.IR': 'Information Retrieval',
                'cs.IT': 'Information Theory',
                'cs.LG': 'Machine Learning',
                'cs.LO': 'Logic in Computer Science',
                'cs.MA': 'Multiagent Systems',
                'cs.MM': 'Multimedia',
                'cs.MS': 'Mathematical Software',
                'cs.NA': 'Numerical Analysis',
                'cs.NE': 'Neural and Evolutionary Computing',
                'cs.NI': 'Networking and Internet Architecture',
                'cs.OH': 'Other Computer Science',
                'cs.OS': 'Operating Systems',
                'cs.PF': 'Performance',
                'cs.PL': 'Programming Languages',
                'cs.RO': 'Robotics',
                'cs.SC': 'Symbolic Computation',
                'cs.SD': 'Sound',
                'cs.SE': 'Software Engineering',
                'cs.SI': 'Social and Information Networks',
                'cs.SY': 'Systems and Control',
                'econ.EM': 'Econometrics',
                'eess.AS': 'Audio and Speech Processing',
                'eess.IV': 'Image and Video Processing',
                'eess.SP': 'Signal Processing',
                'gr-qc': 'General Relativity and Quantum Cosmology',
                'hep-ex': 'High Energy Physics - Experiment',
                'hep-lat': 'High Energy Physics - Lattice',
                'hep-ph': 'High Energy Physics - Phenomenology',
                'hep-th': 'High Energy Physics - Theory',
                'math.AC': 'Commutative Algebra',
                'math.AG': 'Algebraic Geometry',
                'math.AP': 'Analysis of PDEs',
                'math.AT': 'Algebraic Topology',
                'math.CA': 'Classical Analysis and ODEs',
                'math.CO': 'Combinatorics',
                'math.CT': 'Category Theory',
                'math.CV': 'Complex Variables',
                'math.DG': 'Differential Geometry',
                'math.DS': 'Dynamical Systems',
                'math.FA': 'Functional Analysis',
                'math.GM': 'General Mathematics',
                'math.GN': 'General Topology',
                'math.GR': 'Group Theory',
                'math.GT': 'Geometric Topology',
                'math.HO': 'History and Overview',
                'math.IT': 'Information Theory',
                'math.KT': 'K-Theory and Homology',
                'math.LO': 'Logic',
                'math.MG': 'Metric Geometry',
                'math.MP': 'Mathematical Physics',
                'math.NA': 'Numerical Analysis',
                'math.NT': 'Number Theory',
                'math.OA': 'Operator Algebras',
                'math.OC': 'Optimization and Control',
                'math.PR': 'Probability',
                'math.QA': 'Quantum Algebra',
                'math.RA': 'Rings and Algebras',
                'math.RT': 'Representation Theory',
                'math.SG': 'Symplectic Geometry',
                'math.SP': 'Spectral Theory',
                'math.ST': 'Statistics Theory',
                'math-ph': 'Mathematical Physics',
                'nlin.AO': 'Adaptation and Self-Organizing Systems',
                'nlin.CD': 'Chaotic Dynamics',
                'nlin.CG': 'Cellular Automata and Lattice Gases',
                'nlin.PS': 'Pattern Formation and Solitons',
                'nlin.SI': 'Exactly Solvable and Integrable Systems',
                'nucl-ex': 'Nuclear Experiment',
                'nucl-th': 'Nuclear Theory',
                'physics.acc-ph': 'Accelerator Physics',
                'physics.ao-ph': 'Atmospheric and Oceanic Physics',
                'physics.app-ph': 'Applied Physics',
                'physics.atm-clus': 'Atomic and Molecular Clusters',
                'physics.atom-ph': 'Atomic Physics',
                'physics.bio-ph': 'Biological Physics',
                'physics.chem-ph': 'Chemical Physics',
                'physics.class-ph': 'Classical Physics',
                'physics.comp-ph': 'Computational Physics',
                'physics.data-an': 'Data Analysis, Statistics and Probability',
                'physics.ed-ph': 'Physics Education',
                'physics.flu-dyn': 'Fluid Dynamics',
                'physics.gen-ph': 'General Physics',
                'physics.geo-ph': 'Geophysics',
                'physics.hist-ph': 'History and Philosophy of Physics',
                'physics.ins-det': 'Instrumentation and Detectors',
                'physics.med-ph': 'Medical Physics',
                'physics.optics': 'Optics',
                'physics.plasm-ph': 'Plasma Physics',
                'physics.pop-ph': 'Popular Physics',
                'physics.soc-ph': 'Physics and Society',
                'physics.space-ph': 'Space Physics',
                'q-bio.BM': 'Biomolecules',
                'q-bio.CB': 'Cell Behavior',
                'q-bio.GN': 'Genomics',
                'q-bio.MN': 'Molecular Networks',
                'q-bio.NC': 'Neurons and Cognition',
                'q-bio.OT': 'Other Quantitative Biology',
                'q-bio.PE': 'Populations and Evolution',
                'q-bio.QM': 'Quantitative Methods',
                'q-bio.SC': 'Subcellular Processes',
                'q-bio.TO': 'Tissues and Organs',
                'q-fin.CP': 'Computational Finance',
                'q-fin.EC': 'Economics',
                'q-fin.GN': 'General Finance',
                'q-fin.MF': 'Mathematical Finance',
                'q-fin.PM': 'Portfolio Management',
                'q-fin.PR': 'Pricing of Securities',
                'q-fin.RM': 'Risk Management',
                'q-fin.ST': 'Statistical Finance',
                'q-fin.TR': 'Trading and Market Microstructure',
                'quant-ph': 'Quantum Physics',
                'stat.AP': 'Applications',
                'stat.CO': 'Computation',
                'stat.ME': 'Methodology',
                'stat.ML': 'Machine Learning',
                'stat.OT': 'Other Statistics',
                'stat.TH': 'Statistics Theory'}

In [None]:
from tqdm.notebook import tqdm

# Inclusive range of `update_date` years to keep
# (same selection as the original `1991 < year < 2010`).
FIRST_YEAR, LAST_YEAR = 1992, 2009

# One list per output column; all lists are appended to together, so they stay
# index-aligned and can be zipped into a DataFrame afterwards.
titles = []
abstracts = []
categories = []
authors = []
journal = []
doi = []
id = []  # NOTE: shadows the builtin `id`; name kept because the DataFrame cell reads it
update_date = []
versions = []

# Consider all categories in the `category_map` to be used during training and prediction
paper_categories = np.array(list(category_map.keys()))
# Set copy of the same keys: constant-time membership test per record instead
# of scanning the whole array for each record.
known_categories = set(category_map)

metadata = get_metadata()
for paper in tqdm(metadata):
    paper_dict = json.loads(paper)
    category = paper_dict.get('categories')
    date_str = paper_dict.get('update_date')    ### Example format: "2008-11-26"
    try:
        year = int(date_str[:4])
        # Exact match on the raw `categories` string: a record is kept only if
        # that whole string is itself a key of `category_map`.
        keep = category in known_categories and FIRST_YEAR <= year <= LAST_YEAR
    except (TypeError, ValueError):
        # Missing/malformed `update_date` or an unexpected `categories` type:
        # skip this record, but let anything else (e.g. KeyboardInterrupt) surface.
        continue
    if not keep:
        continue

    id.append(paper_dict.get('id'))
    update_date.append(date_str)
    authors.append(paper_dict.get('authors'))
    titles.append(paper_dict.get('title'))
    abstracts.append(paper_dict.get('abstract'))
    categories.append(category)
    journal.append(paper_dict.get('journal-ref'))
    versions.append(paper_dict.get('versions'))
    doi.append(paper_dict.get('doi'))

len(id), len(authors), len(titles), len(abstracts), len(categories), len(journal), len(doi)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




(289917, 289917, 289917, 289917, 289917, 289917, 289917)

In [None]:
# Assemble the per-field lists gathered by the extraction loop into one table.
# The (column name, values) pairs are listed in the order the columns appear.
papers = pd.DataFrame(dict([
    ('id', id),
    ('update_date', update_date),
    ('author', authors),
    ('title', titles),
    ('abstract', abstracts),
    ('categories', categories),
    ('journal', journal),
    ('versions', versions),
    ('doi', doi),
]))

In [None]:
# Display the first rows of the assembled table as a quick sanity check.
papers.head()

Unnamed: 0,id,update_date,author,title,abstract,categories,journal,versions,doi
0,704.0001,2008-11-26,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,"Phys.Rev.D76:013009,2007","[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",10.1103/PhysRevD.76.013009
1,704.0003,2008-01-13,Hongjun Pan,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",
2,704.0004,2007-05-23,David Callan,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",
3,704.0007,2008-11-26,"Alejandro Corichi, Tatjana Vukasinac and Jose ...",Polymer Quantum Mechanics and its Continuum Limit,A rather non-standard quantum representation...,gr-qc,"Phys.Rev.D76:044016,2007","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",10.1103/PhysRevD.76.044016
4,704.0008,2009-02-05,Damian C. Swift,Numerical solution of shock and ramp compressi...,A general formulation was developed to repre...,cond-mat.mtrl-sci,"Journal of Applied Physics, vol 104, 073536 (2...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",10.1063/1.2975338


In [None]:
def _archive_of(category):
    """Return the part of a category string before its first '.' (all of it if there is none)."""
    return category.split('.', 1)[0]


# Coarse label: 'physics.gen-ph' -> 'physics', while 'hep-ph' stays 'hep-ph'.
papers['general_category'] = papers['categories'].map(_archive_of)
papers.head()

Unnamed: 0,id,update_date,author,title,abstract,categories,journal,versions,doi,general_category
0,704.0001,2008-11-26,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,"Phys.Rev.D76:013009,2007","[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",10.1103/PhysRevD.76.013009,hep-ph
1,704.0003,2008-01-13,Hongjun Pan,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",,physics
2,704.0004,2007-05-23,David Callan,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",,math
3,704.0007,2008-11-26,"Alejandro Corichi, Tatjana Vukasinac and Jose ...",Polymer Quantum Mechanics and its Continuum Limit,A rather non-standard quantum representation...,gr-qc,"Phys.Rev.D76:044016,2007","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",10.1103/PhysRevD.76.044016,gr-qc
4,704.0008,2009-02-05,Damian C. Swift,Numerical solution of shock and ramp compressi...,A general formulation was developed to repre...,cond-mat.mtrl-sci,"Journal of Applied Physics, vol 104, 073536 (2...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",10.1063/1.2975338,cond-mat


In [None]:
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 10}
df_train, df_test = train_test_split(papers, **split_options)

In [None]:
# Persist the two splits and the full table as tab-separated files
# (written in this order: train, test, full data).
for csv_name, frame in (('train.csv', df_train),
                        ('test.csv', df_test),
                        ('data.csv', papers)):
    frame.to_csv(csv_name, sep='\t')

In [None]:
# Reload the exported table to verify the round trip.
# `id` must be read as text: arXiv ids such as '0704.0001' look numeric, and
# with type inference pandas parses them as floats, which drops the leading
# zero and mangles the value (the ids came back as 704.0, 704.001, ...).
# Forcing a single dtype should also avoid the mixed-type warning this cell
# emitted, if `id` was the offending column (TODO confirm).
df2 = pd.read_csv("data.csv", sep="\t", dtype={'id': str})
df2.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,id,update_date,author,title,abstract,categories,journal,versions,doi,general_category
0,0,704.0,2008-11-26,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,"Phys.Rev.D76:013009,2007","[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",10.1103/PhysRevD.76.013009,hep-ph
1,1,704.0,2008-01-13,Hongjun Pan,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",,physics
2,2,704.0,2007-05-23,David Callan,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",,math
3,3,704.001,2008-11-26,"Alejandro Corichi, Tatjana Vukasinac and Jose ...",Polymer Quantum Mechanics and its Continuum Limit,A rather non-standard quantum representation...,gr-qc,"Phys.Rev.D76:044016,2007","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",10.1103/PhysRevD.76.044016,gr-qc
4,4,704.001,2009-02-05,Damian C. Swift,Numerical solution of shock and ramp compressi...,A general formulation was developed to repre...,cond-mat.mtrl-sci,"Journal of Applied Physics, vol 104, 073536 (2...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",10.1063/1.2975338,cond-mat
