# **Arxiv metadata Analytics with PySpark RDD: JSON case study**

In [1]:
########## ONLY in Colab ##########
!pip3 install pyspark
########## ONLY in Colab ##########

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=228170ca7d578d60a170f28ce925f42d12fd6e38f4195837267f260b954b3eb7
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
########## ONLY in Ubuntu Machine ##########
# Load Spark engine
!pip3 install -q findspark
import findspark
findspark.init()
########## ONLY in Ubuntu Machine ##########

In [4]:
# Initializing Spark
from pyspark import SparkContext, SparkConf

conf1 = SparkConf().setAppName("Archive_PySpark").setMaster("local[*]")

sc = SparkContext(conf = conf1)

print(sc)
print("hello world")

<SparkContext master=local[*] appName=Archive_PySpark>
hello world


In [5]:
import zipfile

!unzip /content/archive.zip

unzip:  cannot find or open /content/archive.zip, /content/archive.zip.zip or /content/archive.zip.ZIP.


In [6]:
# Read and Load Data to Spark
# Data source: https://www.kaggle.com/Cornell-University/arxiv/version/62

import json

rdd_json = sc.textFile("/content/drive/MyDrive/Colab Notebooks/arxiv-metadata-oai-snapshot.json", 100)
rdd = rdd_json.map(lambda x: json.loads(x))

#usually if we write an action, after running it will first check the transformation then run it which is not good for time processing,
#so Persist() save data in the memory with the transformation applied. (cache)

rdd.persist()

PythonRDD[2] at RDD at PythonRDD.scala:53

In [7]:
# Check the number of parallelism and partitions:


#cores
print(sc.defaultParallelism)

#partitions
#print(rdd.getNumPartitions())


2


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Question 1: Count elements

In [9]:
rdd.count()

2011231

## Question 2: Get the first two records


In [10]:
rdd.take(2)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

## Question 3: Get all attributes


In [None]:
rdd.flatMap(lambda x: x.keys()).distinct().collect()

['authors',
 'comments',
 'title',
 'id',
 'journal-ref',
 'versions',
 'submitter',
 'categories',
 'update_date',
 'authors_parsed',
 'report-no',
 'license',
 'abstract',
 'doi']

## Question 4: Get the name of the licenses

In [None]:
rdd.map(lambda x: x["license"]).distinct().collect()

[None,
 'http://creativecommons.org/licenses/publicdomain/',
 'http://creativecommons.org/licenses/by-nc-nd/4.0/',
 'http://creativecommons.org/licenses/by-nc-sa/4.0/',
 'http://creativecommons.org/licenses/by-nc-sa/3.0/',
 'http://creativecommons.org/licenses/by/3.0/',
 'http://creativecommons.org/licenses/by/4.0/',
 'http://creativecommons.org/publicdomain/zero/1.0/',
 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/',
 'http://creativecommons.org/licenses/by-sa/4.0/']

## Question 5: Get the shortest and the longest titles

In [None]:
shortest_title = rdd.map(lambda x: x["title"]).reduce(lambda x,y: x if x<y else y)
longest_title = rdd.map(lambda x: x["title"]).reduce(lambda x,y: x if x>y else y)


print("the shortest title : "+shortest_title)
print("the longest title : "+longest_title)


the shortest title : !-Graphs with Trivial Overlap are Context-Free
the longest title : Weyl formula for the negative dissipative eigenvalues of Maxwell's
  equations


## Question 6: Find abbreviations with 5 or more letters in the abstract

In [None]:
import re

def abbreviations(line):

  result = re.search(r"\(([A-Za-z][^_ /\\<>]{5,})\)", line)
  if result:
     return result.group(1) # return 1st match. group (0) will return all the matches


rdd.filter(lambda x: abbreviations(x["abstract"])).count()

192721

## Question 7: Get the number of archive records per month ('update_date' attribute)

In [None]:
import datetime

def extract_date(DateIn):
  #strptime : change a string to date format with certain parameters
  d= datetime.datetime.strptime(DateIn, "%Y-%m-%d")
  return d.month

date = "1999-04-16"
extract_date(date)

4

In [None]:
rdd.map(lambda x: (exctract_date(x["update_time"]), 1)).reduceByKey(lambda x,y: x+y).sortBy(lambda l: l[1]).collect()




## Question 8: Get the average number of pages

In [11]:
import re

In [19]:

def get_page(line):
  search= re.findall("\d+ pages", line)
  if search:
    return int(search[0].split(" ")[0]) #return for example "12 pages" then we take the first one by split " "
  else:
    return 0 #rows that have no pages


get_page("15 pages")





15

In [25]:
rdd_avg = rdd.map(lambda x: get_page(x["comments"] if x['comments'] != None else "None") )
#remove null values
new__rdd = rdd_avg.filter(lambda x:  x!=0)

count = new__rdd.count()

avg = new__rdd.reduce(lambda x,y: (x+y))

print(avg/count)


17.85319004286046
