### This notebook performs an Exploratory Data Analysis on the data from the papers

In [1]:
# Import packages
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.functions import lower, col, udf
import pyspark.sql.types as types

import spacy
nlp = spacy.load('en_core_web_sm')

import matplotlib as plt

### First we create a Spark Context  and a SQL Context

In [2]:
# checking if a previous Spark session is active
try:
    sc = SparkContext(appName="SDDM", master='local[*]')
    print("Created a SparkContext")
    sqlContext = SQLContext(sc)
    print("Created a SQLContext")
except ValueError:
    print("SparkContext already exists in this scope")

Created a SparkContext
Created a SQLContext


### Then we load the data into a SQLContext Dataframe

In [3]:
df = (sqlContext.read.format('csv').options(header='true').load('/data/s1847503/SDDM/data.csv'))

# Change all text to lowercase
for c in df.columns:
    if c is not 'paper_id':
        df = df.withColumn(c, lower(col(c)))
        
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|            paper_id|               title|        list_authors|            abstract|           full_text|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|d23c6a066a58dbe5e...|ly6e impairs coro...|['stephanie pfaen...|zoonotic coronavi...|"4 hepatoma cells...|
|618cd102ec5051a05...|mating strategy i...|['federica rosset...|adenoviruses are ...|across species, w...|
|ac81102667b0d56ed...|determination of ...|['v&apos;kovski p...|positive-sense rn...|"8 peptide of onl...|
|35ac99bfeb665ce0a...|φx174 attenuation...|['james t van leu...|natural selection...|"the unequal use ...|
|ad146e228bda4e5a3...|modelling the epi...|['marco claudio t...|                null|late december 201...|
|d418cdee18b07a0ad...|title: metagenomi...|['langelier', '; ...|commentary: lower...|"lower respirator...|
|1aa3e788fc6b03c14...|dark proteome o

### Then we analyze the distribution of the lengths of the papers

In [4]:
# Calculates the length of a paper in words
def text_length(text):
    return len(text.split(' '))

# Add a column with the paper lengths to the Dataframe
text_length_udf = udf( text_length, types.IntegerType() )
df_text = df.withColumn('text_length', text_length_udf(df.full_text)).select('paper_id', 'full_text', 'text_length')

# Keep only papers with a text length of greater than 10
# df_text = df_text.filter(df_text.text_length > 10)

print("The dataset consists of %d papers." % df_text.count())
print()
df_text.show()

The dataset consists of 29315 papers.

+--------------------+--------------------+-----------+
|            paper_id|           full_text|text_length|
+--------------------+--------------------+-----------+
|d23c6a066a58dbe5e...|"4 hepatoma cells...|        544|
|618cd102ec5051a05...|across species, w...|        832|
|ac81102667b0d56ed...|"8 peptide of onl...|        744|
|35ac99bfeb665ce0a...|"the unequal use ...|       3478|
|ad146e228bda4e5a3...|late december 201...|       1214|
|d418cdee18b07a0ad...|"lower respirator...|        767|
|1aa3e788fc6b03c14...|"world health org...|       7163|
|97e0efc17b5a10c75...|since december 20...|       2466|
|c954675ee859e2f7b...|"the recent outbr...|       1315|
|9896bc65559e6406d...|the rapid spread ...|       1150|
|c8559b6ddd6213bc9...|"accounting for s...|        209|
|73d80c8f5780d70bd...|coronaviruses (co...|       6638|
|b5b029e65b963ae09...|antibiotics are w...|       4212|
|6c91b00faa1614242...|in december 2019,...|        288|
|fcb76f09

In [7]:
# My idea is to create a boxplot with all paper lengths,
# so we can see how long the papers typically are.

lengths = df_text.select('text_length').collect().text_length

Py4JJavaError: An error occurred while calling o135.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 25 in stage 7.0 failed 1 times, most recent failure: Lost task 25.0 in stage 7.0 (TID 94, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 341, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-4-34fcb1a13392>", line 3, in text_length
AttributeError: 'NoneType' object has no attribute 'split'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 142, in dump_stream
    for obj in iterator:
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 341, in _batched
    for item in iterator:
  File "<string>", line 1, in <lambda>
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 85, in <lambda>
    return lambda *a: f(*a)
  File "/home/s1847503/miniconda3/envs/DLNN/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-4-34fcb1a13392>", line 3, in text_length
AttributeError: 'NoneType' object has no attribute 'split'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


### Create a bag-of-words representation of the papers

In [6]:
# Returns a list of words in a given document while removing punctuation, whitespaces and stop words
def lemmas(doc):
    return [word.lemma_ for word in doc if word.is_alpha and not (word.is_punct or word.is_space or word.is_stop)]


# Returns the words in a given document in a bag of words notation 
def bag_of_words(paper):
    bag = {}
    doc = nlp(paper)
    for word in lemmas(doc):
        if word in bag:
            bag[word] += 1
        else:
            bag[word] = 1
    return bag
bag_of_words_udf = udf( bag_of_words, types.MapType(types.StringType(), types.IntegerType()) )

df_bags = df_text.withColumn('bag_of_words', bag_of_words_udf(df_text.full_text)).select('paper_id', 'bag_of_words')
df_bags.show()

+--------------------+--------------------+
|            paper_id|        bag_of_words|
+--------------------+--------------------+
|d23c6a066a58dbe5e...|[pp -> 1, copyrig...|
|618cd102ec5051a05...|[half -> 1, dropl...|
|ac81102667b0d56ed...|[cluster -> 3, gl...|
|35ac99bfeb665ce0a...|[extent -> 1, bad...|
|ad146e228bda4e5a3...|[equivalent -> 1,...|
|d418cdee18b07a0ad...|[spurious -> 1, r...|
|1aa3e788fc6b03c14...|[reuse -> 18, opt...|
|97e0efc17b5a10c75...|[markedly -> 1, r...|
|c954675ee859e2f7b...|[reason -> 1, sma...|
|9896bc65559e6406d...|[farth -> 1, zhen...|
|c8559b6ddd6213bc9...|[select -> 1, ede...|
|73d80c8f5780d70bd...|[exception -> 2, ...|
|b5b029e65b963ae09...|[alginate -> 1, e...|
|6c91b00faa1614242...|[copyright -> 1, ...|
|fcb76f0907f67a850...|[half -> 1, depen...|
|ffbd7555a33770623...|[unidentified -> ...|
|7852aafdfb9e59e6a...|[reuse -> 6, fit ...|
|7677310b4e43cfaff...|[year -> 1, covs ...|
|26cb6703ca72bf978...|[extent -> 1, fig...|
|4b5ba0d8c476c899a...|[cluster -

### Don't forget to close the Spark Context when you are done!

In [None]:
sc.stop()