In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import max
from pyspark.sql.functions import col
from pyspark.sql.types import StructType,StructField, StringType

# Creating the SparkSession
spark = SparkSession.builder.appName('Case_Study_Adidas').master("local").getOrCreate()

# 1. Load the data

In [2]:
# Load JSON file into dataframe

path = 'C:\\BigData\\ol_cdump.json'
data_df = spark.read.option("multiline", "false").option("inferSchema", "true").option("header", "true").json(path)

# Printing the Schema
# df.printSchema()

# 2. Make sure your data set is cleaned enough, so we for example don't include in results with empty/null "titles" and/or "number of pages" is greater than 20 and "publishing year" is after 1950. 

In [3]:
# Filtering the data

#df = data_df.filter(data_df.title.isNotNull()).filter(data_df.number_of_pages > '20').filter(data_df.publish_date > '1950')

df = data_df.filter((data_df.title.isNotNull()) & (data_df.number_of_pages > '20') & (data_df.publish_date > '1950'))

df.select("title","number_of_pages","publish_date").distinct().show(10,truncate=False)

+------------------------------------------------------------------------------------+---------------+----------------+
|title                                                                               |number_of_pages|publish_date    |
+------------------------------------------------------------------------------------+---------------+----------------+
|Hidden & Dangerous 2                                                                |240            |October 28, 2003|
|Facts of life and death                                                             |33             |1970            |
|The effect of two types of verbal hierarchy on problem solving                      |48             |1968            |
|American broadcasting                                                               |778            |1970            |
|Do you know your economic ABC's?                                                    |34             |1963            |
|The fundamentals of photoengraving     

# 3.1 Select all "Harry Potter" books

In [4]:
# Select all "Harry Potter" books

df.filter(col("title").contains("Harry Potter")).select("title","publish_date").show(truncate=False)

+----------------------------------------+-------------+
|title                                   |publish_date |
+----------------------------------------+-------------+
|Harry Potter and the philosopher's stone|1998         |
|Harry Potter y la piedra filosofal      |2000         |
|The Science of Harry Potter             |June 23, 2003|
+----------------------------------------+-------------+



# 3.2 Get the book with the most pages

In [5]:
# To Get the book with the most pages

df.createOrReplaceTempView("records")
spark.sql("select number_of_pages,title,publish_date from(select *, dense_rank() over(order by number_of_pages desc)r from records) where r=1").show(truncate=False)

+---------------+-----------------------------+------------+
|number_of_pages|title                        |publish_date|
+---------------+-----------------------------+------------+
|48418          |Nihon shokuminchi kenchikuron|2008        |
+---------------+-----------------------------+------------+



In [9]:
# To Get the book with the most pages(No of Pages)

df.groupby().max('number_of_pages').collect()[0].asDict()['max(number_of_pages)']

48418

# 3.3 Find the Top 5 authors with most written books (assuming author in first position in the array, "key" field and each row is a different book)

In [54]:
df.select("*").groupby("authors").max('number_of_pages').distinct().show(5,truncate=False)

+-----------------------------------+--------------------+
|authors                            |max(number_of_pages)|
+-----------------------------------+--------------------+
|[{null, /authors/OL4670954A, null}]|146                 |
|[{null, /authors/OL6536041A, null}]|650                 |
|[{null, /authors/OL6536108A, null}]|239                 |
|[{null, /authors/OL6286129A, null}]|160                 |
|[{null, /authors/OL1149330A, null}]|176                 |
+-----------------------------------+--------------------+
only showing top 5 rows



In [26]:
sql_df = spark.sql("select authors from(select *, dense_rank() over(order by number_of_pages desc)r from records) where r between 1 and 5")

print(sql_df.show(5, truncate=False))

+-----------------------------------+
|authors                            |
+-----------------------------------+
|[{null, /authors/OL5510271A, null}]|
|[{null, /authors/OL607566A, null}] |
|[{null, /authors/OL726653A, null}] |
|[{null, /authors/OL3455944A, null}]|
|[{null, /authors/OL3455944A, null}]|
+-----------------------------------+

None


# 3.4 Find the Top 5 genres with most books

In [35]:
# Find the Top 5 genres with most books

df_geners = df.filter(data_df.genres.isNotNull())

df_geners.select("genres").show(5, truncate=False)

+-------------------------------+
|genres                         |
+-------------------------------+
|[Early works to 1800]          |
|[Outlines, syllabi, etc]       |
|[Conversation and phrase books]|
|[Texts]                        |
|[Readers]                      |
+-------------------------------+
only showing top 5 rows



In [74]:
df_geners.createOrReplaceTempView("sample")

# spark.sql("select geners,count(*) from sample group by genres").show(5,truncate=False)

AnalysisException: cannot resolve '`geners`' given input columns: [sample.alternate_names, sample.authors, sample.bio, sample.birth_date, sample.by_statement, sample.contributions, sample.contributors, sample.copyright_date, sample.covers, sample.created, sample.death_date, sample.description, sample.dewey_decimal_class, sample.dewey_number, sample.download_url, sample.edition_name, sample.excerpts, sample.first_publish_date, sample.first_sentence, sample.full_title, sample.fuller_name, sample.genres, sample.ia_box_id, sample.ia_loaded_id, sample.identifiers, sample.isbn_10, sample.isbn_13, sample.isbn_invalid, sample.isbn_odd_length, sample.key, sample.languages, sample.last_modified, sample.latest_revision, sample.lc_classifications, sample.lccn, sample.links, sample.location, sample.name, sample.notes, sample.number_of_pages, sample.ocaid, sample.oclc_number, sample.oclc_numbers, sample.other_titles, sample.pagination, sample.personal_name, sample.photos, sample.physical_dimensions, sample.physical_format, sample.publish_country, sample.publish_date, sample.publish_places, sample.publishers, sample.purchase_url, sample.revision, sample.series, sample.source_records, sample.subject_people, sample.subject_place, sample.subject_places, sample.subject_time, sample.subject_times, sample.subjects, sample.subtitle, sample.table_of_contents, sample.title, sample.title_prefix, sample.type, sample.uri_descriptions, sample.uris, sample.url, sample.website, sample.weight, sample.work_title, sample.work_titles, sample.works]; line 1 pos 7;
'Aggregate [genres#28], ['geners, count(1) AS count(1)#5330L]
+- SubqueryAlias sample
   +- Filter isnotnull(genres#28)
      +- Filter ((isnotnull(title#72) AND (number_of_pages#46L > cast(20 as bigint))) AND (publish_date#57 > 1950))
         +- Relation[alternate_names#7,authors#8,bio#9,birth_date#10,by_statement#11,contributions#12,contributors#13,copyright_date#14,covers#15,created#16,death_date#17,description#18,dewey_decimal_class#19,dewey_number#20,download_url#21,edition_name#22,excerpts#23,first_publish_date#24,first_sentence#25,full_title#26,fuller_name#27,genres#28,ia_box_id#29,ia_loaded_id#30,... 52 more fields] json


In [33]:
df_geners.select("*").groupby("genres").sum().distinct().show(5,truncate=False)

+-------------------------------------------------+--------------------+--------------------+-------------+
|genres                                           |sum(latest_revision)|sum(number_of_pages)|sum(revision)|
+-------------------------------------------------+--------------------+--------------------+-------------+
|[Biography, Personal narratives, German]         |2                   |288                 |2            |
|[Catalogs., Guidebooks.]                         |2                   |136                 |2            |
|[Review.]                                        |2                   |253                 |2            |
|[Registers., Biography., Genealogy.]             |2                   |368                 |2            |
|[Directories, Juvenile literature., Directories.]|4                   |573                 |4            |
+-------------------------------------------------+--------------------+--------------------+-------------+
only showing top 5 rows



# 3.5 Get the avg. number of pages

In [55]:
# Get the avg. number of pages
df.groupby().avg('number_of_pages').collect()[0].asDict()['avg(number_of_pages)']

231.90327568877092

In [56]:
df.groupby().avg('number_of_pages').show()

+--------------------+
|avg(number_of_pages)|
+--------------------+
|  231.90327568877092|
+--------------------+



In [72]:
# Per publish year, get the number of authors that published at least one book

# publishers
df.createOrReplaceTempView("records")
spark.sql("select to_date(publish_date) as date from records").show(10)

AnalysisException: cannot resolve '`publish_date`' given input columns: [records.t]; line 1 pos 15;
'Project ['to_date('publish_date) AS date#5328]
+- SubqueryAlias records
   +- LogicalRDD [t#5237], false


In [67]:
spark.sql("select to_date('02-03-2013','MM-dd-yyyy') date") \
     .show()

+----------+
|      date|
+----------+
|2013-02-03|
+----------+



Py4JJavaError: An error occurred while calling o421.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 73.0 failed 1 times, most recent failure: Lost task 0.0 in stage 73.0 (TID 1099) (DESKTOP-LO74VMK executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:182)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
	at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
	at java.net.ServerSocket.implAccept(ServerSocket.java:545)
	at java.net.ServerSocket.accept(ServerSocket.java:513)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:174)
	... 29 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2261)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:390)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3519)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3516)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:182)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
	at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
	at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
	at java.net.ServerSocket.implAccept(ServerSocket.java:545)
	at java.net.ServerSocket.accept(ServerSocket.java:513)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:174)
	... 29 more
