In [1]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession,Row
from pyspark.sql.functions import max
from pyspark.sql.functions import col
from pyspark.sql.functions import *
from pyspark.sql.types import StructType,StructField, StringType

# Creating the SparkSession
spark = SparkSession.builder.appName('Case_Study_Adidas').master("local").getOrCreate()

# 1. Load the data

In [2]:
# Load JSON file into dataframe

path = 'C:\\BigData\\ol_cdump.json'
data_df = spark.read.option("multiline", "false").option("inferSchema", "true").option("header", "true").json(path)

# Printing the Schema
# df.printSchema()

# 2. Make sure your data set is cleaned enough, so we for example don't include in results with empty/null "titles" and/or "number of pages" is greater than 20 and "publishing year" is after 1950. 

In [3]:
# Filtering the data

# df = data_df.filter(data_df.title.isNotNull()).filter(data_df.number_of_pages > '20').filter(data_df.publish_date > '1950')

df = data_df.filter((data_df.title.isNotNull()) & (data_df.number_of_pages > '20') & (data_df.publish_date > '1950'))
df.select("title","number_of_pages","publish_date").distinct().show(10,truncate=False)

+------------------------------------------------------------------------------------+---------------+----------------+
|title                                                                               |number_of_pages|publish_date    |
+------------------------------------------------------------------------------------+---------------+----------------+
|Hidden & Dangerous 2                                                                |240            |October 28, 2003|
|Facts of life and death                                                             |33             |1970            |
|The effect of two types of verbal hierarchy on problem solving                      |48             |1968            |
|American broadcasting                                                               |778            |1970            |
|Do you know your economic ABC's?                                                    |34             |1963            |
|The fundamentals of photoengraving     

# 3.1 Select all "Harry Potter" books

In [4]:
# Select all "Harry Potter" books

df.select("*").filter(col("title").contains("Harry Potter")).select("title","publish_date").show(truncate=False)

# df.select("*").where(lower(df.title).contains("harry potter")).select("title","publish_date").show(5,False)
# df.select("*").where(upper(df.title).contains("HARRY POTTER")).select("title","publish_date").show(5,False)

+----------------------------------------+-------------+
|title                                   |publish_date |
+----------------------------------------+-------------+
|Harry Potter and the philosopher's stone|1998         |
|Harry Potter y la piedra filosofal      |2000         |
|The Science of Harry Potter             |June 23, 2003|
+----------------------------------------+-------------+



# 3.2 Get the book with the most pages

In [5]:
# To Get the book with the most pages

df.createOrReplaceTempView("records")
spark.sql("select number_of_pages,title,publish_date from(select *, dense_rank() over(order by number_of_pages desc)r from records) where r=1").show(truncate=False)

+---------------+-----------------------------+------------+
|number_of_pages|title                        |publish_date|
+---------------+-----------------------------+------------+
|48418          |Nihon shokuminchi kenchikuron|2008        |
+---------------+-----------------------------+------------+



In [6]:
# df.groupby().max('number_of_pages').collect()[0].asDict()['max(number_of_pages)']

pages = df.select(max('number_of_pages').alias("number"))
df.join(pages, df.number_of_pages == pages.number).select("number_of_pages","title").show(5,False)

+---------------+-----------------------------+
|number_of_pages|title                        |
+---------------+-----------------------------+
|48418          |Nihon shokuminchi kenchikuron|
+---------------+-----------------------------+



# 3.3 Find the Top 5 authors with most written books (assuming author in first position in the array, "key" field and each row is a different book)

In [7]:
author = df.selectExpr('explode(authors.key) as author_key').groupBy("author_key").count().sort(desc("count")).show(5,False)

+-------------------+-----+
|author_key         |count|
+-------------------+-----+
|/authors/OL1224818A|236  |
|/authors/OL4283462A|116  |
|/authors/OL785848A |106  |
|/authors/OL539875A |90   |
|/authors/OL1926829A|80   |
+-------------------+-----+
only showing top 5 rows



# 3.4 Find the Top 5 genres with most books

In [8]:
# Find the Top 5 genres with most books

df_geners = df.filter(data_df.genres.isNotNull())
genres = df_geners.select(explode(df_geners.genres).alias("explode_genres")).groupBy("explode_genres").count().sort(desc("count")).show(5,False)

+--------------------+-----+
|explode_genres      |count|
+--------------------+-----+
|Fiction.            |3302 |
|Biography.          |2360 |
|Juvenile literature.|1538 |
|Exhibitions.        |836  |
|Juvenile fiction.   |525  |
+--------------------+-----+
only showing top 5 rows



# 3.5 Get the avg. number of pages

In [9]:
# Get the avg. number of pages

df.groupby().avg('number_of_pages').collect()[0].asDict()['avg(number_of_pages)']
# df.groupby().avg('number_of_pages').show()

231.90327568877092

In [10]:
# Get the avg. number of pages
df.groupby().avg('number_of_pages').show()

+--------------------+
|avg(number_of_pages)|
+--------------------+
|  231.90327568877092|
+--------------------+



# 3.6 Per publish year, get the number of authors that published at least one book

In [11]:
# Per publish year, get the number of authors that published at least one book
df_pub_year = df.filter(df.publish_date.isNotNull()).filter(df.publish_date != '')

df_pub_year.createOrReplaceTempView("sample")
spark.sql("select year(publish_date) as publish_year,count(authors) from sample group by publish_year having count(authors) >= 1 order by publish_year asc ").show(80,False)

+------------+--------------+
|publish_year|count(authors)|
+------------+--------------+
|null        |2062          |
|1951        |676           |
|1952        |633           |
|1953        |622           |
|1954        |666           |
|1955        |611           |
|1956        |664           |
|1957        |740           |
|1958        |789           |
|1959        |879           |
|1960        |961           |
|1961        |1074          |
|1962        |1128          |
|1963        |1182          |
|1964        |1268          |
|1965        |1347          |
|1966        |1298          |
|1967        |1323          |
|1968        |1159          |
|1969        |1284          |
|1970        |1227          |
|1971        |1136          |
|1972        |1081          |
|1973        |1218          |
|1974        |1200          |
|1975        |1050          |
|1976        |962           |
|1977        |755           |
|1978        |789           |
|1979        |849           |
|1980     