In [11]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import max
from pyspark.sql.functions import col
from pyspark.sql.types import StructType,StructField, StringType

# Creating the SparkSession

spark = SparkSession.builder.appName('Case_Study_Adidas').master("local").getOrCreate()

In [12]:
# Load JSON file into dataframe

path = 'C:\\BigData\\ol_cdump.json'
data_df = spark.read.option("multiline", "false").json(path)

# print(type(df))
# Printing the Schema
# df.printSchema()

In [13]:
# Filtering the data where the title is null

df = data_df.filter(data_df.title.isNotNull())
df.select("title","publish_date").show(truncate=False)

+-----------------------------------------------------------------------------------------------------+------------+
|title                                                                                                |publish_date|
+-----------------------------------------------------------------------------------------------------+------------+
|The effect of differentiated marking tools and motivational treatment on figural creativity          |1975        |
|Comparison of the nominal grouping and sequenced brainstorming techniques of creative idea generation|1976        |
|Professional accident investigation                                                                  |1977        |
|I chauceriani scozzesi                                                                               |1964        |
|Lezioni zurighesi sul Petrarca                                                                       |1955        |
|El saber ginecologico del padre Feijoo                         

In [14]:
# Select all "Harry Potter" books

df.filter(col("title").contains("Harry Potter")).select("title","publish_date").show(truncate=False)

+----------------------------------------+-------------+
|title                                   |publish_date |
+----------------------------------------+-------------+
|Harry Potter and the philosopher's stone|1998         |
|Harry Potter y la piedra filosofal      |2000         |
|The Science of Harry Potter             |June 23, 2003|
+----------------------------------------+-------------+



In [15]:
# To Get the book with the most pages

df.createOrReplaceTempView("records")
spark.sql("select number_of_pages,title from(select *, dense_rank() over(order by number_of_pages desc)r from records) where r=1").show(truncate=False)

+---------------+-----------------------------+
|number_of_pages|title                        |
+---------------+-----------------------------+
|48418          |Nihon shokuminchi kenchikuron|
+---------------+-----------------------------+



In [8]:
# To Get the book with the most pages

df.groupby().max('number_of_pages').collect()[0].asDict()['max(number_of_pages)']

48418

In [18]:
# Find the Top 5 authors with most written books (assuming author in first position in the array, "key" field and each row is a different book)

sql_df =spark.sql("select authors,notes from(select *, dense_rank() over(order by number_of_pages desc)r from records) where r between 1 and 5")
print(sql_df.show(truncate=False))

+-----------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|authors                            |notes                                                                                                                     |
+-----------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|[{null, /authors/OL5510271A, null}]|{"type":"/type/text","value":"Includes bibliographical references and index."}                                            |
|[{null, /authors/OL4442921A, null}]|{"type":"/type/text","value":"'The second volume of my Rambles and researches among  Worcestershire churches' - preface."}|
|[{null, /authors/OL607566A, null}] |null                                                                                                                      |
|[{null, /authors/OL130993A, null}

In [None]:
# Find the Top 5 genres with most books

In [17]:
# Get the avg. number of pages
df.groupby().avg('number_of_pages').collect()[0].asDict()['avg(number_of_pages)']

222.65999078989668

In [10]:
# Per publish year, get the number of authors that published at least one book
