In [12]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import max
from pyspark.sql.functions import col
from pyspark.sql.types import StructType,StructField, StringType

# Creating the SparkSession
spark = SparkSession.builder.appName('Case_Study_Adidas').master("local").getOrCreate()

# 1. Load the data

In [41]:
# Load JSON file into dataframe
path = 'C:\\BigData\\ol_cdump.json'
data_df = spark.read.option("multiline", "false").option("inferSchema", "true").option("header", "true").json(path)

# print(type(df))
# Printing the Schema
# df.printSchema()

# 2. Make sure your data set is cleaned enough, so we for example don't include in results with empty/null "titles".

In [49]:
# Filtering the data

df = data_df.filter(data_df.title.isNotNull()).filter(data_df.number_of_pages <'20').filter(data_df.publish_date <= '1990')

df.select("title","publish_date").show(10, truncate=False)

+---------------------------------------------------------------+------------+
|title                                                          |publish_date|
+---------------------------------------------------------------+------------+
|New directions for federal housing policy                      |1977        |
|Fine-line developer                                            |1975        |
|Effective listening and cognitive learning at the college level|1966        |
|A child's first total communication book                       |1974        |
|State employment opportunities for anthropologists             |1974        |
|Paintings by American masters                                  |1966        |
|The tale of the wee old woman                                  |1930        |
|Regents examinations in New York State after 100 years         |1965        |
|Picture books for creative thinking, a bibliography            |1974        |
|The exemption of minors from attendance            

In [None]:
df.filter((col("act_date") >= "2016-10-01") & (col("act_date") <= "2017-04-01"))

# 3.1 Select all "Harry Potter" books

In [4]:
# Select all "Harry Potter" books

df.filter(col("title").contains("Harry Potter")).select("title","publish_date").show(truncate=False)

+----------------------------------------+-------------+
|title                                   |publish_date |
+----------------------------------------+-------------+
|Harry Potter and the philosopher's stone|1998         |
|Harry Potter y la piedra filosofal      |2000         |
|The Science of Harry Potter             |June 23, 2003|
+----------------------------------------+-------------+



# 3.2 Get the book with the most pages

In [5]:
# To Get the book with the most pages

df.createOrReplaceTempView("records")
spark.sql("select number_of_pages,title,publish_date from(select *, dense_rank() over(order by number_of_pages desc)r from records) where r=1").show(truncate=False)

+---------------+-----------------------------+------------+
|number_of_pages|title                        |publish_date|
+---------------+-----------------------------+------------+
|48418          |Nihon shokuminchi kenchikuron|2008        |
+---------------+-----------------------------+------------+



In [6]:
# To Get the book with the most pages(No of Pages)

df.groupby().max('number_of_pages').collect()[0].asDict()['max(number_of_pages)']

48418

# 3.3 Find the Top 5 authors with most written books (assuming author in first position in the array, "key" field and each row is a different book)

In [52]:
# Find the Top 5 authors with most written books 
# (assuming author in first position in the array, "key" field and each row is a different book)

sql_df = spark.sql("select authors from(select *, dense_rank() over(order by number_of_pages desc)r from records) where r between 1 and 5")

print(sql_df.show(5, truncate=False))

+-----------------------------------+
|authors                            |
+-----------------------------------+
|[{null, /authors/OL5510271A, null}]|
|[{null, /authors/OL4442921A, null}]|
|[{null, /authors/OL607566A, null}] |
|[{null, /authors/OL130993A, null}] |
|[{null, /authors/OL726653A, null}] |
+-----------------------------------+

None


# 3.4 Find the Top 5 genres with most books

In [8]:
# Find the Top 5 genres with most books

df_geners = df.filter(data_df.genres.isNotNull())

df_geners.select("genres").show(3, truncate=False)
print(type(df_geners))

+------------------------+
|genres                  |
+------------------------+
|[Early works to 1800]   |
|[Outlines, syllabi, etc]|
|[Bibliography]          |
+------------------------+
only showing top 3 rows

<class 'pyspark.sql.dataframe.DataFrame'>


In [53]:
df_geners.createOrReplaceTempView("Sample1")

In [None]:

df_geners.groupby().max('genres').collect()[0].asDict()['max(genres)']

# 3.5 Get the avg. number of pages

In [None]:
# Get the avg. number of pages
df.groupby().avg('number_of_pages').collect()[0].asDict()['avg(number_of_pages)']

In [None]:
# Per publish year, get the number of authors that published at least one book
