In [3]:
import os
import sys
# Point this kernel at the local Spark distribution and put the Python
# libraries bundled with it (py4j first, then pyspark) at the front of the
# import path, so the pyspark imports in the next cell resolve against them.
os.environ['SPARK_HOME'] = 'D:/spark330hdp3sc3'
os.environ['PYLIB'] = '{}/python/lib'.format(os.environ['SPARK_HOME'])
sys.path[0:0] = [
    os.environ['PYLIB'] + '/py4j-0.10.9.5-src.zip',
    os.environ['PYLIB'] + '/pyspark.zip',
]

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import numpy as np

In [5]:
# Build (or reuse) the Hive-enabled SparkSession used by every later cell.
# The warehouse location is configured through Spark's documented key
# `spark.sql.warehouse.dir`; the previously used `spark.warehouse.dir` is not
# a documented Spark setting, so the intended directory was never applied.
spark = (
    SparkSession.builder
    .appName('SparkTransformations')
    .config('spark.sql.warehouse.dir', 'D:/tmp')
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# `spark` now refers to the SparkSession created by the builder call above;
# display its version as a quick check that the session is usable.
spark.version

'3.3.0'

In [7]:
# Keep a handle on the session's SparkContext; later cells call sc.parallelize().
sc = spark.sparkContext

In [8]:
# Version reported by the SparkContext behind the session.
sc.version

'3.3.0'

In [9]:
# Two sample movie records, each kept as a raw JSON string. Every record has
# scalar fields (title, country, year), a nested object (cast), an array
# (genres) and an object of numeric ratings (ratings).
# NOTE(review): "Biograhpy" looks like a typo for "Biography"; it is left
# unchanged because the captured outputs in this notebook show the same value.
mlist = [
    """{"title": "Dangal", "country": "India", "year": 2016,
"cast": {"Director": "Nitesh Tiwari", "LeadActor": "Aamir Khan", "BoxOfficeUSDMn": 330},
"genres": ["Biograhpy", "Drama"], "ratings": {"imdb": 8.4,"tomatoes": 4.55}}""", 
"""{"title": "Fight Club", "country": "USA", "year": 1999, 
"cast": {"Director": "David Fincher", "LeadActor": "Brad Pitt", "BoxOfficeUSDMn": 104}, 
"genres": ["Action", "Drama"], "ratings": {"imdb": 8.8,"tomatoes": 4.46}}"""
]

In [10]:
# Distribute the JSON strings as an RDD and let Spark infer the schema.
mrdd = sc.parallelize(mlist)
movieJSONDF = spark.read.json(mrdd)
# Show both rows without truncating the cell contents.
movieJSONDF.show(n=2, truncate=False)

+--------------------------------+-------+------------------+-----------+----------+----+
|cast                            |country|genres            |ratings    |title     |year|
+--------------------------------+-------+------------------+-----------+----------+----+
|{330, Nitesh Tiwari, Aamir Khan}|India  |[Biograhpy, Drama]|{8.4, 4.55}|Dangal    |2016|
|{104, David Fincher, Brad Pitt} |USA    |[Action, Drama]   |{8.8, 4.46}|Fight Club|1999|
+--------------------------------+-------+------------------+-----------+----------+----+



In [12]:
# Explicit schema for the movie records: `cast` is a struct, `genres` an array
# of strings and `ratings` a string -> double map.
cmplxSchema = (
    StructType()
    .add("title", StringType(), True)
    .add("country", StringType(), True)
    .add("year", IntegerType(), True)
    .add(
        "cast",
        StructType()
        .add("Director", StringType(), True)
        .add("LeadActor", StringType(), True)
        .add("BoxOfficeUSDMn", DoubleType(), True),
        True,
    )
    .add("genres", ArrayType(StringType(), True), True)
    .add("ratings", MapType(StringType(), DoubleType()))
)


In [13]:
# Re-read the same RDD, this time applying the explicit schema instead of
# relying on inference.
movieJSONWithSchemaDF = (
    spark.read
    .schema(cmplxSchema)
    .json(mrdd)
)
movieJSONWithSchemaDF.show(n=2, truncate=False)
movieJSONWithSchemaDF.printSchema()

+----------+-------+----+----------------------------------+------------------+-------------------------------+
|title     |country|year|cast                              |genres            |ratings                        |
+----------+-------+----+----------------------------------+------------------+-------------------------------+
|Dangal    |India  |2016|{Nitesh Tiwari, Aamir Khan, 330.0}|[Biograhpy, Drama]|{imdb -> 8.4, tomatoes -> 4.55}|
|Fight Club|USA    |1999|{David Fincher, Brad Pitt, 104.0} |[Action, Drama]   |{imdb -> 8.8, tomatoes -> 4.46}|
+----------+-------+----+----------------------------------+------------------+-------------------------------+

root
 |-- title: string (nullable = true)
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- cast: struct (nullable = true)
 |    |-- Director: string (nullable = true)
 |    |-- LeadActor: string (nullable = true)
 |    |-- BoxOfficeUSDMn: double (nullable = true)
 |-- genres: array (nullable = 

In [18]:
# Querying complex types: array elements are addressed by position.
movieJSONWithSchemaDF.select(
    col("genres").getItem(0),
    col("genres").getItem(1),
).show()

+---------+---------+
|genres[0]|genres[1]|
+---------+---------+
|Biograhpy|    Drama|
|   Action|    Drama|
+---------+---------+



In [17]:
# Struct fields are reached with dot notation.
movieJSONWithSchemaDF.select(
    col("cast.Director"),
    col("cast.LeadActor"),
).show()

+-------------+----------+
|     Director| LeadActor|
+-------------+----------+
|Nitesh Tiwari|Aamir Khan|
|David Fincher| Brad Pitt|
+-------------+----------+



In [19]:
# Map entries can also be reached with dot notation.
movieJSONWithSchemaDF.select(
    col("ratings.imdb"),
    col("ratings.tomatoes"),
).show()

+----+--------+
|imdb|tomatoes|
+----+--------+
| 8.4|    4.55|
| 8.8|    4.46|
+----+--------+



In [20]:
# Map entries looked up by key.
movieJSONWithSchemaDF.select(
    col("ratings").getItem("imdb"),
    col("ratings").getItem("tomatoes"),
).show()

+-------------+-----------------+
|ratings[imdb]|ratings[tomatoes]|
+-------------+-----------------+
|          8.4|             4.55|
|          8.8|             4.46|
+-------------+-----------------+



In [21]:
# Filter rows on a field nested inside a complex type.
movieJSONWithSchemaDF.where("cast.LeadActor = 'Aamir Khan'").show(n=2, truncate=False)

+------+-------+----+----------------------------------+------------------+-------------------------------+
|title |country|year|cast                              |genres            |ratings                        |
+------+-------+----+----------------------------------+------------------+-------------------------------+
|Dangal|India  |2016|{Nitesh Tiwari, Aamir Khan, 330.0}|[Biograhpy, Drama]|{imdb -> 8.4, tomatoes -> 4.55}|
+------+-------+----+----------------------------------+------------------+-------------------------------+



In [22]:
# Nested fields work as grouping keys and as aggregation inputs alike.
# `sum` here is pyspark.sql.functions.sum (pulled in by the wildcard import),
# not the Python builtin.
movieJSONWithSchemaDF.groupBy("cast.LeadActor").agg(
    sum("ratings.imdb").alias("imdbtotal"),
    sum("ratings.tomatoes").alias("tomtot"),
).show()

+----------+---------+------+
| LeadActor|imdbtotal|tomtot|
+----------+---------+------+
|Aamir Khan|      8.4|  4.55|
| Brad Pitt|      8.8|  4.46|
+----------+---------+------+



In [23]:
# explode() flattens an array (or map) column into one output row per element.
movieJSONWithSchemaDF.select(explode("genres")).show()

+---------+
|      col|
+---------+
|Biograhpy|
|    Drama|
|   Action|
|    Drama|
+---------+



In [24]:
# A single select clause may contain only one expansion (explode); the line below raises an error if uncommented.
# movieJSONWithSchemaDF.select(explode(col("genres")), explode(col("ratings"))).show(2, False)

In [25]:
# Multiple expansions have to be applied one select at a time: first explode
# the genres array, then explode the ratings map of the intermediate result.
(
    movieJSONWithSchemaDF
    .select(explode("genres"), "*")
    .select(explode("ratings"), "*")
    .show(n=2, truncate=False)
)

+--------+-----+---------+------+-------+----+----------------------------------+------------------+-------------------------------+
|key     |value|col      |title |country|year|cast                              |genres            |ratings                        |
+--------+-----+---------+------+-------+----+----------------------------------+------------------+-------------------------------+
|imdb    |8.4  |Biograhpy|Dangal|India  |2016|{Nitesh Tiwari, Aamir Khan, 330.0}|[Biograhpy, Drama]|{imdb -> 8.4, tomatoes -> 4.55}|
|tomatoes|4.55 |Biograhpy|Dangal|India  |2016|{Nitesh Tiwari, Aamir Khan, 330.0}|[Biograhpy, Drama]|{imdb -> 8.4, tomatoes -> 4.55}|
+--------+-----+---------+------+-------+----+----------------------------------+------------------+-------------------------------+
only showing top 2 rows



In [27]:
# SQL API for querying complex types: register the DataFrame as a temp view first.
movieJSONWithSchemaDF.createOrReplaceTempView("mvcmplxtbl")
# Struct fields (cast.*) and map entries (ratings.*) are both reached with dot
# notation; explode(genres) yields one output row per genre, and * appends all
# of the view's original columns.
spark.sql('''
select cast.director, cast.leadactor, ratings.imdb, ratings.tomatoes, explode(genres), *  
from mvcmplxtbl''').show(2, False)

+-------------+----------+----+--------+---------+------+-------+----+----------------------------------+------------------+-------------------------------+
|director     |leadactor |imdb|tomatoes|col      |title |country|year|cast                              |genres            |ratings                        |
+-------------+----------+----+--------+---------+------+-------+----+----------------------------------+------------------+-------------------------------+
|Nitesh Tiwari|Aamir Khan|8.4 |4.55    |Biograhpy|Dangal|India  |2016|{Nitesh Tiwari, Aamir Khan, 330.0}|[Biograhpy, Drama]|{imdb -> 8.4, tomatoes -> 4.55}|
|Nitesh Tiwari|Aamir Khan|8.4 |4.55    |Drama    |Dangal|India  |2016|{Nitesh Tiwari, Aamir Khan, 330.0}|[Biograhpy, Drama]|{imdb -> 8.4, tomatoes -> 4.55}|
+-------------+----------+----+--------+---------+------+-------+----+----------------------------------+------------------+-------------------------------+
only showing top 2 rows



In [28]:
# Same per-lead-actor rating totals as the DataFrame groupBy/agg cell,
# expressed in SQL against the mvcmplxtbl temp view.
spark.sql('''
          select cast.leadactor, sum(ratings.imdb) as imdbtot,
          sum(ratings.tomatoes) as tomtot 
          from mvcmplxtbl group by cast.leadactor
          ''').show(2, False)

+----------+-------+------+
|leadactor |imdbtot|tomtot|
+----------+-------+------+
|Aamir Khan|8.4    |4.55  |
|Brad Pitt |8.8    |4.46  |
+----------+-------+------+



In [29]:
# The get_json_object, from_json and to_json functions are used below to work with RDDs of JSON strings.
# Build mjsonDF: a single-column DataFrame ("mjson") holding the raw JSON strings.
# Each string is wrapped in a 1-tuple so that toDF() treats it as a one-field row.
mjsonDF = (
    sc.parallelize(mlist)
    .map(lambda jsonText: (jsonText,))
    .toDF(["mjson"])
)

In [30]:
# get_json_object() navigates a JSON string by path: first the whole `cast`
# object, then a single field inside it.
mjsonDF.select(get_json_object("mjson", "$.cast")).show(n=2, truncate=False)
mjsonDF.select(get_json_object("mjson", "$.cast.Director")).show(n=2, truncate=False)

+--------------------------------------------------------------------------+
|get_json_object(mjson, $.cast)                                            |
+--------------------------------------------------------------------------+
|{"Director":"Nitesh Tiwari","LeadActor":"Aamir Khan","BoxOfficeUSDMn":330}|
|{"Director":"David Fincher","LeadActor":"Brad Pitt","BoxOfficeUSDMn":104} |
+--------------------------------------------------------------------------+

+---------------------------------------+
|get_json_object(mjson, $.cast.Director)|
+---------------------------------------+
|Nitesh Tiwari                          |
|David Fincher                          |
+---------------------------------------+



In [31]:
# from_json() parses the JSON string with the given schema into a struct column.
# Shown three ways: with the default column name, aliased as "mdet", and with
# the struct expanded into top-level columns via "mdet.*".
mjsonDF.select(from_json("mjson", cmplxSchema)).show(n=2, truncate=False)
mjsonDF.select(from_json("mjson", cmplxSchema).alias("mdet")).show(n=2, truncate=False)
(
    mjsonDF
    .select(from_json("mjson", cmplxSchema).alias("mdet"))
    .select("mdet.*")
    .show(n=2, truncate=False)
)

+--------------------------------------------------------------------------------------------------------------+
|from_json(mjson)                                                                                              |
+--------------------------------------------------------------------------------------------------------------+
|{Dangal, India, 2016, {Nitesh Tiwari, Aamir Khan, 330.0}, [Biograhpy, Drama], {imdb -> 8.4, tomatoes -> 4.55}}|
|{Fight Club, USA, 1999, {David Fincher, Brad Pitt, 104.0}, [Action, Drama], {imdb -> 8.8, tomatoes -> 4.46}}  |
+--------------------------------------------------------------------------------------------------------------+

+--------------------------------------------------------------------------------------------------------------+
|mdet                                                                                                          |
+----------------------------------------------------------------------------------------------

In [32]:
# to_json() goes the other way: pack every column of movieJSONWithSchemaDF into
# one struct and serialise it, giving back one JSON string per movie.
movieJSONWithSchemaDF.select(
    to_json(struct("*")).alias("moviestring")
).show(n=2, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|moviestring                                                                                                                                                                                               |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"title":"Dangal","country":"India","year":2016,"cast":{"Director":"Nitesh Tiwari","LeadActor":"Aamir Khan","BoxOfficeUSDMn":330.0},"genres":["Biograhpy","Drama"],"ratings":{"imdb":8.4,"tomatoes":4.55}}|
|{"title":"Fight Club","country":"USA","year":1999,"cast":{"Director":"David Fincher","LeadActor":"Brad Pitt","BoxOfficeUSDMn":104.0},"genres":["Action","Drama"],"ratings":{"imdb":