In [69]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import json
import pprint

For this chapter, we use a JSON dump of information about the TV show Silicon Valley from TV Maze. The data is available for download from [https://api.tvmaze.com/singlesearch/shows?q=silicon%20valley&embed=episodes](https://api.tvmaze.com/singlesearch/shows?q=silicon%20valley&embed=episodes).

In [3]:
# Entry point for every DataFrame operation below. getOrCreate() hands back
# the session that is already running, if any, so re-running this cell is safe.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [5]:
# Load the Silicon Valley dump; no schema is supplied, so the reader infers
# the nested structure from the data itself.
shows = spark.read.json(path='../../data/shows/shows-silicon-valley.json')

In [7]:
# The glob picks up every shows-*.json file in the directory. multiLine=True
# tells the reader that a JSON document may span several lines of a file.
three_shows = spark.read.json(
    path='../../data/shows/shows-*.json',
    multiLine=True,
)

In [8]:
# Sanity check: the glob read is expected to yield exactly three records.
loaded_show_count = three_shows.count()
assert loaded_show_count == 3

In [9]:
# Print the inferred schema as a tree to see how the structs and arrays nest.
shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

In [17]:
# Narrow the frame to the show name and its `genres` array column.
subset_columns = ('name', 'genres')
array_subset = shows.select(*subset_columns)

In [18]:
# Show a single row without truncating long cell values.
array_subset.show(n=1, truncate=False)

+--------------+--------+
|name          |genres  |
+--------------+--------+
|Silicon Valley|[Comedy]|
+--------------+--------+



In [19]:
# Four equivalent ways to pull the first element out of the `genres` array:
# bracket indexing or .getItem(), each applied to either an attribute-style
# column reference or an F.col() reference.
#
# The projection starts from a fresh name/genres selection of `shows` rather
# than from `array_subset` itself. This cell rebinds `array_subset` to a frame
# that no longer has a `genres` column, so reading from the old binding would
# make a second run of the cell fail.
name_and_genres = shows.select('name', 'genres')
array_subset = name_and_genres.select(
    'name',
    name_and_genres.genres[0].alias('dot_and_index'),
    F.col('genres')[0].alias('col_and_index'),
    name_and_genres.genres.getItem(0).alias('dot_and_method'),
    F.col('genres').getItem(0).alias('col_and_method'),
)

In [20]:
# All four extraction styles should display the same first genre.
array_subset.show()

+--------------+-------------+-------------+--------------+--------------+
|          name|dot_and_index|col_and_index|dot_and_method|col_and_method|
+--------------+-------------+-------------+--------------+--------------+
|Silicon Valley|       Comedy|       Comedy|        Comedy|        Comedy|
+--------------+-------------+-------------+--------------+--------------+



In [28]:
# Two ways of building an array column:
#   * F.array() packs several existing columns into one array;
#   * F.array_repeat() repeats a single column's value a fixed number of times.
# The literal genres are first materialised as ordinary columns so that
# F.array() can refer to them by name.
literal_genres = {'one': 'Comedy', 'two': 'Horror', 'three': 'Drama'}

with_literal_columns = array_subset.select(
    'name',
    *[F.lit(genre).alias(alias) for alias, genre in literal_genres.items()],
    F.col('dot_and_index'),
)

array_subset_repeated = with_literal_columns.select(
    'name',
    F.array(*literal_genres).alias('Some_Genres'),
    F.array_repeat('dot_and_index', 5).alias('Repeated_Genres'),
)

In [29]:
# Show a single row without truncating the array contents.
array_subset_repeated.show(n=1, truncate=False)

+--------------+-----------------------+----------------------------------------+
|name          |Some_Genres            |Repeated_Genres                         |
+--------------+-----------------------+----------------------------------------+
|Silicon Valley|[Comedy, Horror, Drama]|[Comedy, Comedy, Comedy, Comedy, Comedy]|
+--------------+-----------------------+----------------------------------------+



In [30]:
# F.size() returns the number of elements held in each array column.
array_columns = ('Some_Genres', 'Repeated_Genres')
array_subset_repeated.select(
    'name', *[F.size(array_column) for array_column in array_columns]
).show()

+--------------+-----------------+---------------------+
|          name|size(Some_Genres)|size(Repeated_Genres)|
+--------------+-----------------+---------------------+
|Silicon Valley|                3|                    5|
+--------------+-----------------+---------------------+



In [31]:
# Keep only the genres that appear in both arrays.
# NOTE: this rebinds `array_subset_repeated` to a frame that only has `name`
# and `Genres`, so the cell that builds Some_Genres / Repeated_Genres must be
# re-run before this one can be run again.
shared_genres = F.array_intersect('Some_Genres', 'Repeated_Genres')
array_subset_repeated = array_subset_repeated.select(
    'name', shared_genres.alias('Genres')
)

In [32]:
# Display the result of the intersection.
array_subset_repeated.show()

+--------------+--------+
|          name|  Genres|
+--------------+--------+
|Silicon Valley|[Comedy]|
+--------------+--------+



In [34]:
# Locate 'Comedy' inside the Genres array. Positions are 1-based, as the
# output for this single-element array shows.
comedy_position = F.array_position('Genres', 'Comedy')
array_subset_repeated.select('Genres', comedy_position).show()

+--------+------------------------------+
|  Genres|array_position(Genres, Comedy)|
+--------+------------------------------+
|[Comedy]|                             1|
+--------+------------------------------+



In [51]:
columns = ['name', 'language', 'type']

# Step 1: emit each column *name* as a string literal (the resulting literal
# columns are themselves called name/language/type) and pack the actual column
# values into a `values` array.
literals_and_values = shows.select(
    *[F.lit(column) for column in columns],
    F.array(*columns).alias('values'),
)

# Step 2: name/language/type now refer to the literal columns from step 1, so
# packing them gives the array of keys.
keys_and_values = literals_and_values.select(
    F.array(*columns).alias('keys'), 'values'
)

# Step 3: zip the two arrays into a single map column.
shows_map = keys_and_values.select(
    F.map_from_arrays('keys', 'values').alias('mapped')
)

In [53]:
# Inspect the key and value types of the map column.
shows_map.printSchema()

root
 |-- mapped: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [54]:
# Three equivalent ways of looking up the 'name' key in the map column.
by_dotted_path = F.col('mapped.name')
by_col_and_key = F.col('mapped')['name']
by_attribute_and_key = shows_map.mapped['name']

shows_map.select(
    by_dotted_path,
    by_col_and_key,
    by_attribute_and_key,
).show()

+--------------+--------------+--------------+
|          name|  mapped[name]|  mapped[name]|
+--------------+--------------+--------------+
|Silicon Valley|Silicon Valley|Silicon Valley|
+--------------+--------------+--------------+



We can see that the `_embedded` column is a useless struct as it only contains one field. We can create a new top-level `episodes` column.

In [57]:
# Promote the nested episodes array to a top-level column, then drop the
# now-redundant `_embedded` wrapper struct.
with_top_level_episodes = shows.withColumn(
    'episodes', F.col('_embedded.episodes')
)
shows_clean = with_top_level_episodes.drop('_embedded')

In [58]:
# Inspect the schema after promoting `episodes` and dropping `_embedded`.
shows_clean.printSchema()

root
 |-- _links: struct (nullable = true)
 |    |-- previousepisode: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |    |-- self: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |-- externals: struct (nullable = true)
 |    |-- imdb: string (nullable = true)
 |    |-- thetvdb: long (nullable = true)
 |    |-- tvrage: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: long (nullable = true)
 |-- image: struct (nullable = true)
 |    |-- medium: string (nullable = true)
 |    |-- original: string (nullable = true)
 |-- language: string (nullable = true)
 |-- name: string (nullable = true)
 |-- network: struct (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- code: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- timezone: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nul

In [64]:
# `episodes` is an array of structs, so `episodes.name` extracts the name
# field of each record in the array and packs the results into an array of
# names. Exploding that array gives one row per episode name, which is easier
# to read.
names_as_array = shows_clean.select(F.col('episodes.name'))
episodes_name = names_as_array.select(F.explode('name').alias('name'))
episodes_name.show(n=3, truncate=False)

+-------------------------+
|name                     |
+-------------------------+
|Minimum Viable Product   |
|The Cap Table            |
|Articles of Incorporation|
+-------------------------+
only showing top 3 rows



We can build the schema for our `shows` DataFrame from scratch using PySpark data types.

In [66]:
# Schema of an episode's `_links` field: a struct with a single `self` struct
# that holds one `href` string.
episode_links_schema = T.StructType(
    [
        T.StructField(
            'self',
            T.StructType([T.StructField('href', T.StringType())]),
        )
    ]
)

# Schema of an episode's `image` field: two string fields.
episode_image_schema = T.StructType(
    [T.StructField(size, T.StringType()) for size in ('medium', 'original')]
)

# Field name -> type for one episode record, in the order the fields are
# declared. Dates and timestamps get proper temporal types instead of the
# plain strings that schema inference produced.
# NOTE(review): `id` is declared as a string here although the inferred
# schema reports it as long -- confirm that this is intentional.
episode_field_types = [
    ('_links', episode_links_schema),
    ('airdate', T.DateType()),
    ('airstamp', T.TimestampType()),
    ('airtime', T.StringType()),
    ('id', T.StringType()),
    ('image', episode_image_schema),
    ('name', T.StringType()),
    ('number', T.LongType()),
    ('runtime', T.LongType()),
    ('season', T.LongType()),
    ('summary', T.StringType()),
    ('url', T.StringType()),
]

episode_schema = T.StructType(
    [
        T.StructField(field_name, field_type)
        for field_name, field_type in episode_field_types
    ]
)

# Partial top-level schema: only `_embedded.episodes` (an array of episode
# structs) is described.
embedded_schema = T.StructType(
    [
        T.StructField(
            '_embedded',
            T.StructType(
                [T.StructField('episodes', T.ArrayType(episode_schema))]
            ),
        )
    ]
)

We can now read in our show JSON while enforcing a more precise schema. This comes with a performance boost, because inferring the schema requires a pre-read of the data.

Because we only pass a partial schema, PySpark will only read the defined columns.

In [67]:
# With mode='FAILFAST' the read raises an error when the data does not fit
# the supplied schema.
# NOTE(review): the read is presumably lazy, so the failure would only appear
# once an action (show, count, ...) actually parses the file -- confirm.
strict_read_options = dict(schema=embedded_schema, mode='FAILFAST')
shows_with_schema = spark.read.json(
    '../../data/shows/shows-silicon-valley.json', **strict_read_options
)

In [68]:
# Peek at the raw (string) airdate and airstamp values of the inferred-schema
# frame: pull the field out of every episode, then explode the resulting array
# into one row per value.
for column in ('airdate', 'airstamp'):
    values_as_array = shows.select(f'_embedded.episodes.{column}')
    values_as_array.select(F.explode(column)).show(5)

+----------+
|       col|
+----------+
|2014-04-06|
|2014-04-13|
|2014-04-20|
|2014-04-27|
|2014-05-04|
+----------+
only showing top 5 rows

+--------------------+
|                 col|
+--------------------+
|2014-04-07T02:00:...|
|2014-04-14T02:00:...|
|2014-04-21T02:00:...|
|2014-04-28T02:00:...|
|2014-05-05T02:00:...|
+--------------------+
only showing top 5 rows



In [70]:
# Pretty-print, as a plain Python dict, the schema of the frame that holds
# only the `airtime` field of each exploded episode.
airtime_schema = (
    shows_with_schema
    .select(F.explode('_embedded.episodes').alias('episode'))
    .select('episode.airtime')
    .schema
)
pprint.pprint(airtime_schema.jsonValue())

{'fields': [{'metadata': {},
             'name': 'airtime',
             'nullable': True,
             'type': 'string'}],
 'type': 'struct'}


`schema` comes with a `json()` method which outputs a string containing the JSON-formatted schema. We can show that the JSON-schema is consistent with the one currently being used:

In [74]:
# Round-trip the schema through its JSON string form and check that the
# rebuilt StructType equals the one it came from.
schema_as_json = shows_with_schema.schema.json()
other_shows_schema = T.StructType.fromJson(json.loads(schema_as_json))
print(other_shows_schema == shows_with_schema.schema)

True


In [76]:
# Turn the `_embedded.episodes` array into one record per episode, keeping
# the show id next to each of them.
exploded_episodes = F.explode('_embedded.episodes').alias('episodes')
episodes = shows.select('id', exploded_episodes)
episodes.show(n=5, truncate=70)
episodes.count()

+---+----------------------------------------------------------------------+
| id|                                                              episodes|
+---+----------------------------------------------------------------------+
|143|{{{http://api.tvmaze.com/episodes/10897}}, 2014-04-06, 2014-04-07T0...|
|143|{{{http://api.tvmaze.com/episodes/10898}}, 2014-04-13, 2014-04-14T0...|
|143|{{{http://api.tvmaze.com/episodes/10899}}, 2014-04-20, 2014-04-21T0...|
|143|{{{http://api.tvmaze.com/episodes/10900}}, 2014-04-27, 2014-04-28T0...|
|143|{{{http://api.tvmaze.com/episodes/10901}}, 2014-05-04, 2014-05-05T0...|
+---+----------------------------------------------------------------------+
only showing top 5 rows



53

In [84]:
# Build a map from two arrays (episode ids as keys, episode names as values),
# then explode it into three columns: the position of the entry, its key and
# its value. An entry whose value is null is still emitted; it is a null or
# empty *map* that produces no rows.
episode_ids = F.col("_embedded.episodes.id")
episode_names = F.col("_embedded.episodes.name")
id_to_name = F.map_from_arrays(episode_ids, episode_names)

episode_name_id = shows.select(
    F.posexplode(id_to_name).alias("position", "id", "name")
)

In [85]:
# Show the first exploded entry without truncating the episode name.
episode_name_id.show(n=1, truncate=False)

+--------+-----+----------------------+
|position|id   |name                  |
+--------+-----+----------------------+
|0       |10897|Minimum Viable Product|
+--------+-----+----------------------+
only showing top 1 row



In [88]:
# Collect the exploded episodes back into one array per show id: the inverse
# of the explode performed when `episodes` was built.
collected = episodes.groupby('id').agg(
    F.collect_list('episodes').alias('episodes')
)

collected.show()
# A bare `collected.count()` in the middle of a cell runs a Spark job and then
# throws the result away (only a cell's last expression is displayed). Every
# episode carries the same show id, so the regrouped frame must hold exactly
# one row; assert that instead of discarding the count.
assert collected.count() == 1
collected.printSchema()

+---+--------------------+
| id|            episodes|
+---+--------------------+
|143|[{{{http://api.tv...|
+---+--------------------+

root
 |-- id: long (nullable = true)
 |-- episodes: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |-- airdate: string (nullable = true)
 |    |    |-- airstamp: string (nullable = true)
 |    |    |-- airtime: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- image: struct (nullable = true)
 |    |    |    |-- medium: string (nullable = true)
 |    |    |    |-- original: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- number: long (nullable = true)
 |    |    |-- runtime: long (nullable = true)
 |    |    |-- season: long (nullable = true)
 |    |    |-- summary: string (nullable = true)
 |  

In [93]:
# Use F.struct() to bundle a few columns of the shows DataFrame, plus one
# literal flag, into a single struct column named `info`.
info_fields = [
    F.col('status'),
    F.col('weight'),
    F.lit(True).alias('has_watched'),
]
struct_ex = shows.select(F.struct(*info_fields).alias('info'))

struct_ex.show(n=1, truncate=False)
struct_ex.printSchema()

+-----------------+
|info             |
+-----------------+
|{Ended, 96, true}|
+-----------------+

root
 |-- info: struct (nullable = false)
 |    |-- status: string (nullable = true)
 |    |-- weight: long (nullable = true)
 |    |-- has_watched: boolean (nullable = false)

