In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import json
import pprint

In [4]:
# Reuse the active Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

### Exercise 6.4

Why is it a bad idea to use the period or the square bracket in a column name, given
that you also use it to reach hierarchical entities within a data frame?

If you have a column named "person.name", PySpark interprets it as a nested field within the "person" struct. Therefore, using a period in column names can lead to ambiguity and confusion when accessing the columns or performing operations.

PySpark uses square brackets to reach inside container columns: `F.col("my_column")[0]` (or `my_column[0]` inside a SQL expression such as `F.expr` or `selectExpr`) returns the first element of the "my_column" array, and the same syntax with a key looks up a value in a map. A column literally named "my_column[0]" therefore reads exactly like an element access, and inside SQL expressions it has to be escaped with backticks to be treated as a column name at all. Python slice syntax is taken as well: `F.col("my_column")[0:5]` is translated by PySpark into a `substr` call rather than a column lookup. Thus, using square brackets in column names can lead to conflicts or unexpected behavior when using these operations.

### Exercise 6.5

Although much less common, you can create a data frame from a dictionary. Since
dictionaries are so close to JSON documents, build the schema for ingesting the following dictionary. (Both JSON and PySpark schemas are valid here.)

```
dict_schema = ???
spark.createDataFrame([{"one": 1, "two": [1,2,3]}], schema=dict_schema)
```



In [6]:
# Schema for the exercise dictionary: "one" is a single integer and "two" is
# an array of integers. Fields are appended one at a time with StructType.add.
dict_schema = (
    T.StructType()
    .add('one', T.IntegerType())
    .add('two', T.ArrayType(T.IntegerType()))
)

# A one-element list: the dictionary becomes the single row of the frame.
ex_6_5_records = [{"one": 1, "two": [1,2,3]}]
ex_6_5_frame = spark.createDataFrame(ex_6_5_records, schema=dict_schema)
ex_6_5_frame.printSchema()

root
 |-- one: integer (nullable = true)
 |-- two: array (nullable = true)
 |    |-- element: integer (containsNull = true)



### Exercise 6.6

Using three_shows, compute the time between the first and last episodes for each
show. Which show had the longest tenure?

In [7]:
# Load every show file matching the glob into one data frame; the multiLine
# option lets a single JSON document span several lines of a file.
three_shows = (
    spark.read
    .option('multiLine', True)
    .json('../../data/shows/shows-*.json')
)

In [8]:
# Print the inferred schema as a tree to see how the episode data is nested.
three_shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true)

In [21]:
# Each show row holds its episodes' air dates in one array of date-formatted
# strings; the smallest and largest entries are the first and last air dates.
episode_airdates = F.col('_embedded.episodes.airdate')
first_airdate = F.array_min(episode_airdates).cast('date')
last_airdate = F.array_max(episode_airdates).cast('date')

# Subtracting the two dates gives the span between first and last episode.
three_shows.select(
    'name',
    (last_airdate - first_airdate).alias('tenure'),
).show()

+----------------+-------------------+
|            name|             tenure|
+----------------+-------------------+
|The Golden Girls|INTERVAL '2429' DAY|
|    Breaking Bad|INTERVAL '2079' DAY|
|  Silicon Valley|INTERVAL '2072' DAY|
+----------------+-------------------+



The Golden Girls had the longest tenure of 2429 days.

### Exercise 6.7

Take the shows data frame and extract the air date and name of each episode in two
array columns.

In [22]:
# Only the Silicon Valley file is read for this exercise.
shows = (
    spark.read
    .format('json')
    .load('../../data/shows/shows-silicon-valley.json')
)

In [27]:
# Selecting a field through the episodes array yields one array column per
# field, holding that field's value for every episode.
name_array = F.col('_embedded.episodes.name')
airdate_array = F.col('_embedded.episodes.airdate')

shows.select(name_array, airdate_array).show()

+--------------------+--------------------+
|                name|             airdate|
+--------------------+--------------------+
|[Minimum Viable P...|[2014-04-06, 2014...|
+--------------------+--------------------+



### Exercise 6.8

Given the following data frame, create a new data frame that contains a single map
from one to square:

```
exo6_8 = spark.createDataFrame([[1, 2], [2, 4], [3, 9]], ["one", "square"])
```

In [33]:
# Rows and column names are taken verbatim from the exercise statement.
exo6_8 = spark.createDataFrame(
    data=[[1, 2], [2, 4], [3, 9]],
    schema=['one', 'square'],
)

In [35]:
# Check the inferred column types before building the map.
exo6_8.printSchema()

root
 |-- one: long (nullable = true)
 |-- square: long (nullable = true)



In [None]:
# Collapse the whole frame into a single row holding one map column
# (one -> square).
#
# Each (one, square) pair is packed into a struct *before* it is collected,
# so a key can never be separated from its value; map_from_entries then
# turns the collected array of two-field structs into the map.
(
    exo6_8
    .groupby()  # no grouping columns: aggregate over every row
    .agg(
        F.map_from_entries(
            F.collect_list(F.struct('one', 'square'))
        ).alias('one_to_square')  # explicit name instead of the generated one
    )
    # An action is needed to see the map; without it the cell only echoes
    # the DataFrame repr. truncate=False keeps the full map visible.
    .show(truncate=False)
)