In [1]:
import pandas as pd

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Pyspark course") \
    .getOrCreate()

In [2]:
films_sdf = spark\
    .read\
    .csv("data/tmdb-box-office-prediction/train.csv", header=True)

In [3]:
films_sdf.select("cast").show()

+--------------------+
|                cast|
+--------------------+
|Hot Tub Time Mach...|
|"[{'cast_id': 1, ...|
|"[{'cast_id': 5, ...|
|[{'cast_id': 1, '...|
|[{'cast_id': 3, '...|
|[{'cast_id': 6, '...|
|"[{'cast_id': 23,...|
|[{'cast_id': 2, '...|
|"[{'cast_id': 1, ...|
|Back together for...|
|         {'id': 2416|
|[{'cast_id': 2, '...|
|"[{'cast_id': 6, ...|
|"[{'cast_id': 4, ...|
|[{'cast_id': 5, '...|
|"[{'cast_id': 100...|
|"[{'cast_id': 1, ...|
|[{'cast_id': 7, '...|
|[{'cast_id': 2, '...|
|[{'cast_id': 1, '...|
+--------------------+
only showing top 20 rows



In [17]:
import pyspark.sql.functions as F

films_sdf_with_cast = films_sdf.filter(F.col("cast").rlike("\{"))

In [19]:
films_sdf_with_cast.select("cast").head()

Row(cast='"[{\'cast_id\': 1, \'character\': \'Mia Thermopolis\', \'credit_id\': \'52fe43fe9251416c7502561f\', \'gender\': 1, \'id\': 1813, \'name\': \'Anne Hathaway\', \'order\': 0, \'profile_path\': \'/jUMOKwSUBnTcMeN1HfhutiY49Ad.jpg\'}, {\'cast_id\': 2, \'character\': \'Queen Clarisse Renaldi\', \'credit_id\': \'52fe43fe9251416c75025623\', \'gender\': 1, \'id\': 5823, \'name\': \'Julie Andrews\', \'order\': 1, \'profile_path\': \'/6t61jkmfSA6nbYRCKR9s97CgUN6.jpg\'}, {\'cast_id\': 3, \'character\': \'Joe\', \'credit_id\': \'52fe43fe9251416c75025627\', \'gender\': 2, \'id\': 1210, \'name\': \'H√©ctor Elizondo\', \'order\': 2, \'profile_path\': \'/48UNfVFZVr0jyMIlLPhzm8IIM7f.jpg\'}, {\'cast_id\': 4, \'character\': \'Viscount Mabrey\', \'credit_id\': \'52fe43fe9251416c7502562b\', \'gender\': 2, \'id\': 655, \'name\': \'John Rhys-Davies\', \'order\': 3, \'profile_path\': \'/zZ67PuoFfik9QlZyfaEsFBC1yVJ.jpg\'}, {\'cast_id\': 5, \'character\': \'Lilly Moscovitz\', \'credit_id\': \'52fe43fe92

In [None]:
'cast_id'
'character'
'credit_id'
'gender'
'id'
'name'
'order'
'profile_path'

In [46]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

cast_schema = ArrayType(
    StructType([
        StructField("cast_id", IntegerType()),
        StructField("name", StringType())
    ])
)

films_sdf_with_cast_struct = films_sdf_with_cast\
    .withColumn("cast_struct", F.from_json(F.col("cast"), cast_schema))

In [83]:
films_sdf_with_cast_struct_bad = films_sdf_with_cast_struct\
    .filter(F.col("cast_struct").isNull())
films_sdf_with_cast_struct_good = films_sdf_with_cast_struct\
    .filter(F.col("cast_struct").isNotNull())

In [86]:
films_sdf_with_cast_struct_bad2 = \
    films_sdf_with_cast_struct_bad\
    .withColumn(
        "cast_no_comillas", 
        F.col("cast").substr(F.lit(2), F.length(F.col("cast"))-2))\
    .withColumn("cast_struct", F.from_json(F.col("cast_no_comillas"), cast_schema))\
    .drop("cast_no_comillas")

In [87]:
films_sdf_with_cast_struct_good = \
    films_sdf_with_cast_struct_bad2.union(films_sdf_with_cast_struct_good)

In [88]:
films_sdf_with_cast_struct_good.select("cast", "cast_struct").show()

+--------------------+--------------------+
|                cast|         cast_struct|
+--------------------+--------------------+
|"[{'cast_id': 1, ...|                null|
|"[{'cast_id': 5, ...|                null|
|[{'cast_id': 1, '...|  [[1, Vidya Balan]]|
|[{'cast_id': 6, '...| [[6, Scott Grimes]]|
|"[{'cast_id': 23,...|                null|
|[{'cast_id': 2, '...| [[2, Samir Khader]]|
|"[{'cast_id': 1, ...|                null|
|         {'id': 2416|                null|
|[{'cast_id': 2, '...|[[2, Robert Carra...|
|"[{'cast_id': 6, ...|                null|
|"[{'cast_id': 4, ...|                null|
|"[{'cast_id': 100...|                null|
|"[{'cast_id': 1, ...|                null|
|[{'cast_id': 7, '...|[[7, Ralph Fiennes]]|
|[{'cast_id': 2, '...|[[2, Felicity Jon...|
|[{'cast_id': 26, ...| [[26, Gary Oldman]]|
|"[{'cast_id': 1, ...|                null|
|"[{'cast_id': 1, ...|                null|
|[{'cast_id': 1037...|[[1037, Max Perli...|
|[{'cast_id': 9, '...|[[9, Natha

In [89]:
films_sdf_with_cast_struct_good.printSchema()

root
 |-- id: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- crew: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- cast_struct: array (nullable

In [93]:
films_sdf_with_cast_field = films_sdf_with_cast_struct_good\
    .withColumn("cast_field", F.explode(F.col("cast_struct")))\
    .select("id", "cast_field")
films_sdf_with_cast_field.show(5)

+---+--------------------+
| id|          cast_field|
+---+--------------------+
|  4|    [1, Vidya Balan]|
|  6|   [6, Scott Grimes]|
|  8|   [2, Samir Khader]|
| 12|[2, Robert Carrad...|
| 18|  [7, Ralph Fiennes]|
+---+--------------------+
only showing top 5 rows



In [94]:
films_sdf_with_cast_field.printSchema()

root
 |-- id: string (nullable = true)
 |-- cast_field: struct (nullable = true)
 |    |-- cast_id: integer (nullable = true)
 |    |-- name: string (nullable = true)



In [97]:
films_sdf_with_cast_field.select("id", "cast_field.name")\
    .groupBy("name")\
    .count()\
    .orderBy(F.desc("count"))\
    .show()

+------------------+-----+
|              name|count|
+------------------+-----+
|           English|   24|
|    Susan Sarandon|   10|
| Samuel L. Jackson|   10|
|          Meg Ryan|    8|
|         Tom Hanks|    8|
|       John Cusack|    8|
|Sylvester Stallone|    8|
| Robert Downey Jr.|    8|
|      Cameron Diaz|    8|
|    Morgan Freeman|    8|
|      Willem Dafoe|    8|
| Denzel Washington|    7|
|   Richard Jenkins|    7|
|     Michael Caine|    7|
|        Mel Gibson|    6|
|      Ben Kingsley|    6|
|       Liam Neeson|    6|
|   Brendan Gleeson|    6|
|    Milla Jovovich|    6|
|  Antonio Banderas|    6|
+------------------+-----+
only showing top 20 rows



In [96]:
films_sdf_with_cast_field\
   .groupBy("name")

DataFrame[id: string, cast_field: struct<cast_id:int,name:string>]