## Schemas

<https://www.kaggle.com/c/tmdb-box-office-prediction/data>

In [3]:
import pandas as pd

## Read csv data

In [16]:
films_sdf = spark.read.csv("data/tmdb-box-office-prediction/train.csv", header=True)

In [17]:
films_sdf.printSchema()

root
 |-- id: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- crew: string (nullable = true)
 |-- revenue: string (nullable = true)



In [18]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F

films_sdf = films_sdf\
    .withColumn("id", films_sdf.id.cast(IntegerType()))
    
    
films_sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- crew: string (nullable = true)
 |-- revenue: string (nullable = true)



In [19]:
films_sdf.select("genres").show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------+
|genres                                                                                                                        |
+------------------------------------------------------------------------------------------------------------------------------+
|[{'id': 35, 'name': 'Comedy'}]                                                                                                |
|[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 10749, 'name': 'Romance'}]|
|[{'id': 18, 'name': 'Drama'}]                                                                                                 |
|[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'name': 'Drama'}]                                                                 |
|[{'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}]                                  

In [21]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

schema = ArrayType(
    StructType([
        StructField("id", IntegerType()),
        StructField("name", StringType())
    ])
)

films_sdf = films_sdf\
    .withColumn("genres", F.from_json(F.col("genres"), schema))

In [22]:
films_sdf.select("genres").show(5, truncate=False)

+--------------------------------------------------------------+
|genres                                                        |
+--------------------------------------------------------------+
|[[35, Comedy]]                                                |
|[[35, Comedy], [18, Drama], [10751, Family], [10749, Romance]]|
|[[18, Drama]]                                                 |
|[[53, Thriller], [18, Drama]]                                 |
|[[28, Action], [53, Thriller]]                                |
+--------------------------------------------------------------+
only showing top 5 rows



In [23]:
films_sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- 

In [24]:
films_sdf.select("genres.name").show(5, truncate=False)

+--------------------------------+
|name                            |
+--------------------------------+
|[Comedy]                        |
|[Comedy, Drama, Family, Romance]|
|[Drama]                         |
|[Thriller, Drama]               |
|[Action, Thriller]              |
+--------------------------------+
only showing top 5 rows



In [25]:
films_sdf.select("genres.name").printSchema()

root
 |-- name: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [None]:
id

In [127]:
films_sdf.select("Keywords").show(5, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Keywords                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+-------------------------------------------------------------------------------------------------

## String manipulation

In [27]:
films_sdf2 = spark.read.csv("data/tmdb-box-office-prediction/train.csv", header=True)

In [28]:
genres_col = films_sdf2\
    .select("id", F.col("genres").substr(F.lit(2), F.length(F.col("genres"))-2).alias("genres"))
genres_col.show(3, truncate=False)

+---+----------------------------------------------------------------------------------------------------------------------------+
|id |genres                                                                                                                      |
+---+----------------------------------------------------------------------------------------------------------------------------+
|1  |{'id': 35, 'name': 'Comedy'}                                                                                                |
|2  |{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 10749, 'name': 'Romance'}|
|3  |{'id': 18, 'name': 'Drama'}                                                                                                 |
+---+----------------------------------------------------------------------------------------------------------------------------+
only showing top 3 rows



In [42]:
genre_sdf = genres_col\
    .select("id", F.explode(F.split(F.col("genres"), '(?<=}), ')).alias("genre"))

genre_sdf.show(truncate=False)

+---+---------------------------------+
|id |genre                            |
+---+---------------------------------+
|1  |{'id': 35, 'name': 'Comedy'}     |
|2  |{'id': 35, 'name': 'Comedy'}     |
|2  |{'id': 18, 'name': 'Drama'}      |
|2  |{'id': 10751, 'name': 'Family'}  |
|2  |{'id': 10749, 'name': 'Romance'} |
|3  |{'id': 18, 'name': 'Drama'}      |
|4  |{'id': 53, 'name': 'Thriller'}   |
|4  |{'id': 18, 'name': 'Drama'}      |
|5  |{'id': 28, 'name': 'Action'}     |
|5  |{'id': 53, 'name': 'Thriller'}   |
|6  |{'id': 16, 'name': 'Animation'}  |
|6  |{'id': 12, 'name': 'Adventure'}  |
|6  |{'id': 10751, 'name': 'Family'}  |
|7  |{'id': 27, 'name': 'Horror'}     |
|7  |{'id': 53, 'name': 'Thriller'}   |
|8  |{'id': 99, 'name': 'Documentary'}|
|9  |{'id': 28, 'name': 'Action'}     |
|9  |{'id': 35, 'name': 'Comedy'}     |
|9  |{'id': 10402, 'name': 'Music'}   |
|9  |{'id': 10751, 'name': 'Family'}  |
+---+---------------------------------+
only showing top 20 rows



In [43]:
genre_sdf = genre_sdf.select(
    F.col('id'),
    F.regexp_extract(F.col("genre"), "(?<='name':\ ')[A-Z][a-z]+", 0).alias("genre_name")
)
genre_sdf.show()

+---+-----------+
| id| genre_name|
+---+-----------+
|  1|     Comedy|
|  2|     Comedy|
|  2|      Drama|
|  2|     Family|
|  2|    Romance|
|  3|      Drama|
|  4|   Thriller|
|  4|      Drama|
|  5|     Action|
|  5|   Thriller|
|  6|  Animation|
|  6|  Adventure|
|  6|     Family|
|  7|     Horror|
|  7|   Thriller|
|  8|Documentary|
|  9|     Action|
|  9|     Comedy|
|  9|      Music|
|  9|     Family|
+---+-----------+
only showing top 20 rows



In [40]:
from pyspark.sql import Window

window = Window.partitionBy("id")




DataFrame[id: string, regexp_extract(genre, (?<='name':\ ')[A-Z][a-z]+, 0): string]

## Schema

In [18]:
from pyspark.sql.types import StructField, StructType, IntegerType, LongType

schema1 = StructType([
    StructField("id", LongType(), False),
])

films_sdf1 = spark.read.schema(schema=schema1).json("data/tmdb-box-office-prediction/train.json")
films_sdf1.printSchema()

root
 |-- id: long (nullable = true)



In [19]:
films_sdf1.head()

Row(id=1)

In [27]:
from pyspark.sql.types import StringType

schema2 = StructType([
    StructField("id", LongType()),
    StructField("genres", StringType())
])

films_sdf2 = spark.read.schema(schema=schema2).json("data/tmdb-box-office-prediction/train.json")
films_sdf2.printSchema()

root
 |-- id: long (nullable = true)
 |-- genres: string (nullable = true)



In [28]:
films_sdf2.show(truncate=False)

+---+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |genres                                                                                                                                                        |
+---+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1  |[{'id': 35, 'name': 'Comedy'}]                                                                                                                                |
|2  |[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 10749, 'name': 'Romance'}]                                |
|3  |[{'id': 18, 'name': 'Drama'}]                                                                                                                                 |
|4  |[{'id

In [None]:
import json

def parse_json(array_str):

json_obj = json.loads(array_str)

for item in json_obj:

yield (item["a"], item["b"])

In [35]:
from pyspark.sql.types import ArrayType

schema3 = StructType([
    StructField("id", IntegerType()),
    StructField("genres", ArrayType(
        StructType([
            StructField("id", StringType())
        ])
    ))
])
films_sdf3 = spark.read.schema(schema=schema3).csv("data/tmdb-box-office-prediction/train.csv")
films_sdf3.printSchema()

AnalysisException: 'CSV data source does not support array<struct<id:string>> data type.;'

In [31]:
films_sdf3.show()

+----+------+
|  id|genres|
+----+------+
|null|  null|
+----+------+

