# Data Types

In [1]:
import os

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

from delta import *
import pandas as pd

# Start Spark with Delta Lake enabled: the two settings below register Delta's
# SQL extension and use Delta's catalog as the session's default catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("Spark-Course")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip is provided by the `delta` package imported above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Log errors only, to keep cell output readable.
# Available log4j levels: OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE, ALL
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: render the session summary.
spark



:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/siladitya/.ivy2/cache
The jars for the packages stored in: /home/siladitya/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-df4a3c30-57ad-47ae-9038-5bfc85ba4d8e;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.2.1 in central
	found io.delta#delta-storage;1.2.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 467ms :: artifacts dl 8ms
	:: modules in use:
	io.delta#delta-core_2.12;1.2.1 from central in [default]
	io.delta#delta-storage;1.2.1 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnld

In [2]:
# Show every column when a pandas DataFrame is rendered (None removes the cap;
# the notebook default is 20).
pd.options.display.max_columns = None

In [3]:
# Load the sparksql_magic extension; it provides the %%sparksql cell magic used by the SQL cells below.
%load_ext sparksql_magic

In [4]:

# Pretty display helper.
# NOTE(review): this name shadows the `display` helper Jupyter normally exposes - confirm that is intended.
def display(spark_df, rows=10):
    """Return the first `rows` rows of a Spark DataFrame as a pandas DataFrame.

    Parameters:
        spark_df: object exposing `limit(n)` and `toPandas()` (a Spark DataFrame).
        rows: maximum number of rows to fetch and show (default 10).

    Returns:
        A pandas DataFrame with at most `rows` rows, which the notebook renders
        as a table.
    """
    # Limit on the Spark side first so only `rows` rows are converted to pandas.
    limited_df = spark_df.limit(rows)
    pandas_df = limited_df.toPandas()
    return pandas_df.head(rows)

In [5]:
# Directory holding the raw IMDb TSV files. The notebook's original location is
# kept as the default; set the IMDB_DATA_DIR environment variable to run the
# notebook against a copy stored somewhere else.
raw_data_path = os.environ.get(
    'IMDB_DATA_DIR',
    '/home/siladitya/Documents/Spark/Spark-Course/Data/imdb/')

# Explicit schema for title_basics.tsv, so Spark does not have to infer types.
# tconst is declared non-nullable, but printSchema() below still reports
# "nullable = true" for it, so the flag is not enforced by this read.
title_basics_schema = StructType([
    StructField('tconst', StringType(), False),
    StructField('titleType', StringType(), True),
    StructField('primaryTitle', StringType(), True),
    StructField('originalTitle', StringType(), True),
    StructField('isAdult', IntegerType(), True),
    StructField('startYear', IntegerType(), True),
    StructField('endYear', IntegerType(), True),
    StructField('runtimeMinutes', IntegerType(), True),
    StructField('genres', StringType(), True),
])

# NOTE(review): with these options the literal marker \N survives in string
# columns (describe() later reports it as max(genres)) and crosstab() later shows
# year-like values under isAdult, which suggests some rows are split incorrectly.
# Presumably the reader's 'nullValue' and 'quote' options would address this -
# confirm against the data before changing, since every recorded output would shift.
title_basics_sdf = (spark
                    .read
                    .schema(title_basics_schema)
                    .option('header', 'true')
                    .option('delimiter', '\t')
                    .csv(os.path.join(raw_data_path, 'title_basics.tsv')))

# Expose the DataFrame to the %%sparksql cells under the name `titles`.
title_basics_sdf.createOrReplaceTempView('titles')

display(title_basics_sdf)

                                                                                

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,,1,"Documentary,Short"
8,tt0000009,short,Miss Jerry,Miss Jerry,0,1894,,40,"Romance,Short"
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,,1,"Documentary,Short"


In [6]:
# Print the column names, types and nullability Spark uses for the loaded DataFrame.
title_basics_sdf.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: integer (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- endYear: integer (nullable = true)
 |-- runtimeMinutes: integer (nullable = true)
 |-- genres: string (nullable = true)



### Booleans
Boolean column expressions are what `where` / `filter` use to decide which rows to keep.

In [7]:
# Comparing a Column with a value yields a boolean Column expression; it can be
# stored in a variable and handed to where() as the filter.
nineteenth_century = col('startYear') < 1900

display(title_basics_sdf.where(nineteenth_century))

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,,1,"Documentary,Short"
8,tt0000009,short,Miss Jerry,Miss Jerry,0,1894,,40,"Romance,Short"
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,,1,"Documentary,Short"


In [8]:
# Two chained filters: the first is a SQL string (no col() needed), the second
# is a Column expression. instr() gives the 1-based position of the substring,
# so a positive value means 'Animation' occurs in genres.
display(
    title_basics_sdf
        .where('startYear < 1900')
        .where(instr('genres', 'Animation') > 0)
)

                                                                                

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5.0,"Animation,Short"
1,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4.0,"Animation,Comedy,Romance"
2,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12.0,"Animation,Short"
3,tt0000015,short,Autour d'une cabine,Autour d'une cabine,0,1894,,2.0,"Animation,Short"
4,tt0000233,short,Choque de dos transatlánticos,Choque de dos transatlánticos,0,1899,,2.0,"Animation,Short"
5,tt0000251,short,Matches: An Appeal,Matches: An Appeal,0,1899,,1.0,"Animation,Short"
6,tt0000704,short,The Humpty Dumpty Circus,The Humpty Dumpty Circus,0,1898,,,"Animation,Comedy,Short"
7,tt0337350,short,Two Tipsy Fellows in a Boat,Two Tipsy Fellows in a Boat,0,1898,,,"Animation,Comedy,Short"
8,tt12592084,short,Le singe musicien,Le singe musicien,0,1878,,1.0,"Animation,Short"
9,tt13125956,short,Autour d'une cabine ou Mésaventures d'un copur...,Autour d'une cabine ou Mésaventures d'un copur...,0,1894,,5.0,"Animation,Short"


In [9]:
%%sparksql
-- Same filter as the DataFrame version, written in Spark SQL
SELECT *
FROM titles
WHERE startYear < 1900
  AND instr(genres, 'Animation') >= 1
LIMIT 10

                                                                                

0,1,2,3,4,5,6,7,8
tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
tt0000015,short,Autour d'une cabine,Autour d'une cabine,0,1894,,2,"Animation,Short"
tt0000233,short,Choque de dos transatlánticos,Choque de dos transatlánticos,0,1899,,2,"Animation,Short"
tt0000251,short,Matches: An Appeal,Matches: An Appeal,0,1899,,1,"Animation,Short"
tt0000704,short,The Humpty Dumpty Circus,The Humpty Dumpty Circus,0,1898,,,"Animation,Comedy,Short"
tt0337350,short,Two Tipsy Fellows in a Boat,Two Tipsy Fellows in a Boat,0,1898,,,"Animation,Comedy,Short"
tt12592084,short,Le singe musicien,Le singe musicien,0,1878,,1,"Animation,Short"


In [10]:
# Null-safe equality: keep the titles whose endYear equals 2000, comparing with
# eqNullSafe instead of the plain == operator.
display(
    title_basics_sdf
        .where(col('endYear').eqNullSafe(2000))
)

                                                                                

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0087305,tvSeries,The Power of Desire,Força de Um Desejo,0,1999,2000,45.0,"Drama,Romance"
1,tt0090405,tvSeries,The Bugs Bunny and Tweety Show,The Bugs Bunny and Tweety Show,0,1986,2000,,"Animation,Comedy,Family"
2,tt0092379,tvSeries,Inspector Morse,Inspector Morse,0,1987,2000,100.0,"Crime,Drama,Mystery"
3,tt0098749,tvSeries,"Beverly Hills, 90210","Beverly Hills, 90210",0,1990,2000,44.0,"Drama,Romance"
4,tt0103352,tvSeries,Are You Afraid of the Dark?,Are You Afraid of the Dark?,0,1990,2000,30.0,"Drama,Fantasy,Horror"
5,tt0103375,tvSeries,Both Sides with Jesse Jackson,Both Sides with Jesse Jackson,0,1992,2000,30.0,Talk-Show
6,tt0103556,tvSeries,Stadtklinik,Stadtklinik,0,1993,2000,74.0,Drama
7,tt0105958,tvSeries,Boy Meets World,Boy Meets World,0,1993,2000,23.0,"Comedy,Drama,Family"
8,tt0106123,tvSeries,Saved by the Bell: The New Class,Saved by the Bell: The New Class,0,1993,2000,30.0,"Comedy,Family"
9,tt0108724,tvSeries,Chicago Hope,Chicago Hope,0,1994,2000,60.0,"Drama,Mystery,Thriller"


### Numbers

In [11]:
# Pearson correlation coefficient between release year and runtime,
# computed over the whole DataFrame as a single aggregate value.
display(
    title_basics_sdf.select(
        corr(col('startYear'), col('runtimeMinutes'))
    )
)

                                                                                

Unnamed: 0,"corr(startYear, runtimeMinutes)"
0,-0.056034


In [12]:
# Summary statistics (count, mean, stddev, min, max) for one numeric and one
# string column; mean/stddev come back empty for the string column.
display(
    title_basics_sdf.describe('startYear', 'genres')
)

                                                                                

Unnamed: 0,summary,startYear,genres
0,count,7696751.0,8880191
1,mean,2004.242026148436,
2,stddev,20.333056646472357,
3,min,1874.0,Action
4,max,2029.0,\N


In [13]:
# Approximate median (0.5 quantile) of runtimeMinutes, allowing a relative
# error of 0.05 in exchange for a cheaper computation.
title_basics_sdf.approxQuantile('runtimeMinutes', [0.5], 0.05)

                                                                                

[30.0]

In [14]:
# Contingency table: one row per genres value, one column per isAdult value.
# Tip: insert .limit(10000) before the crosstab to try it on a sample first.
# NOTE(review): the result has columns such as 1981 or 2019 under isAdult, so
# some input rows appear to be parsed into the wrong columns - verify the load.
display(
    title_basics_sdf.stat.crosstab('genres', 'isAdult')
)

                                                                                

Unnamed: 0,genres_isAdult,0,1,1981,2005,2014,2017,2019,2020,null
0,"Action,Adult,Romance",0,8,0,0,0,0,0,0,0
1,"Documentary,Horror,News",54,0,0,0,0,0,0,0,0
2,"Action,Music,Western",17,0,0,0,0,0,0,0,0
3,"Animation,Family,Fantasy",4620,0,0,0,0,0,0,0,0
4,"Crime,Musical,Romance",16,0,0,0,0,0,0,0,0
5,"Action,Drama,News",6,0,0,0,0,0,0,0,0
6,"Romance,Talk-Show",30,0,0,0,0,0,0,0,0
7,"Biography,News,Sci-Fi",1,0,0,0,0,0,0,0,0
8,"Drama,Musical,Western",8,0,0,0,0,0,0,0,0
9,"Family,Romance",1459,0,0,0,0,0,0,0,0


In [15]:
# Frequent items per column, returned as one array of values for each column.
# Tip: insert .limit(1000) before freqItems to try it on a sample first.
display(
    title_basics_sdf.stat.freqItems(['genres', 'isAdult'])
)

                                                                                

Unnamed: 0,genres_freqItems,isAdult_freqItems
0,"[Action, News,Talk-Show, Action,Adventure,Come...","[2017, 2020, 2005, 2014, 1981, 1, 2019, None, 0]"


### Strings

In [16]:
# translate() replaces characters one-for-one by position:
#   'a' -> '4', 'e' -> '3', 's' -> '5'
# 't' has no counterpart in the replacement string, so it is removed.
display(
    title_basics_sdf.select(
        translate('primaryTitle', 'aest', '435'),
        col('primaryTitle'),
    )
)

Unnamed: 0,"translate(primaryTitle, aest, 435)",primaryTitle
0,C4rm3nci4,Carmencita
1,L3 clown 3 535 chi3n5,Le clown et ses chiens
2,P4uvr3 Pi3rro,Pauvre Pierrot
3,Un bon bock,Un bon bock
4,Bl4ck5mih Sc3n3,Blacksmith Scene
5,Chin353 Opium D3n,Chinese Opium Den
6,Corb3 4nd Courn3y B3for3 h3 Kin3ogr4ph,Corbett and Courtney Before the Kinetograph
7,Edi5on Kin3o5copic R3cord of 4 Sn33z3,Edison Kinetoscopic Record of a Sneeze
8,Mi55 J3rry,Miss Jerry
9,L34ving h3 F4cory,Leaving the Factory


In [17]:
# Replace every occurrence of any of the listed genre names with '---'
# and show the result next to the original genres string.
display(
    title_basics_sdf.select(
        regexp_replace('genres', 'Documentary|Sport|Animation|Comedy|Romance', '---'),
        col('genres'),
    )
)

Unnamed: 0,"regexp_replace(genres, Documentary|Sport|Animation|Comedy|Romance, ---, 1)",genres
0,"---,Short","Documentary,Short"
1,"---,Short","Animation,Short"
2,"---,---,---","Animation,Comedy,Romance"
3,"---,Short","Animation,Short"
4,"---,Short","Comedy,Short"
5,Short,Short
6,"Short,---","Short,Sport"
7,"---,Short","Documentary,Short"
8,"---,Short","Romance,Short"
9,"---,Short","Documentary,Short"


In [18]:
%%sparksql
-- First listed genre found in genres (empty string when none of them occurs)
SELECT regexp_extract(genres, '(Documentary|Sport|Animation|Comedy|Romance)'),
       genres
FROM titles
LIMIT 10

0,1
"regexp_extract(genres, (Documentary|Sport|Animation|Comedy|Romance), 1)",genres
Documentary,"Documentary,Short"
Animation,"Animation,Short"
Animation,"Animation,Comedy,Romance"
Animation,"Animation,Short"
Comedy,"Comedy,Short"
,Short
Sport,"Short,Sport"
Documentary,"Documentary,Short"
Romance,"Romance,Short"


##### Locate

In [19]:
# locate(substr, column) gives the 1-based position of the first match and 0
# when there is none, so these "is_<genre>" columns hold positions, not 0/1 flags.
genre_list = ['Documentary', 'Sport', 'Animation', 'Comedy', 'Romance']
is_genre_cols = [
    locate(genre, 'genres').alias(f'is_{genre}')
    for genre in genre_list
]
is_genre_cols

[Column<'locate(Documentary, genres, 1) AS is_Documentary'>,
 Column<'locate(Sport, genres, 1) AS is_Sport'>,
 Column<'locate(Animation, genres, 1) AS is_Animation'>,
 Column<'locate(Comedy, genres, 1) AS is_Comedy'>,
 Column<'locate(Romance, genres, 1) AS is_Romance'>]

In [20]:
# Show the locate() positions next to every original column.
# The '*' projection is passed at call time instead of being appended to
# is_genre_cols: appending mutated the list defined in the previous cell, so
# re-running this cell kept adding another '*' and selected the columns again.
display(
    title_basics_sdf
        .select(*is_genre_cols, expr('*'))
)

Unnamed: 0,is_Documentary,is_Sport,is_Animation,is_Comedy,is_Romance,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,1,0,0,0,0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,0,0,1,0,0,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,0,0,1,11,18,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,0,0,1,0,0,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,0,0,0,1,0,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,0,0,0,0,0,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,0,7,0,0,0,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
7,1,0,0,0,0,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,,1,"Documentary,Short"
8,0,0,0,0,1,tt0000009,short,Miss Jerry,Miss Jerry,0,1894,,40,"Romance,Short"
9,1,0,0,0,0,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,,1,"Documentary,Short"


In [21]:
# Casting the position to Boolean (0 -> false, anything else -> true) and back
# to Integer turns each column into a 0/1 flag that can be summed.
# expr('*') is added so the original columns are shown next to the flags.
is_genre_cols_boolean = [
    locate(genre, 'genres').cast('Boolean').cast('Integer').alias(f'is_{genre}')
    for genre in genre_list
] + [expr('*')]

display(
    title_basics_sdf
        .select(*is_genre_cols_boolean)
        # keep titles that match at least 3 of the 5 genres in genre_list
        .where('is_Documentary + is_Sport + is_Animation + is_Comedy + is_Romance >= 3')
)

Unnamed: 0,is_Documentary,is_Sport,is_Animation,is_Comedy,is_Romance,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,0,0,1,1,1,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4.0,"Animation,Comedy,Romance"
1,0,0,1,1,1,tt0006533,short,Colonel Heeza Liar's Courtship,Colonel Heeza Liar's Courtship,0,1916,,,"Animation,Comedy,Romance"
2,0,1,0,1,1,tt0014578,movie,The Victor,The Victor,0,1923,,50.0,"Comedy,Romance,Sport"
3,0,1,0,1,1,tt0016226,movie,The Plastic Age,The Plastic Age,0,1925,,73.0,"Comedy,Romance,Sport"
4,0,1,0,1,1,tt0016423,movie,The Thoroughbred,The Thoroughbred,0,1925,,63.0,"Comedy,Romance,Sport"
5,0,0,1,1,1,tt0016587,short,Alice the Fire Fighter,Alice the Fire Fighter,0,1926,,9.0,"Animation,Comedy,Romance"
6,0,0,1,1,1,tt0016790,short,Dinky Doodle in Egypt,Dinky Doodle in Egypt,0,1926,,8.0,"Animation,Comedy,Romance"
7,0,1,0,1,1,tt0016844,movie,Fascinating Youth,Fascinating Youth,0,1926,,70.0,"Comedy,Romance,Sport"
8,0,1,0,1,1,tt0016994,movie,In Borrowed Plumes,In Borrowed Plumes,0,1926,,60.0,"Comedy,Romance,Sport"
9,0,1,0,1,1,tt0018267,movie,Pleasure Before Business,Pleasure Before Business,0,1927,,55.0,"Comedy,Romance,Sport"


### Nulls

In [22]:
%%sparksql
-- Null-handling functions side by side; the expected result is noted per line
SELECT
    ifnull(null, 'value'),                            -- first argument is null      -> 'value'
    nullif('value', 'value'),                         -- both arguments are equal    -> null
    nvl(null, 'value'),                               -- first argument is null      -> 'value'
    nvl2('not_null', 'first_value', 'second_value'),  -- first argument is not null  -> 'first_value'
    nvl2(null, 'first_value', 'second_value')         -- first argument is null      -> 'second_value'

0,1,2,3,4
"ifnull(NULL, value)","nullif(value, value)","nvl(NULL, value)","nvl2(not_null, first_value, second_value)","nvl2(NULL, first_value, second_value)"
value,,value,first_value,second_value


In [23]:
%%sparksql
-- coalesce returns its first non-null argument: startYear, else endYear
SELECT primaryTitle,
       coalesce(startYear, endYear)
FROM titles
LIMIT 5

0,1
primaryTitle,"coalesce(startYear, endYear)"
Carmencita,1894
Le clown et ses chiens,1892
Pauvre Pierrot,1892
Un bon bock,1892
Blacksmith Scene,1893


In [24]:
# Small demo frame with nulls in different positions (columns are auto-named _1, _2, _3).
display(
    spark.createDataFrame([
        Row(None, None, None),
        Row(1, 2, None),
        Row(None, None, 3),
    ])
)

                                                                                

Unnamed: 0,_1,_2,_3
0,,,
1,1.0,2.0,
2,,,3.0


In [25]:
# Drop a row as soon as any of its values is null.
# Every demo row contains at least one null, so nothing is left.
display(
    spark.createDataFrame([
        Row(None, None, None),
        Row(1, 2, None),
        Row(None, None, 3),
    ]).dropna(how='any')
)

Unnamed: 0,_1,_2,_3


In [26]:
# Drop a row only when all of its values are null (removes just the first demo row).
display(
    spark.createDataFrame([
        Row(None, None, None),
        Row(1, 2, None),
        Row(None, None, 3),
    ]).dropna(how='all')
)

Unnamed: 0,_1,_2,_3
0,1.0,2.0,
1,,,3.0


In [27]:
# Drop a row when every column of the subset (_1 and _2) is null;
# the _3 column is ignored for this decision.
display(
    spark.createDataFrame([
        Row(None, None, None),
        Row(1, 2, None),
        Row(None, None, 3),
    ]).dropna(how='all', subset=['_1', '_2'])
)

Unnamed: 0,_1,_2,_3
0,1,2,


In [28]:
# Replace nulls with 0, but only in columns _1 and _2; nulls in _3 are kept.
display(
    spark.createDataFrame([
        Row(None, None, None),
        Row(1, 2, None),
        Row(None, None, 3),
    ]).fillna(0, subset=['_1', '_2'])
)

Unnamed: 0,_1,_2,_3
0,0,0,
1,1,2,
2,0,0,3.0


### Structs
A struct column groups several named fields into a single column, like a DataFrame nested within a DataFrame.

In [29]:
# Pack three columns into one struct column named `movie`, next to the title.
struct_df = title_basics_sdf.select(
    expr('primaryTitle'),
    expr('struct(runtimeMinutes, startYear, genres) as movie'),
)

display(struct_df)

Unnamed: 0,primaryTitle,movie
0,Carmencita,"(1, 1894, Documentary,Short)"
1,Le clown et ses chiens,"(5, 1892, Animation,Short)"
2,Pauvre Pierrot,"(4, 1892, Animation,Comedy,Romance)"
3,Un bon bock,"(12, 1892, Animation,Short)"
4,Blacksmith Scene,"(1, 1893, Comedy,Short)"
5,Chinese Opium Den,"(1, 1894, Short)"
6,Corbett and Courtney Before the Kinetograph,"(1, 1894, Short,Sport)"
7,Edison Kinetoscopic Record of a Sneeze,"(1, 1894, Documentary,Short)"
8,Miss Jerry,"(40, 1894, Romance,Short)"
9,Leaving the Factory,"(1, 1895, Documentary,Short)"


In [30]:
# Print the schema to see the fields nested inside the `movie` struct column.
struct_df.printSchema()

root
 |-- primaryTitle: string (nullable = true)
 |-- movie: struct (nullable = false)
 |    |-- runtimeMinutes: integer (nullable = true)
 |    |-- startYear: integer (nullable = true)
 |    |-- genres: string (nullable = true)



In [31]:
# A struct field is addressed with dot notation: <struct column>.<field name>.
display(
    struct_df.select(col('primaryTitle'), col('movie.runtimeMinutes'))
)

Unnamed: 0,primaryTitle,runtimeMinutes
0,Carmencita,1
1,Le clown et ses chiens,5
2,Pauvre Pierrot,4
3,Un bon bock,12
4,Blacksmith Scene,1
5,Chinese Opium Den,1
6,Corbett and Courtney Before the Kinetograph,1
7,Edison Kinetoscopic Record of a Sneeze,1
8,Miss Jerry,40
9,Leaving the Factory,1


### Arrays

In [32]:
# Split the comma-separated genres string into an array column.
array_df = title_basics_sdf.select(
    expr('primaryTitle'),
    expr('split(genres, ",") as genres_array'),
)

display(array_df)

Unnamed: 0,primaryTitle,genres_array
0,Carmencita,"[Documentary, Short]"
1,Le clown et ses chiens,"[Animation, Short]"
2,Pauvre Pierrot,"[Animation, Comedy, Romance]"
3,Un bon bock,"[Animation, Short]"
4,Blacksmith Scene,"[Comedy, Short]"
5,Chinese Opium Den,[Short]
6,Corbett and Courtney Before the Kinetograph,"[Short, Sport]"
7,Edison Kinetoscopic Record of a Sneeze,"[Documentary, Short]"
8,Miss Jerry,"[Romance, Short]"
9,Leaving the Factory,"[Documentary, Short]"


In [33]:
# Print the schema to see the element type of the `genres_array` column.
array_df.printSchema()

root
 |-- primaryTitle: string (nullable = true)
 |-- genres_array: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [34]:
# Array elements are addressed by a 0-based index: [0] is the first genre.
display(
    array_df.select(
        expr('primaryTitle'),
        expr('genres_array[0]'),
    )
)

Unnamed: 0,primaryTitle,genres_array[0]
0,Carmencita,Documentary
1,Le clown et ses chiens,Animation
2,Pauvre Pierrot,Animation
3,Un bon bock,Animation
4,Blacksmith Scene,Comedy
5,Chinese Opium Den,Short
6,Corbett and Courtney Before the Kinetograph,Short
7,Edison Kinetoscopic Record of a Sneeze,Documentary
8,Miss Jerry,Romance
9,Leaving the Factory,Documentary


In [35]:
# size() counts the array elements: keep titles that list three or more genres.
display(
    title_basics_sdf.filter('size(split(genres, ",")) >= 3')
)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4.0,"Animation,Comedy,Romance"
1,tt0000020,short,The Derby 1895,The Derby 1895,0,1895,,1.0,"Documentary,Short,Sport"
2,tt0000025,short,The Oxford and Cambridge University Boat Race,The Oxford and Cambridge University Boat Race,0,1895,,,"News,Short,Sport"
3,tt0000033,short,Horse Trick Riders,La voltige,0,1895,,1.0,"Comedy,Documentary,Short"
4,tt0000038,short,The Ball Game,The Ball Game,0,1898,,,"Documentary,Short,Sport"
5,tt0000041,short,Bataille de neige,Bataille de neige,0,1897,,1.0,"Comedy,Documentary,Short"
6,tt0000131,short,A Terrible Night,Une nuit terrible,0,1896,,1.0,"Comedy,Horror,Short"
7,tt0000138,short,The Bewitched Inn,L'auberge ensorcelée,0,1897,,2.0,"Comedy,Horror,Short"
8,tt0000147,short,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,20.0,"Documentary,News,Short"
9,tt0000152,short,The Hallucinated Alchemist,L'hallucination de l'alchimiste,0,1897,,2.0,"Fantasy,Horror,Short"


In [36]:
# array_contains() tests membership: keep titles whose genres array includes "Animation".
display(
    title_basics_sdf.filter('array_contains(split(genres, ","), "Animation")')
)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
1,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
2,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
3,tt0000015,short,Autour d'une cabine,Autour d'une cabine,0,1894,,2,"Animation,Short"
4,tt0000233,short,Choque de dos transatlánticos,Choque de dos transatlánticos,0,1899,,2,"Animation,Short"
5,tt0000251,short,Matches: An Appeal,Matches: An Appeal,0,1899,,1,"Animation,Short"
6,tt0000300,short,The Enchanted Drawing,The Enchanted Drawing,0,1900,,2,"Animation,Comedy,Fantasy"
7,tt0000516,short,The Electric Hotel,El hotel eléctrico,0,1908,,8,"Animation,Fantasy,Sci-Fi"
8,tt0000552,short,The Hand of the Artist,The Hand of the Artist,0,1907,,2,"Animation,Short"
9,tt0000554,short,Humorous Phases of Funny Faces,Humorous Phases of Funny Faces,0,1906,,3,"Animation,Comedy,Family"


In [37]:
# explode() emits one output row per array element, so each title is repeated
# once for every genre it has (the generated column is named `col`).
display(
    title_basics_sdf.select(
        expr('primaryTitle'),
        expr('explode(split(genres, ","))'),
    )
)

Unnamed: 0,primaryTitle,col
0,Carmencita,Documentary
1,Carmencita,Short
2,Le clown et ses chiens,Animation
3,Le clown et ses chiens,Short
4,Pauvre Pierrot,Animation
5,Pauvre Pierrot,Comedy
6,Pauvre Pierrot,Romance
7,Un bon bock,Animation
8,Un bon bock,Short
9,Blacksmith Scene,Comedy


In [38]:
# Uncomment to shut down the Spark session once you are finished with the notebook:
# spark.stop()