# Dataframes

In [1]:
import pyspark
from delta import *
import pandas as pd

# start spark
builder = (pyspark.sql.SparkSession.builder.appName("Spark-Course")
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder).getOrCreate()

# setting log-level to ERROR to decrease verbosity
# log4j log-levels are: OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE, ALL
spark.sparkContext.setLogLevel("ERROR")

spark



:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/siladitya/.ivy2/cache
The jars for the packages stored in: /home/siladitya/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d1a7c116-1377-4677-887f-0dbf42d8359b;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.2.1 in central
	found io.delta#delta-storage;1.2.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 600ms :: artifacts dl 13ms
	:: modules in use:
	io.delta#delta-core_2.12;1.2.1 from central in [default]
	io.delta#delta-storage;1.2.1 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnl

In [2]:
# so that Pandas df display all columns, default is 20
pd.set_option('display.max_columns', None)

In [3]:
raw_data_path = '/home/siladitya/Documents/Spark/Spark-Course/Data/imdb/'

In [4]:
title_basics_sdf = (spark
                    .read
                    .option('inferSchema', 'true')
                    .option('header', 'true')
                    .option('delimiter', '\t')
                    .csv(raw_data_path + 'title_basics.tsv'))

# pretty display
title_basics_sdf.limit(10).toPandas().head(10)

                                                                                

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,1,"Documentary,Short"
8,tt0000009,short,Miss Jerry,Miss Jerry,0,1894,\N,40,"Romance,Short"
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,\N,1,"Documentary,Short"


### Schema

In [5]:
title_basics_sdf.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [6]:
# a schema *is* a StructType 
title_basics_sdf.schema

StructType(List(StructField(tconst,StringType,true),StructField(titleType,StringType,true),StructField(primaryTitle,StringType,true),StructField(originalTitle,StringType,true),StructField(isAdult,StringType,true),StructField(startYear,StringType,true),StructField(endYear,StringType,true),StructField(runtimeMinutes,StringType,true),StructField(genres,StringType,true)))

In [7]:
# the list within the schema
title_basics_sdf.schema.fields

[StructField(tconst,StringType,true),
 StructField(titleType,StringType,true),
 StructField(primaryTitle,StringType,true),
 StructField(originalTitle,StringType,true),
 StructField(isAdult,StringType,true),
 StructField(startYear,StringType,true),
 StructField(endYear,StringType,true),
 StructField(runtimeMinutes,StringType,true),
 StructField(genres,StringType,true)]

In [8]:
# field names
title_basics_sdf.schema.names

['tconst',
 'titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

In [9]:
# same as above
title_basics_sdf.columns

['tconst',
 'titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

In [10]:
# JSON
title_basics_sdf.schema.jsonValue()

{'type': 'struct',
 'fields': [{'name': 'tconst',
   'type': 'string',
   'nullable': True,
   'metadata': {}},
  {'name': 'titleType', 'type': 'string', 'nullable': True, 'metadata': {}},
  {'name': 'primaryTitle', 'type': 'string', 'nullable': True, 'metadata': {}},
  {'name': 'originalTitle',
   'type': 'string',
   'nullable': True,
   'metadata': {}},
  {'name': 'isAdult', 'type': 'string', 'nullable': True, 'metadata': {}},
  {'name': 'startYear', 'type': 'string', 'nullable': True, 'metadata': {}},
  {'name': 'endYear', 'type': 'string', 'nullable': True, 'metadata': {}},
  {'name': 'runtimeMinutes',
   'type': 'string',
   'nullable': True,
   'metadata': {}},
  {'name': 'genres', 'type': 'string', 'nullable': True, 'metadata': {}}]}

In [11]:
# read data using manual schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

title_basics_schema = StructType([
                            StructField('tconst', StringType(), False),
                            StructField('titleType', StringType(), True),
                            StructField('primaryTitle', StringType(), True),
                            StructField('originalTitle', StringType(), True),
                            StructField('isAdult', IntegerType(), True),
                            StructField('startYear', IntegerType(), True),
                            StructField('endYear', IntegerType(), True),
                            StructField('runtimeMinutes', IntegerType(), True),
                            StructField('genres', StringType(), True)])

title_basics_sdf = (spark
                    .read
                    .schema(title_basics_schema)
                    .option('header', 'true')
                    .option('delimiter', '\t')
                    .csv(raw_data_path + 'title_basics.tsv'))

title_basics_sdf.limit(10).toPandas().head(10)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,,1,"Documentary,Short"
8,tt0000009,short,Miss Jerry,Miss Jerry,0,1894,,40,"Romance,Short"
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,,1,"Documentary,Short"


In [12]:
title_basics_sdf.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: integer (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- endYear: integer (nullable = true)
 |-- runtimeMinutes: integer (nullable = true)
 |-- genres: string (nullable = true)



### Rows

In [13]:
# create a Row object
from pyspark.sql import Row
my_movie = Row('tt0000000',	'movie', 'Hometown Hero', 'Hometown Hero',	0,	2010, None,	90,	'Documentary')

# can be accessed like a list
*my_movie[3::2], my_movie[-1]

('Hometown Hero', 2010, 90, 'Documentary')

In [14]:
type(my_movie)

pyspark.sql.types.Row

In [15]:
# create a DataFrame object from a list of Rows
my_movie_sdf = spark.createDataFrame([my_movie], title_basics_sdf.schema)
my_movie_sdf.show()

[Stage 4:>                                                          (0 + 1) / 1]

+---------+---------+-------------+-------------+-------+---------+-------+--------------+-----------+
|   tconst|titleType| primaryTitle|originalTitle|isAdult|startYear|endYear|runtimeMinutes|     genres|
+---------+---------+-------------+-------------+-------+---------+-------+--------------+-----------+
|tt0000000|    movie|Hometown Hero|Hometown Hero|      0|     2010|   null|            90|Documentary|
+---------+---------+-------------+-------------+-------+---------+-------+--------------+-----------+



                                                                                

##### Append Rows

In [16]:
# create new row
my_movie_2 = Row('tt000000',	'movie', 'Hometown Hero: the Beginning', 'Hometown Hero 0',	0,	2012, None,	110,	'Documentary,Romance')

parallelized_my_movies = spark.sparkContext.parallelize([my_movie, my_movie_2])

parallelized_my_movies

ParallelCollectionRDD[23] at readRDDFromFile at PythonRDD.scala:274

In [17]:
my_movies_sdf = spark.createDataFrame(parallelized_my_movies, schema=title_basics_sdf.schema)
my_movies_sdf.show()

+---------+---------+--------------------+---------------+-------+---------+-------+--------------+-------------------+
|   tconst|titleType|        primaryTitle|  originalTitle|isAdult|startYear|endYear|runtimeMinutes|             genres|
+---------+---------+--------------------+---------------+-------+---------+-------+--------------+-------------------+
|tt0000000|    movie|       Hometown Hero|  Hometown Hero|      0|     2010|   null|            90|        Documentary|
| tt000000|    movie|Hometown Hero: th...|Hometown Hero 0|      0|     2012|   null|           110|Documentary,Romance|
+---------+---------+--------------------+---------------+-------+---------+-------+--------------+-------------------+



In [18]:
(my_movies_sdf
    .union(title_basics_sdf)
    .show(5))

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000000|    movie|       Hometown Hero|       Hometown Hero|      0|     2010|   null|            90|         Documentary|
| tt000000|    movie|Hometown Hero: th...|     Hometown Hero 0|      0|     2012|   null|           110| Documentary,Romance|
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Come

##### Sort Rows

In [19]:
from pyspark.sql.functions import col, expr

(title_basics_sdf
    .orderBy(expr('endYear desc'))
    .show(5))



+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   null|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|   null|             1|        Comedy

                                                                                

In [20]:
(title_basics_sdf
    .orderBy(col('endYear').desc_nulls_last())
    .show(5))



+----------+------------+--------------------+--------------------+-------+---------+-------+--------------+---------------+
|    tconst|   titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|         genres|
+----------+------------+--------------------+--------------------+-------+---------+-------+--------------+---------------+
|tt14127838|tvMiniSeries|             Journey|             Journey|      0|     2021|   2028|            55|   Comedy,Drama|
| tt7358768|    tvSeries|               The Y|               The Y|      0|     2017|   2027|             6|Sci-Fi,Thriller|
|tt10585548|    tvSeries|  Edjucatin' Charlie|  Edjucatin' Charlie|      0|     2020|   2026|            30|         Comedy|
|tt15680592|tvMiniSeries|                Kush|                Kush|      0|     2021|   2025|            20|          Crime|
| tt4681850|    tvSeries|Dreamers and Diss...|Dreamers and Diss...|      0|     2015|   2025|          null|    Documentary|


                                                                                

In [21]:
# optimization: sort within partitions before another set of transformations
(title_basics_sdf
    .sortWithinPartitions('startYear'))

DataFrame[tconst: string, titleType: string, primaryTitle: string, originalTitle: string, isAdult: int, startYear: int, endYear: int, runtimeMinutes: int, genres: string]

##### Replace

In [24]:
(spark.createDataFrame([Row(None, None, None),
                        Row(1, 2, None),
                        Row(None, None, 3)])
    .replace(1.0, 10.0)
    .show())

+----+----+----+
|  _1|  _2|  _3|
+----+----+----+
|null|null|null|
|  10|   2|null|
|null|null|   3|
+----+----+----+



### Collect Rows

In [25]:
collect_df = title_basics_sdf.limit(10)
collect_df.take(5)

[Row(tconst='tt0000001', titleType='short', primaryTitle='Carmencita', originalTitle='Carmencita', isAdult=0, startYear=1894, endYear=None, runtimeMinutes=1, genres='Documentary,Short'),
 Row(tconst='tt0000002', titleType='short', primaryTitle='Le clown et ses chiens', originalTitle='Le clown et ses chiens', isAdult=0, startYear=1892, endYear=None, runtimeMinutes=5, genres='Animation,Short'),
 Row(tconst='tt0000003', titleType='short', primaryTitle='Pauvre Pierrot', originalTitle='Pauvre Pierrot', isAdult=0, startYear=1892, endYear=None, runtimeMinutes=4, genres='Animation,Comedy,Romance'),
 Row(tconst='tt0000004', titleType='short', primaryTitle='Un bon bock', originalTitle='Un bon bock', isAdult=0, startYear=1892, endYear=None, runtimeMinutes=12, genres='Animation,Short'),
 Row(tconst='tt0000005', titleType='short', primaryTitle='Blacksmith Scene', originalTitle='Blacksmith Scene', isAdult=0, startYear=1893, endYear=None, runtimeMinutes=1, genres='Comedy,Short')]

In [26]:
collect_df.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   null|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|   null|             1|        Comedy

In [27]:
collect_df.show(5, False)

+---------+---------+----------------------+----------------------+-------+---------+-------+--------------+------------------------+
|tconst   |titleType|primaryTitle          |originalTitle         |isAdult|startYear|endYear|runtimeMinutes|genres                  |
+---------+---------+----------------------+----------------------+-------+---------+-------+--------------+------------------------+
|tt0000001|short    |Carmencita            |Carmencita            |0      |1894     |null   |1             |Documentary,Short       |
|tt0000002|short    |Le clown et ses chiens|Le clown et ses chiens|0      |1892     |null   |5             |Animation,Short         |
|tt0000003|short    |Pauvre Pierrot        |Pauvre Pierrot        |0      |1892     |null   |4             |Animation,Comedy,Romance|
|tt0000004|short    |Un bon bock           |Un bon bock           |0      |1892     |null   |12            |Animation,Short         |
|tt0000005|short    |Blacksmith Scene      |Blacksmith Scene  

In [28]:
collect_df.collect()

[Row(tconst='tt0000001', titleType='short', primaryTitle='Carmencita', originalTitle='Carmencita', isAdult=0, startYear=1894, endYear=None, runtimeMinutes=1, genres='Documentary,Short'),
 Row(tconst='tt0000002', titleType='short', primaryTitle='Le clown et ses chiens', originalTitle='Le clown et ses chiens', isAdult=0, startYear=1892, endYear=None, runtimeMinutes=5, genres='Animation,Short'),
 Row(tconst='tt0000003', titleType='short', primaryTitle='Pauvre Pierrot', originalTitle='Pauvre Pierrot', isAdult=0, startYear=1892, endYear=None, runtimeMinutes=4, genres='Animation,Comedy,Romance'),
 Row(tconst='tt0000004', titleType='short', primaryTitle='Un bon bock', originalTitle='Un bon bock', isAdult=0, startYear=1892, endYear=None, runtimeMinutes=12, genres='Animation,Short'),
 Row(tconst='tt0000005', titleType='short', primaryTitle='Blacksmith Scene', originalTitle='Blacksmith Scene', isAdult=0, startYear=1893, endYear=None, runtimeMinutes=1, genres='Comedy,Short'),
 Row(tconst='tt00000

### Columns

In [29]:
# access a Column
title_basics_sdf.select('primaryTitle').show(5)

+--------------------+
|        primaryTitle|
+--------------------+
|          Carmencita|
|Le clown et ses c...|
|      Pauvre Pierrot|
|         Un bon bock|
|    Blacksmith Scene|
+--------------------+
only showing top 5 rows



In [30]:
from pyspark.sql.functions import col, expr

title_basics_sdf.select(col('runtimeMinutes')*60).show(5)

+---------------------+
|(runtimeMinutes * 60)|
+---------------------+
|                   60|
|                  300|
|                  240|
|                  720|
|                   60|
+---------------------+
only showing top 5 rows



In [31]:
type(col('primaryTitle'))

pyspark.sql.column.Column

In [32]:
# expr is like a SELECT statement
title_basics_sdf.select(expr('runtimeMinutes*60 as runtimeSeconds')).show(5)

+--------------+
|runtimeSeconds|
+--------------+
|            60|
|           300|
|           240|
|           720|
|            60|
+--------------+
only showing top 5 rows



In [33]:
# so that we can use the SQL API
title_basics_sdf.createOrReplaceTempView('titles')

In [34]:
%load_ext sparksql_magic

In [35]:
%%sparksql
select runtimeMinutes*60 as runtimeSeconds from titles limit 5

0
runtimeSeconds
60
300
240
720
60


In [36]:
# selectExpr() = select(expr())
# add a column 
(title_basics_sdf.selectExpr('*', # all cols
                            'runtimeMinutes*60 as runtimeSeconds')
                .show(5))

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+--------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|runtimeSeconds|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+--------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|            60|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|           300|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Comedy,...|           240|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   null|            12|     Animation,Short|           720|
|tt0000005|  

In [37]:
# passing literals (and not columns)
from pyspark.sql.functions import lit

title_basics_sdf.select(expr('*'), lit('IMDB').alias('database')).show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+--------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|database|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+--------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|    IMDB|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|    IMDB|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Comedy,...|    IMDB|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   null|            12|     Animation,Short|    IMDB|
|tt0000005|    short|    Blacksmith Scene|    Blacksmit

In [38]:
title_basics_sdf.select(col('primaryTitle'), col('runtimeMinutes') >= lit(5)).show(5)

+--------------------+---------------------+
|        primaryTitle|(runtimeMinutes >= 5)|
+--------------------+---------------------+
|          Carmencita|                false|
|Le clown et ses c...|                 true|
|      Pauvre Pierrot|                false|
|         Un bon bock|                 true|
|    Blacksmith Scene|                false|
+--------------------+---------------------+
only showing top 5 rows



In [39]:
# add a column
title_basics_sdf.withColumn('runtimeSeconds', expr('runtimeMinutes*60')).show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+--------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|runtimeSeconds|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+--------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|            60|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|           300|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Comedy,...|           240|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   null|            12|     Animation,Short|           720|
|tt0000005|  

In [40]:
# rename a column
title_basics_sdf.withColumnRenamed('primaryTitle', 'title').show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|               title|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   null|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|   null|             1|        Comedy

In [41]:
#drop a column
title_basics_sdf.drop('primaryTitle').show(5)

+---------+---------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|      0|     1892|   null|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|      0|     1893|   null|             1|        Comedy,Short|
+---------+---------+--------------------+-------+---------+-------+--------------+--------------------+
only showing top 5 rows



In [42]:
# change a column's type
title_basics_sdf.withColumn('startYear_str', col('startYear').cast('string')).printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: integer (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- endYear: integer (nullable = true)
 |-- runtimeMinutes: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- startYear_str: string (nullable = true)



In [43]:
%%sparksql
-- equivalent to this
select *, cast(startYear as string) from titles limit 5


0,1,2,3,4,5,6,7,8,9
tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,startYear
tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short",1894
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short",1892
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance",1892
tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short",1892
tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short",1893


In [44]:
# filter rows, like WHERE clause
title_basics_sdf.where(col('isAdult') == 1).show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|     genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------+
|tt0060313|    movie|      Dingle, Dangle|      Dingle, Dangle|      1|     1966|   null|            61|     Comedy|
|tt0061926|    movie|        Lust Weekend|        Lust Weekend|      1|     1967|   null|            70|      Drama|
|tt0062361|    movie|           Thigh Spy|           Thigh Spy|      1|     1967|   null|            70|      Drama|
|tt0062417|    movie|Un épais manteau ...|Un épais manteau ...|      1|     1968|   null|            88|      Drama|
|tt0062727|    short|    Of Special Merit|  Besonders wertvoll|      1|     1968|   null|            11|Adult,Short|
+---------+---------+--------------------+--------------------+-

In [46]:
# unique/distinct
title_basics_sdf.select('titleType').distinct().show()

[Stage 44:>                                                         (0 + 8) / 8]

+------------+
|   titleType|
+------------+
|    tvSeries|
|tvMiniSeries|
|     tvMovie|
|   tvEpisode|
|       movie|
|   tvSpecial|
|       video|
|   videoGame|
|     tvShort|
|       short|
|     tvPilot|
+------------+



                                                                                

### Random Sampling

In [47]:
# random samples
seed = 100
withReplacement = False
fraction = 0.5
title_basics_sdf.sample(withReplacement, fraction, seed).count()

                                                                                

4440200

In [48]:
# random splits, useful in train-test splits
split_df_list = title_basics_sdf.limit(10000).randomSplit([0.25, 0.75], seed)
split_df_list[0].count(), split_df_list[1].count()

                                                                                

(2537, 7463)

### Repartition and Coalesce

In [49]:
# make 5 partitions based on the column titleType
# incurs a full shuffle (wide transform)
title_basics_repartitioned_sdf = title_basics_sdf.repartition(5, col('titleType'))

In [50]:
# `join` up, or coalesce, partitions
# narrow transform
title_basics_repartitioned_sdf.coalesce(2)

DataFrame[tconst: string, titleType: string, primaryTitle: string, originalTitle: string, isAdult: int, startYear: int, endYear: int, runtimeMinutes: int, genres: string]

In [51]:
# spark.stop()