# Create Spark session

In [1]:
import findspark
findspark.init()

# Create a spark-session (akin to what pyspark provides when it is started)
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark

# Checking Spark session options

In [2]:
sc = spark.sparkContext
sc.getConf().getAll()

[('spark.history.kerberos.keytab', 'none'),
 ('spark.ui.proxyBase', '/proxy/application_1596825752622_0002'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.driver.port', '40435'),
 ('spark.driver.appUIAddress', 'http://slalomdsvm:4040'),
 ('spark.history.ui.port', '18081'),
 ('spark.driver.memory', '512M'),
 ('spark.driver.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.history.fs.cleaner.interval', '7d'),
 ('spark.shuffle.io.serverThreads', '128'),
 ('spark.yarn.historyServer.address', 'slalomdsvm:18081'),
 ('spark.sql.streaming.streamingQueryListeners', ''),
 ('spark.executor.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.sql.statistics.fallBackToHdfs', 'true'),
 ('spark.executorEnv.PYTHONPATH',
  '{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.7-src.zip'),
 ('spark.shuffle.file.buffer', '1m'),
 ('spark.app.id', 'applica

# Imports

In [3]:
import pandas as pd

%matplotlib inline

# Create and see dataframes

In [4]:
col_names = ['id', 'first_name', 'last_name']
rows = [
    (1, 'John', 'Doe'),
    (1, 'John', 'Doe'), 
    (1, 'John', None), 
    (2, 'Jane', 'Doe'),
    (3, 'Herbie', 'Hancock'),
    (4, 'Erin', 'brockovich'),        
]

df1 = spark.createDataFrame(rows, col_names)

In [5]:
col_names = ['id', 'number_sox']
rows = [
    (1, 24),
    (2, 30),
    (3, 29),
    (4, 40),        
]

df2 = spark.createDataFrame(rows, col_names)

# See the dataframes

In [6]:
df1.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)



In [7]:
df2.printSchema()

root
 |-- id: long (nullable = true)
 |-- number_sox: long (nullable = true)



Show the DF:

  * Asking for `df1` only shows the reference to the object. This is beacuse of Spark's lazy evaluation: the dataframe has not been created yet (`createDataFrame()` is a transformation).
  * The dataframe will only be created when we apply an action to it like `show()`.
  * Note the difference in duration between the 2 cells.

In [8]:
df1

DataFrame[id: bigint, first_name: string, last_name: string]

In [9]:
df1.show()

+---+----------+----------+
| id|first_name| last_name|
+---+----------+----------+
|  1|      John|       Doe|
|  1|      John|       Doe|
|  1|      John|      null|
|  2|      Jane|       Doe|
|  3|    Herbie|   Hancock|
|  4|      Erin|brockovich|
+---+----------+----------+



In [10]:
df2

DataFrame[id: bigint, number_sox: bigint]

In [11]:
df2.show()

+---+----------+
| id|number_sox|
+---+----------+
|  1|        24|
|  2|        30|
|  3|        29|
|  4|        40|
+---+----------+



# Joining dataframes

In [12]:
df = df1.join(df2, 'id', how = 'inner')

df.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  1|      John|       Doe|        24|
|  1|      John|      null|        24|
|  3|    Herbie|   Hancock|        29|
|  2|      Jane|       Doe|        30|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



# Keeping distinct values

In [13]:
df = df.distinct() \

df.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  1|      John|      null|        24|
|  3|    Herbie|   Hancock|        29|
|  2|      Jane|       Doe|        30|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



# Removing rows containing `null` values

In [14]:
df = df.dropna()

df.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  3|    Herbie|   Hancock|        29|
|  2|      Jane|       Doe|        30|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



# Selecting columns

Get a column object with:

In [15]:
df.id

Column<b'id'>

In [16]:
selected_df = df.select('id', 'first_name')

selected_df.show()

+---+----------+
| id|first_name|
+---+----------+
|  1|      John|
|  3|    Herbie|
|  2|      Jane|
|  4|      Erin|
+---+----------+



# Filtering rows

In [17]:
filtered_df = df.filter((df.number_sox <= 25) | (df.number_sox >= 35))

filtered_df.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



# Randomizing and ordering rows

In [18]:
from pyspark.sql.functions import rand

randomized_df = df.orderBy(rand())
randomized_df.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  2|      Jane|       Doe|        30|
|  3|    Herbie|   Hancock|        29|
|  4|      Erin|brockovich|        40|
|  1|      John|       Doe|        24|
+---+----------+----------+----------+



In [19]:
randomized_df.orderBy('id').show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  2|      Jane|       Doe|        30|
|  3|    Herbie|   Hancock|        29|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



In [20]:
randomized_df \
.orderBy('last_name', 'number_sox') \
.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  1|      John|       Doe|        24|
|  2|      Jane|       Doe|        30|
|  3|    Herbie|   Hancock|        29|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



In [21]:
from pyspark.sql.functions import desc

randomized_df \
.orderBy('last_name', desc('number_sox')) \
.show()

+---+----------+----------+----------+
| id|first_name| last_name|number_sox|
+---+----------+----------+----------+
|  2|      Jane|       Doe|        30|
|  1|      John|       Doe|        24|
|  3|    Herbie|   Hancock|        29|
|  4|      Erin|brockovich|        40|
+---+----------+----------+----------+



# User Defined Functions (UDFs) and Row-wise operations ("Lamda" functions)

UDFs can be defined to be re-used during row-wise operations (ex: Lambda functions in Pandas) on a dataframe.

In [22]:
# Define your function

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType


# Define the UDF using a "classic" function
def sox_pairs(x):
    return round(float(x)/2.0)
    
sox_pairs_udf = udf(lambda x: sox_pairs(x), IntegerType())

# OR 
# Define the UDF using a lambda function
sox_pairs_udf_l = udf(lambda x: round(float(x)/2.0), IntegerType())

In [23]:
# Apply the UDFs to the DF

df \
.withColumn("number_sox_pair", sox_pairs_udf("number_sox")) \
.show()

+---+----------+----------+----------+---------------+
| id|first_name| last_name|number_sox|number_sox_pair|
+---+----------+----------+----------+---------------+
|  1|      John|       Doe|        24|             12|
|  3|    Herbie|   Hancock|        29|             14|
|  2|      Jane|       Doe|        30|             15|
|  4|      Erin|brockovich|        40|             20|
+---+----------+----------+----------+---------------+



In [24]:
df_with_pairs = df \
.withColumn("number_sox_pair", sox_pairs_udf_l("number_sox"))

df_with_pairs.show()

+---+----------+----------+----------+---------------+
| id|first_name| last_name|number_sox|number_sox_pair|
+---+----------+----------+----------+---------------+
|  1|      John|       Doe|        24|             12|
|  3|    Herbie|   Hancock|        29|             14|
|  2|      Jane|       Doe|        30|             15|
|  4|      Erin|brockovich|        40|             20|
+---+----------+----------+----------+---------------+



# Aggregation

Note: The concept of UDF can be taken one step futher to be applicable to aggregation situation, these UDFs are called UDAFs (User Defined Aggregated Functions). Ex: if you want to apply an advanced algorithm to a dataframe in an aggregated fashion: you would need to create a UDAF which implements the algorithm but is also taking care of grouping. This is an expert topic and is not covered in this training.

In [25]:
from pyspark.sql.functions import sum

counts_df = df  \
.groupBy('last_name') \
.agg(sum('number_sox') \
.alias('number_sox_per_family'))

counts_df.show()

+----------+---------------------+
| last_name|number_sox_per_family|
+----------+---------------------+
|   Hancock|                   29|
|       Doe|                   54|
|brockovich|                   40|
+----------+---------------------+



# Exploding a "list" column into rows

If a column contains a list, it can be expanded into multiple rows, one row per item in the list.

In [26]:
col_names = ['id', 'list']
rows = [
    (1, ['A', 'B']),
    (2, ['C']),
    (3, ['D', 'D']),
    (4, ['E', 'F']),        
]

list_df = spark.createDataFrame(rows, col_names)
list_df.show()

+---+------+
| id|  list|
+---+------+
|  1|[A, B]|
|  2|   [C]|
|  3|[D, D]|
|  4|[E, F]|
+---+------+



In [27]:
from pyspark.sql.functions import explode

exploded_df = list_df.select('id', explode('list').alias('item'))
exploded_df.show()

+---+----+
| id|item|
+---+----+
|  1|   A|
|  1|   B|
|  2|   C|
|  3|   D|
|  3|   D|
|  4|   E|
|  4|   F|
+---+----+



# Collapsing multiple rows into a "list" column

Conversly, multiple rows can be collapsed into a list or a set with one row per list or set. Order is conserved for  lists, sets do have the concept of order so the original ordering information from the rows will be lost.

In [28]:
from pyspark.sql.functions import collect_list

exploded_df \
.groupBy('id') \
.agg(collect_list('item') \
.alias('list')) \
.show()

+---+------+
| id|  list|
+---+------+
|  1|[A, B]|
|  3|[D, D]|
|  2|   [C]|
|  4|[E, F]|
+---+------+



In [29]:
from pyspark.sql.functions import collect_set

exploded_df \
.groupBy('id') \
.agg(collect_set('item') \
.alias('set')) \
.show()

+---+------+
| id|   set|
+---+------+
|  1|[B, A]|
|  3|   [D]|
|  2|   [C]|
|  4|[F, E]|
+---+------+



# Loading data from a CSV file on HDFS into a Spark dataframe

In [30]:
csv_path = "/user/vagrant/data/earth-surface-temperature/csv/GlobalLandTemperaturesByMajorCity.csv"

temperature_df = spark \
.read \
.option("header", "true") \
.option("inferschema", "true") \
.option("mode", "DROPMALFORMED") \
.csv(csv_path)

temperature_df.show()

+-------------------+------------------+-----------------------------+-------+-------------+--------+---------+
|                 dt|AverageTemperature|AverageTemperatureUncertainty|   City|      Country|Latitude|Longitude|
+-------------------+------------------+-----------------------------+-------+-------------+--------+---------+
|1849-01-01 00:00:00|            26.704|                        1.435|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-02-01 00:00:00|            27.434|                        1.362|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-03-01 00:00:00|            28.101|                        1.612|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-04-01 00:00:00|             26.14|           1.3869999999999998|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-05-01 00:00:00|            25.427|                          1.2|Abidjan|Côte D'Ivoire|   5.63N|    3.23W|
|1849-06-01 00:00:00|            24.844|                        1.402|Abidjan|Côte D'Ivoire|   5.63N|   

# Sampling rows

In [31]:
temperature_df.count()

239177

In [32]:
sampled_df = temperature_df.sample(fraction = 0.1, seed = 1234)
sampled_df.count()

23828

# Writing a Spark dataframe to a parquet file on HDFS

Parquet is a great format to persist tabular data. It performs especially well for dataframes which have columns with values repeating on contigous rows.

In [33]:
parquet_path = "/user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity"

In [34]:
# Default write, will write to multiple files (due to number reducers in Spark's internal architecture)
temperature_df \
.write \
.parquet(parquet_path)

In [35]:
! hdfs dfs -ls -R -h /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet

drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan
-rw-r--r--   1 vagrant hdfs     21.0 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan/part-00000-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     15.2 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan/part-00001-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Angola
-rw-r--r--   1 vagrant hdfs      4.6 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Angola/part-00001-b7b0

-rw-r--r--   1 vagrant hdfs      9.3 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Chile/part-00027-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     19.8 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Chile/part-00028-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs      4.8 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Chile/part-00029-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=China
-rw-r--r--   1 vagrant hdfs     18.1 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMaj

-rw-r--r--   1 vagrant hdfs     10.7 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Morocco/part-00128-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     20.1 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Morocco/part-00129-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     19.0 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Morocco/part-00130-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs      1.7 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Morocco/part-00131-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 

drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Vietnam
-rw-r--r--   1 vagrant hdfs      5.7 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Vietnam/part-00196-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     20.2 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Vietnam/part-00197-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     10.6 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Vietnam/part-00198-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTempe

To control the number of output files:

In [36]:
n_files = 1

temperature_df \
.coalesce(n_files) \
.write \
.mode('overwrite') \
.parquet(parquet_path)

In [37]:
! hdfs dfs -ls -R -h /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet

drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan
-rw-r--r--   1 vagrant hdfs     21.0 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan/part-00000-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     15.2 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan/part-00001-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Angola
-rw-r--r--   1 vagrant hdfs      4.6 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Angola/part-00001-b7b0d6d7

-rw-r--r--   1 vagrant hdfs     18.9 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=China/part-00037-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     20.4 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=China/part-00038-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     19.6 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=China/part-00039-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     20.9 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=China/part-00040-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     18.6 K 2020-08-07 15:29 /user/va

-rw-r--r--   1 vagrant hdfs     11.2 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Ethiopia/part-00074-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     19.1 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Ethiopia/part-00075-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=France
-rw-r--r--   1 vagrant hdfs      2.0 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=France/part-00075-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     19.8 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTempe

-rw-r--r--   1 vagrant hdfs     12.5 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Kenya/part-00125-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     17.0 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Kenya/part-00126-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Mexico
-rw-r--r--   1 vagrant hdfs      4.6 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Mexico/part-00126-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     20.2 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperature

-rw-r--r--   1 vagrant hdfs      7.9 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Somalia/part-00157-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     19.6 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Somalia/part-00158-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs      4.4 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Somalia/part-00159-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=South Africa
-rw-r--r--   1 vagrant hdfs     16.7 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLand

-rw-r--r--   1 vagrant hdfs     18.3 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Turkey/part-00183-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs      1.8 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Turkey/part-00184-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Ukraine
-rw-r--r--   1 vagrant hdfs     17.7 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Ukraine/part-00184-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     20.5 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTempera

Parquet also provides a very convenient partitioning functionality, the data for each single value in the partition will be under its own directory on HDFS.
In-file indexing  is comping up but requires installation of additional libraries and is not ubiquitous yet.

In [38]:
temperature_df \
.orderBy('Country', 'City', 'dt') \
.write \
.partitionBy('Country') \
.mode('overwrite') \
.parquet(parquet_path)

In [39]:
! hdfs dfs -ls -R -h /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet

drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan
-rw-r--r--   1 vagrant hdfs     21.0 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan/part-00000-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     15.2 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Afghanistan/part-00001-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Angola
-rw-r--r--   1 vagrant hdfs      4.6 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Angola/part-00001-b7b0d6d7

-rw-r--r--   1 vagrant hdfs      7.3 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Germany/part-00078-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     22.1 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Germany/part-00079-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     19.5 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Germany/part-00080-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs      5.9 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Germany/part-00081-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs          0 2020-08-07 15:29 

-rw-r--r--   1 vagrant hdfs     16.7 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=South Africa/part-00159-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     20.9 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=South Africa/part-00160-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     19.9 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=South Africa/part-00161-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs      5.7 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=South Africa/part-00162-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
drwxr-xr-x   - vagrant hdfs         

-rw-r--r--   1 vagrant hdfs     10.8 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Zimbabwe/part-00198-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs     18.8 K 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/Country=Zimbabwe/part-00199-b7b0d6d7-4c86-4a2c-9d7d-fa84bc0f57c6.c000.snappy.parquet
-rw-r--r--   1 vagrant hdfs          0 2020-08-07 15:29 /user/vagrant/data/earth-surface-temperature/parquet/GlobalLandTemperaturesByMajorCity.parquet/_SUCCESS


Notes: In general, we want to create files with a size of a few times the HDFS block-size (default: 128MB). We want a few large files (YES: ≈100+MB -> ≈1GB), not many small files (NO: ≈1KB -> ≈10MB). 

# Collecting a Spark dataframe into a "regular" pandas dataframe

In [40]:
# Collect as a list of pyspark.sql.Rows
collected = df.collect()
collected

[Row(id=1, first_name='John', last_name='Doe', number_sox=24),
 Row(id=3, first_name='Herbie', last_name='Hancock', number_sox=29),
 Row(id=2, first_name='Jane', last_name='Doe', number_sox=30),
 Row(id=4, first_name='Erin', last_name='brockovich', number_sox=40)]

In [41]:
# Collect into a "classic" pandas dataframe
pandas_df = df.toPandas()
pandas_df

Unnamed: 0,id,first_name,last_name,number_sox
0,1,John,Doe,24
1,3,Herbie,Hancock,29
2,2,Jane,Doe,30
3,4,Erin,brockovich,40


# Closing the spak session

Close the `spark` session to destroy the release cluster CPU/RAM resources (completes YARN application and destroys executors, driver JVM).

In [42]:
spark.stop()