# DataFrame

## Spark Session

In [152]:
from pyspark.sql import SparkSession

# Configure a local-mode session named "Spark session", then create it
# (or reuse one that already exists).
session_builder = SparkSession.builder.master("local").appName("Spark session")
spark = session_builder.getOrCreate()

## Read Data Frame

In [153]:
# Load the parquet file into a DataFrame and preview the first rows.
parquet_path = 'data/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet'
df = spark.read.parquet(parquet_path)

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [10]:
# take(5) runs the query and returns the first 5 rows as a list of Row objects
origin_names = df.select('ORIGIN_COUNTRY_NAME')
origin_names.take(5)

[Row(ORIGIN_COUNTRY_NAME='Romania'),
 Row(ORIGIN_COUNTRY_NAME='Ireland'),
 Row(ORIGIN_COUNTRY_NAME='India'),
 Row(ORIGIN_COUNTRY_NAME='United States'),
 Row(ORIGIN_COUNTRY_NAME='United States')]

In [11]:
# collect() returns every row of the projection as a list of Row objects.
# NOTE(review): presumably only sensible for small DataFrames — confirm size first.
destination_names = df.select('DEST_COUNTRY_NAME')
destination_names.collect()

[Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='Egypt'),
 Row(DEST_COUNTRY_NAME='Equatorial Guinea'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='Costa Rica'),
 Row(DEST_COUNTRY_NAME='Senegal'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='Guyana'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='Malta'),
 Row(DEST_COUNTRY_NAME='Bolivia'),
 Row(DEST_COUNTRY_NAME='Anguilla'),
 Row(DEST_COUNTRY_NAME='Turks and Caicos Islands'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='Saint Vincent and the Grenadines'),
 Row(DEST_COUNTRY_NAME='Italy'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='Pakistan'),
 Row(DEST_COUNTRY_NAME='United States'),
 Row(DEST_COUNTRY_NAME='Iceland'),
 Row(DEST_COUNTRY_NAME='Marshall Islands'),


## Foreach
http://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.foreach

Applies the function `f` to every Row of this DataFrame.

In [49]:
def f(x): 
    """Print a single Row; used as the callback for DataFrame.foreach."""
    print(x)

# foreach calls f once per Row of df.
# NOTE(review): no output appears under this cell — f presumably runs inside the
# Spark worker process, so its prints go to the worker's stdout; confirm.
df.foreach(f) 

## Distinct Set

In [54]:
# Count the unique values in DEST_COUNTRY_NAME
distinct_destinations = df.select('DEST_COUNTRY_NAME').distinct()
distinct_destinations.count()

125

## Filter

In [62]:
from pyspark.sql.functions import col, desc

# Keep rows whose count is greater than 1, largest counts first
more_than_one = df['count'] > 1
(df
 .filter(more_than_one)
 .sort(desc('count'))
 .show())

+------------------+-------------------+------+
| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+------------------+-------------------+------+
|     United States|      United States|348113|
|     United States|             Canada|  8305|
|            Canada|      United States|  8271|
|     United States|             Mexico|  6220|
|            Mexico|      United States|  6200|
|    United Kingdom|      United States|  1629|
|     United States|     United Kingdom|  1503|
|     United States|            Germany|  1406|
|           Germany|      United States|  1392|
|             Japan|      United States|  1383|
|     United States|              Japan|  1307|
|     United States| Dominican Republic|  1150|
|Dominican Republic|      United States|  1109|
|            Brazil|      United States|   995|
|     United States|        The Bahamas|   959|
|       The Bahamas|      United States|   903|
|     United States|           Colombia|   832|
|          Colombia|      United States|

In [63]:
# Rows with count > 1 whose origin is the United States, largest counts first
more_than_one = df['count'] > 1
from_united_states = df['ORIGIN_COUNTRY_NAME'] == 'United States'
(df
 .filter(more_than_one & from_united_states)
 .sort(desc('count'))
 .show())

+------------------+-------------------+------+
| DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+------------------+-------------------+------+
|     United States|      United States|348113|
|            Canada|      United States|  8271|
|            Mexico|      United States|  6200|
|    United Kingdom|      United States|  1629|
|           Germany|      United States|  1392|
|             Japan|      United States|  1383|
|Dominican Republic|      United States|  1109|
|            Brazil|      United States|   995|
|       The Bahamas|      United States|   903|
|          Colombia|      United States|   785|
|            France|      United States|   774|
|           Jamaica|      United States|   733|
|       South Korea|      United States|   683|
|       Netherlands|      United States|   586|
|       El Salvador|      United States|   519|
|        Costa Rica|      United States|   477|
|             China|      United States|   448|
|             Spain|      United States|

## Repartitioning
https://medium.com/@mrpowers/managing-spark-partitions-with-coalesce-and-repartition-4050c57ad5c4

In [78]:
# Redistribute df across 10 partitions, then confirm the partition count
n_partitions = 10
df10 = df.repartition(n_partitions)
df10.rdd.getNumPartitions()

10

## Coalescing
Decreases the number of partitions cheaply, avoiding a full shuffle.
https://stackoverflow.com/a/31612810

In [83]:
# coalesce the 10-partition df10 down to 2 partitions and confirm the count
df10.coalesce(2).rdd.getNumPartitions()

2

## Joins

In [85]:
# First sample frame for the join examples: (name, id) pairs
values1 = [
    ('Pirate', 1),
    ('Monkey', 2),
    ('Ninja', 3),
    ('Spaghetti', 4),
]
df1 = spark.createDataFrame(values1, schema=['name', 'id'])
df1.show()

+---------+---+
|     name| id|
+---------+---+
|   Pirate|  1|
|   Monkey|  2|
|    Ninja|  3|
|Spaghetti|  4|
+---------+---+



In [86]:
# Second sample frame for the join examples: same ids as df1, different names
values2 = [
    ('Rutabaga', 1),
    ('Pirate', 2),
    ('Ninja', 3),
    ('Darth Vader', 4),
]
df2 = spark.createDataFrame(values2, schema=['name', 'id'])
df2.show()

+-----------+---+
|       name| id|
+-----------+---+
|   Rutabaga|  1|
|     Pirate|  2|
|      Ninja|  3|
|Darth Vader|  4|
+-----------+---+



In [88]:
# Join df1 and df2 on equal ids (no `how` given, so the default join type is used)
ids_match = df1.id == df2.id
df1.join(df2, on=ids_match).show()

+---------+---+-----------+---+
|     name| id|       name| id|
+---------+---+-----------+---+
|   Pirate|  1|   Rutabaga|  1|
|    Ninja|  3|      Ninja|  3|
|   Monkey|  2|     Pirate|  2|
|Spaghetti|  4|Darth Vader|  4|
+---------+---+-----------+---+



In [89]:
# Left join of df1 with df2 on equal ids
ids_match = df1.id == df2.id
df1.join(df2, on=ids_match, how='left').show()

+---------+---+-----------+---+
|     name| id|       name| id|
+---------+---+-----------+---+
|   Pirate|  1|   Rutabaga|  1|
|    Ninja|  3|      Ninja|  3|
|   Monkey|  2|     Pirate|  2|
|Spaghetti|  4|Darth Vader|  4|
+---------+---+-----------+---+



## Unions

In [90]:
# First sample frame for the union example: ids 1-4
values1 = [
    ('Pirate', 1),
    ('Monkey', 2),
    ('Ninja', 3),
    ('Spaghetti', 4),
]
df1 = spark.createDataFrame(values1, schema=['name', 'id'])
df1.show()

+---------+---+
|     name| id|
+---------+---+
|   Pirate|  1|
|   Monkey|  2|
|    Ninja|  3|
|Spaghetti|  4|
+---------+---+



In [93]:
# Second sample frame for the union example: ids 5-8
values2 = [
    ('Rutabaga', 5),
    ('Pirate', 6),
    ('Ninja', 7),
    ('Darth Vader', 8),
]
df2 = spark.createDataFrame(values2, schema=['name', 'id'])
df2.show()

+-----------+---+
|       name| id|
+-----------+---+
|   Rutabaga|  5|
|     Pirate|  6|
|      Ninja|  7|
|Darth Vader|  8|
+-----------+---+



In [94]:
# Append the rows of df2 to the rows of df1
combined = df1.union(df2)
combined.show()

+-----------+---+
|       name| id|
+-----------+---+
|     Pirate|  1|
|     Monkey|  2|
|      Ninja|  3|
|  Spaghetti|  4|
|   Rutabaga|  5|
|     Pirate|  6|
|      Ninja|  7|
|Darth Vader|  8|
+-----------+---+



## Aggregation

In [109]:
# NOTE: importing `max` here replaces Python's built-in max in the notebook namespace.
from pyspark.sql.functions import mean, max

# Average of `count` for each origin country
by_origin = df.groupBy('ORIGIN_COUNTRY_NAME')
by_origin.agg(mean('count')).show()

+--------------------+----------+
| ORIGIN_COUNTRY_NAME|avg(count)|
+--------------------+----------+
|              Russia|     156.0|
|            Anguilla|      20.0|
|             Senegal|      46.0|
|              Sweden|      73.0|
|            Kiribati|      18.0|
|              Guyana|      20.0|
|         Philippines|     116.0|
|           Singapore|      25.0|
|            Malaysia|       3.0|
|                Fiji|      51.0|
|              Turkey|      87.0|
|             Germany|    1406.0|
|         Afghanistan|       2.0|
|              Jordan|      51.0|
|               Palau|      30.0|
|Turks and Caicos ...|     147.0|
|              France|     776.0|
|              Greece|      61.0|
|British Virgin Is...|      47.0|
|              Taiwan|     252.0|
+--------------------+----------+
only showing top 20 rows



In [110]:
# Use the namespaced Spark aggregate so this cell does not depend on an earlier
# cell having replaced Python's built-in max with pyspark.sql.functions.max.
from pyspark.sql import functions as F

# Largest value of `count` across the whole DataFrame
df.agg(F.max('count')).show()

+----------+
|max(count)|
+----------+
|    348113|
+----------+



# How to cache and persist

Cache a dataframe when it is used multiple times in the script.

In [115]:
df.cache() # lazy: only marks df for caching; run an action on df to actually populate the cache
df.is_cached

True

In [None]:
# cache() is persist() with the default storage level (memory and disk)

In [113]:
df.storageLevel # StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication)
# default StorageLevel.MEMORY_AND_DISK

StorageLevel(True, True, False, True, 1)

In [119]:
# unpersist: mark df as no longer cached, then check the flag
df.unpersist()
df.is_cached

False

In [121]:
from pyspark.storagelevel import StorageLevel

# Persist with an explicit storage level; other options include
# MEMORY_ONLY and MEMORY_AND_DISK.
disk_only = StorageLevel.DISK_ONLY
df.persist(disk_only)
df.is_cached

True

# Convert DataFrame to a global or temp View
## Local Temporary View

In [147]:
# Register df as a temporary view, then query it through Spark SQL
df.createOrReplaceTempView('local_view')

local_preview = spark.sql('SELECT * FROM local_view LIMIT 10')
local_preview.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+



## Global Temporary view
https://spark.apache.org/docs/latest/sql-getting-started.html#global-temporary-view

If you want a temporary view that is shared among all sessions and kept alive until the Spark application terminates, you can create a global temporary view.
http://www.balakumarp.net/2019/01/29/pyspark-sharing-spark-context-and-spark-session-across/

In [154]:
# Register df as a global temporary view; it is queried through the
# `global_temp` database prefix.
df.createOrReplaceGlobalTempView('global_view')

global_preview = spark.sql('SELECT * FROM global_temp.global_view LIMIT 10')
global_preview.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+



## Close Session

In [44]:
# Shut down the Spark session created at the top of the notebook
spark.stop()