In [20]:
import findspark

In [21]:
import os

# Prefer the SPARK_HOME environment variable so the notebook runs on other machines;
# fall back to the original local install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/purvil/spark-2.4.3-bin-hadoop2.7'))

In [22]:
from pyspark.sql import SparkSession

In [23]:
# Reuse the active Spark session if there is one; otherwise create one named 'intro'.
spark = (
    SparkSession.builder
    .appName('intro')
    .getOrCreate()
)

* Using the structured API we can manipulate all kinds of data, such as CSV and Parquet files.

* Spark has DataFrames, Datasets, and SQL tables and views as structured collections of data.
* Internally, Spark uses an engine called Catalyst that maintains its own type information throughout the planning and processing of work. Spark types map directly to each programming language's types via a lookup table, and every operation is performed on Spark's own types.

### DataFrame vs Dataset
* Spark maintains the type information of a DataFrame and checks that the types match the schema at runtime.
* DataFrames and Datasets are distributed, table-like collections with well-defined rows and columns.
* A schema defines the column names and column types of a DataFrame.
* Datasets check types at compile time and are only available in JVM-based languages (Scala and Java).

* Creating rows: `spark.range(n)` returns a DataFrame, and collecting it yields `Row` objects.

In [24]:
spark.range(2)

DataFrame[id: bigint]

In [25]:
spark.range(2).collect()

[Row(id=0), Row(id=1)]

### Types

In [26]:
from pyspark.sql.types import *

In [27]:
# ByteType: 1-byte signed integer.
# NOTE: `b` is rebound to other types in the next cell, so this binding is only illustrative.
b = ByteType()

In [28]:
# Tour of Spark SQL data types and the Python values they correspond to.
# Several short names (`s`, `d`, `b`) are rebound below, so only the last
# assignment to each one survives; the cell is illustrative only.
s = ShortType()      # 2-byte signed integer (Python int)
i = IntegerType()    # 4-byte signed integer (Python int)
l = LongType()       # 8-byte signed integer (Python int)
f = FloatType()      # 4-byte single-precision floating point (Python float)
d = DoubleType()     # 8-byte double-precision floating point (Python float)
d = DecimalType()    # fixed-precision decimal (decimal.Decimal)
s = StringType()     # str
b = BinaryType()     # bytearray
b = BooleanType()    # bool
t = TimestampType()  # datetime.datetime
d = DateType()       # datetime.date (was DataType(), the abstract base class of all types)
a = ArrayType(ShortType())  # list, tuple, array
m = MapType(StringType(), DoubleType())  # dict
# StructType([...])                      -> list or tuple of StructField values
# StructField(name, dataType, nullable)  -> one named, typed field inside a StructType

### Structured API execution

* Write DataFrame/Dataset/SQL code.
* If the code is valid, Spark converts it to a logical plan.
* Spark transforms the logical plan into a physical plan, applying optimizations along the way.
* Spark executes the physical plan (RDD manipulations) on the cluster.
![](images/optimizatioon.png)

* The logical plan only represents a set of abstract transformations that do not refer to executors or drivers. Its purpose is to convert the user's expressions into an optimized version.
![](images/logical.png)

* First, Spark converts the user code to an unresolved logical plan. It is called unresolved because the code might be valid while the tables or DataFrames it refers to might not exist. The catalog holds information about the current tables and DataFrames; using it, the analyzer produces the resolved logical plan.
* The Catalyst optimizer then optimizes the logical plan by pushing down predicates and selections.

* Spark generates different physical execution strategies and compares them using a cost model.
![](images/physical.png)

* The final result is a series of RDD transformations, which are executed on the cluster.

* A DataFrame consists of a series of records of type Row. The partitioning of a DataFrame defines its physical distribution across the cluster.

In [29]:
df = spark.read.json("spark_data/flight-data/json/2015-summary.json")

In [30]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [31]:
spark.read.json("spark_data/flight-data/json/2015-summary.json").schema

StructType(List(StructField(DEST_COUNTRY_NAME,StringType,true),StructField(ORIGIN_COUNTRY_NAME,StringType,true),StructField(count,LongType,true)))

* A StructField has a name, a type, and a boolean flag indicating whether null values are allowed.
* Let's define a schema manually.

In [32]:
# Hand-written schema for the flight summary data: two nullable string columns
# and a non-nullable long `count` column that carries custom metadata.
myManualSchema = (
    StructType()
    .add("DEST_COUNTRY_NAME", StringType(), True)
    .add("ORIGIN_COUNTRY_NAME", StringType(), True)
    .add("count", LongType(), False, {"hello": "world"})
)

In [34]:
# Read the flight summary JSON with the hand-written schema instead of inferring one.
df = (
    spark.read
    .schema(myManualSchema)
    .json('spark_data/flight-data/json/2015-summary.json')
)

### Manipulate columns

In [39]:
from pyspark.sql.functions import col, column

In [40]:
col("ColName")

Column<b'ColName'>

In [41]:
column("ColName")

Column<b'ColName'>

* To refer to a specific column of a particular DataFrame:

In [43]:
df["count"]

Column<b'count'>

* An expression is a set of transformations on one or more values in a record of a DataFrame.

In [45]:
from pyspark.sql.functions import expr

In [46]:
expr("colName")

Column<b'colName'>

### expr vs col

In [47]:
expr("colName - 4")

Column<b'(colName - 4)'>

In [48]:
col("colname") - 4

Column<b'(colname - 4)'>

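* The two expressions above are equivalent. (They differ only in the capitalization of the column name, which does not matter because Spark resolves column names case-insensitively by default.)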

In [49]:
spark.read.json('spark_data/flight-data/json/2015-summary.json').columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

### Records and rows
* Each row in a DataFrame is a single record of type Row. Internally, a Row object is represented as an array of bytes.

In [50]:
df.first()

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [51]:
from pyspark.sql import Row

In [52]:
myRow = Row("Hello", None, 1, False) # creating new row

In [53]:
myRow[0]

'Hello'

In [54]:
myRow[-1]

False

### DataFrame Transformation

* Add rows or columns
* Remove rows or columns
* Transform a row into a column (or vice versa)
* Change the order of rows based on the values in columns

In [55]:
df.createOrReplaceTempView("dfTable")

#### Creating DataFrame

In [57]:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

In [58]:
# Three nullable columns for the hand-built example row: two strings and a long.
myManualSchema = (
    StructType()
    .add("some", StringType(), True)
    .add("col", StringType(), True)
    .add("name", LongType(), True)
)

In [59]:
myRow = Row("Hello",None, 1)

In [60]:
myDf = spark.createDataFrame([myRow], schema=myManualSchema)

In [61]:
myDf.show()

+-----+----+----+
| some| col|name|
+-----+----+----+
|Hello|null|   1|
+-----+----+----+



#### select and selectExpr

* `select` is the DataFrame equivalent of the following SQL queries:

```
SELECT * FROM dataFrameName
SELECT colName FROM dataFrameName
```

In [63]:
df.select("DEST_COUNTRY_NAME").show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [65]:
spark.sql("SELECT DEST_COUNTRY_NAME FROM dfTable LIMIT 2").show()

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+



In [68]:
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows



In [69]:
spark.sql("SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME FROM dfTable LIMIT 2").show()

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+



In [70]:
df.select(expr("DEST_COUNTRY_NAME"), col("DEST_COUNTRY_NAME"), column("DEST_COUNTRY_NAME")).show(2)

+-----------------+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|
|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+
only showing top 2 rows



In [71]:
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [72]:
spark.sql("SELECT DEST_COUNTRY_NAME AS destination FROM dfTable LIMIT 2").show()

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+



* A more compact form of the `select(expr(...))` call above is `selectExpr`:

In [73]:
df.selectExpr("DEST_COUNTRY_NAME AS destination").show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [76]:
df.select(expr("DEST_COUNTRY_NAME").alias("destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [77]:
df.selectExpr("*", "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry").show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [81]:
spark.sql("SELECT *,(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry FROM dfTable LIMIT 2").show()

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+



* Within `selectExpr` we can also define aggregations over the entire DataFrame.

In [83]:
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show()

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



In [87]:
spark.sql("SELECT avg(count), count(DISTINCT(DEST_COUNTRY_NAME)) FROM dfTable").show()

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



### Literals
* Use a literal to pass an explicit value: `lit` translates a value from the given programming language into the corresponding Spark type.

In [88]:
from pyspark.sql.functions import lit

In [90]:
df.select(expr('*'), lit(1).alias("One")).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|One|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [166]:
df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

In [91]:
spark.sql("SELECT *, 1 FROM dfTable LIMIT 2").show()

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|  1|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+



In [93]:
df.withColumn("numberOne", lit(1)).show(2) # Adding column numberOne

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [94]:
spark.sql("SELECT *, 1 AS numberOne FROM dfTable LIMIT 2").show()

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+



In [96]:
df.withColumn("withinCountry", expr("DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME")).show(5)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
|    United States|            Ireland|  344|        false|
|            Egypt|      United States|   15|        false|
|    United States|              India|   62|        false|
+-----------------+-------------------+-----+-------------+
only showing top 5 rows



* `withColumn` takes two arguments: the column name and the expression that creates the values for the new column.

### Rename column

In [98]:
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

['dest', 'ORIGIN_COUNTRY_NAME', 'count']

### Reserved character and keywords

In [99]:
dfWithLongName = df.withColumn("This Long Column_name", expr("ORIGIN_COUNTRY_NAME"))

In [100]:
dfWithLongName.show(2)

+-----------------+-------------------+-----+---------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|This Long Column_name|
+-----------------+-------------------+-----+---------------------+
|    United States|            Romania|   15|              Romania|
|    United States|            Croatia|    1|              Croatia|
+-----------------+-------------------+-----+---------------------+
only showing top 2 rows



* To access such a column, wrap its name in backticks (`` ` ``).

In [103]:
dfWithLongName.selectExpr("`This Long Column_name`").show(2)

+---------------------+
|This Long Column_name|
+---------------------+
|              Romania|
|              Croatia|
+---------------------+
only showing top 2 rows



In [105]:
dfWithLongName.createOrReplaceTempView("dfTableLong")

In [106]:
spark.sql("SELECT `This Long Column_name` FROM dfTableLong Limit 2").show()

+---------------------+
|This Long Column_name|
+---------------------+
|              Romania|
|              Croatia|
+---------------------+



* By default Spark is case insensitive.
* To make it case sensitive:

```
set spark.sql.caseSensitive true
```

### Removing column

In [107]:
df.drop("ORIGIN_COUNTRY_NAME").columns

['DEST_COUNTRY_NAME', 'count']

In [111]:
dfWithLongName.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").columns

['count', 'This Long Column_name']

### Type casting

In [112]:
df.withColumn("count2", col("count").cast("long"))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, count2: bigint]

In [114]:
spark.sql("SELECT *, cast(count AS long) AS count2 FROM dfTable")

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, count2: bigint]

### Filtering Rows

In [115]:
df.filter(col("count") < 2).show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
|            Malta|      United States|    1|
|    United States|          Gibraltar|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [116]:
df.where("count < 2").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [118]:
spark.sql('SELECT * FROM dfTable WHERE count < 2 LIMIT 2').show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+



In [119]:
df.where(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") != "Croatia").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [120]:
spark.sql('SELECT * FROM dfTable WHERE count < 2 AND ORIGIN_COUNTRY_NAME != "Croatia" LIMIT 2').show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+



### Getting unique Rows

In [121]:
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

256

In [122]:
spark.sql("SELECT count(distinct(ORIGIN_COUNTRY_NAME, DEST_COUNTRY_NAME)) FROM dfTable").show()

+------------------------------------------------------------------------------------------------------------+
|count(DISTINCT named_struct(ORIGIN_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, DEST_COUNTRY_NAME, DEST_COUNTRY_NAME))|
+------------------------------------------------------------------------------------------------------------+
|                                                                                                         256|
+------------------------------------------------------------------------------------------------------------+



### Random Sample

In [123]:
withReplacement = False  # each row can be picked at most once
fraction = 0.5           # expected share of rows to keep (not an exact row count)
seed = 5                 # fixed seed so the sample is reproducible; reused by the random split below
df.sample(withReplacement=withReplacement, fraction=fraction, seed=seed).count()

126

### Random Split

* Breaks up a DataFrame into random splits of the original DataFrame.
* Useful in ML for creating training, validation, and test sets.

In [124]:
# Split df into two DataFrames using weights 0.25 and 0.75 and the `seed` defined above.
dataFrames = df.randomSplit(weights=[0.25, 0.75], seed=seed)

In [125]:
dataFrames[0].count()

60

In [126]:
dataFrames[1].count()

196

### Concatenate and appending the Rows

* Unions are performed based on column position, not on the schema (column names).

In [127]:
schema = df.schema

In [129]:
# Two extra rows whose values follow the column order of df's schema
# (destination, origin, count).
newRows = [
    Row(*values)
    for values in (
        ("New Country", "Other country", 5),
        ("New Country 2", "Other country 3", 1),
    )
]

In [130]:
parallelizedRows = spark.sparkContext.parallelize(newRows)

In [131]:
newDf = spark.createDataFrame(parallelizedRows, schema)

In [133]:
df.union(newDf)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [137]:
# Append the new rows, then keep only single-flight routes that do not originate in the United States.
(
    df.union(newDf)
    .filter("count = 1")
    .filter(col("ORIGIN_COUNTRY_NAME") != "United States")
    .show()
)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|    New Country 2|    Other country 3|    1|
+-----------------+-------------------+-----+



### Sorting Rows

In [138]:
df.sort("count").show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [139]:
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [140]:
from pyspark.sql.functions import desc, asc

In [141]:
df.orderBy(col("count").desc()).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
+-----------------+-------------------+------+
only showing top 5 rows



In [145]:
spark.sql("SELECT * FROM dfTable ORDER BY count DESC LIMIT 5").show()

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
+-----------------+-------------------+------+



* To control where null values appear in a sorted DataFrame, use `asc_nulls_first`, `desc_nulls_first`, `asc_nulls_last` or `desc_nulls_last`

* For optimization purposes, it can be useful to sort within each partition:

In [148]:
spark.read.json("spark_data/flight-data/json/2015-summary.json").sortWithinPartitions("count")

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

### Limit

In [149]:
df.limit(5).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+



In [152]:
df.orderBy(col("count").desc()).limit(6).show()

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Mexico|  7187|
|           Mexico|      United States|  7140|
|   United Kingdom|      United States|  2025|
+-----------------+-------------------+------+



### Repartition and Coalesce

* Partition data by frequently filtered columns; partitioning controls the physical layout of data across the cluster.

In [153]:
# repartition() returns a new DataFrame; the result is not assigned here,
# so `df` itself keeps its original partitioning (see the partition count below).
df.repartition(5)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [154]:
df.rdd.getNumPartitions()

1

In [157]:
df.repartition(col("DEST_COUNTRY_NAME"))

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

* Coalesce combines existing partitions to reduce their number.

In [159]:
df.repartition(5, col("DEST_COUNTRY_NAME")).coalesce(2)

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

### collect

* Spark maintains the state of the cluster in the driver.
* When we want to bring data back to the driver, `collect` retrieves the entire DataFrame, `take(n)` retrieves the first n rows, and `show` prints rows as a table.

In [160]:
collectDf = df.limit(10)

In [161]:
collectDf.take(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)]

In [162]:
collectDf.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+



In [163]:
collectDf.show(5, False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India              |62   |
+-----------------+-------------------+-----+
only showing top 5 rows



In [164]:
collectDf.collect()

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Grenada', count=62),
 Row(DEST_COUNTRY_NAME='Costa Rica', ORIGIN_COUNTRY_NAME='United States', count=588),
 Row(DEST_COUNTRY_NAME='Senegal', ORIGIN_COUNTRY_NAME='United States', count=40),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

### toLocalIterator

* Collects partitions to the driver as an iterator, allowing iteration over the entire dataset partition by partition in a serial manner.

In [165]:
collectDf.toLocalIterator()

<itertools.chain at 0x7fb02125bba8>

In [168]:
# Load one day of retail data; the first line is a header and column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("spark_data/retail-data/by-day/2010-12-01.csv")
)

In [169]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [170]:
df.createOrReplaceTempView('dfTable')

In [175]:
# InvoiceNo is a string column (see the printed schema), so compare against a
# string literal instead of relying on an implicit cast between string and number.
df.where(col("InvoiceNo") == "536365").select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [177]:
# InvoiceNo is a string column (see the printed schema), so quote the literal
# instead of relying on an implicit cast between string and number.
df.where("InvoiceNo = '536365'").show(5, False)

+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 08:26:00|2.75     |17850.0   |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
+---------+-----

In [178]:
from pyspark.sql.functions import instr

In [None]:
priceFilter = col('UnitPrice') > 600
