# Datasets

Examples taken from [Spark: The definitive Guide](https://github.com/databricks/Spark-The-Definitive-Guide)

## Imports

In [4]:
// Imports

import org.apache.spark.sql.{functions => F}

import org.apache.spark.sql.{functions=>F}


## Load Data

In [7]:
// Load data

val df = spark.read.option("header", true)
    .option("inferSchema", true)
    .option("sep", ",")
    .csv("../data/flights.csv")

df = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [15]:
case class Flight(DEST_COUNTRY_NAME: String, 
                  ORIGIN_COUNTRY_NAME: String,
                  count: Integer)

defined class Flight


In [16]:
// Convert data to DataSet

val ds = df.as[Flight]

ds = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [18]:
// Show data

ds.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [26]:
case class FlightMetadata(count: BigInt, randomData: BigInt)

defined class FlightMetadata


In [27]:
// create some additional data 

val flightsMeta = spark.range(500).map(x => (x, scala.util.Random.nextLong))
.withColumnRenamed("_1", "count").withColumnRenamed("_2", "randomData")
.as[FlightMetadata]

flightsMeta = [count: bigint, randomData: bigint]


[count: bigint, randomData: bigint]

In [28]:
flightsMeta.show(5)

+-----+--------------------+
|count|          randomData|
+-----+--------------------+
|    0|-6031716416570896056|
|    1|-5967086671531276217|
|    2| 8598635527670955303|
|    3| -267478557297359822|
|    4| 2260447296336899327|
+-----+--------------------+
only showing top 5 rows



## Operations in DataSets

In [21]:
// filter
// filter out all the flights which origin country is not United States

ds.filter(_.ORIGIN_COUNTRY_NAME == "United States").show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|              Guyana|      United States|   17|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|Saint Vincent and...|      United States|    1|
|               Italy|      United States|  390|
|            Pakistan|      United States|    9|
|             Iceland|      United States|  118|
|    Marshall Islands|      United States|   77|
|          Luxembourg|      United States|   91|
|            Honduras|      United States|  391|
|         The Bahamas|      United States|  903|
|         El Salvado

In [25]:
// Map

ds.map(_.count).show()

+-----+
|value|
+-----+
|    1|
|  264|
|   69|
|   24|
|    1|
|   25|
|   54|
|  477|
|   29|
|   44|
|   17|
|   53|
|    1|
|   46|
|   21|
|  136|
|    2|
|    1|
|  390|
|  156|
+-----+
only showing top 20 rows



In [29]:
// Join

ds.joinWith(flightsMeta, ds.col("count") === flightsMeta.col("count")).show()

+--------------------+--------------------+
|                  _1|                  _2|
+--------------------+--------------------+
|[United States, R...|[1, -596903653854...|
|[United States, I...|[264, 64482154797...|
|[United States, I...|[69, -22975802328...|
|[Egypt, United St...|[24, -43955991332...|
|[Equatorial Guine...|[1, -596903653854...|
|[United States, S...|[25, 357239105324...|
|[United States, G...|[54, 191869303802...|
|[Costa Rica, Unit...|[477, -6337887343...|
|[Senegal, United ...|[29, 164459472064...|
|[United States, M...|[44, -19393253963...|
|[Guyana, United S...|[17, 852759647551...|
|[United States, S...|[53, -71976291324...|
|[Malta, United St...|[1, -596903653854...|
|[Bolivia, United ...|[46, 264415698752...|
|[Anguilla, United...|[21, -42424416565...|
|[Turks and Caicos...|[136, -7912373273...|
|[United States, A...|[2, 2939696453340...|
|[Saint Vincent an...|[1, -596903653854...|
|[Italy, United St...|[390, -8965538714...|
|[United States, R...|[156, -167

In [31]:
// GroupBy

ds.groupBy("ORIGIN_COUNTRY_NAME").count().show()

+--------------------+-----+
| ORIGIN_COUNTRY_NAME|count|
+--------------------+-----+
|              Russia|    1|
|            Anguilla|    1|
|             Senegal|    1|
|              Sweden|    1|
|            Kiribati|    1|
|              Guyana|    1|
|         Philippines|    1|
|           Singapore|    1|
|            Malaysia|    1|
|                Fiji|    1|
|              Turkey|    1|
|             Germany|    1|
|         Afghanistan|    1|
|              Jordan|    1|
|               Palau|    1|
|Turks and Caicos ...|    1|
|              France|    1|
|              Greece|    1|
|British Virgin Is...|    1|
|              Taiwan|    1|
+--------------------+-----+
only showing top 20 rows



In [32]:
// groupByKey

ds.groupByKey(x => x.ORIGIN_COUNTRY_NAME).count().show()

+--------------------+--------+
|               value|count(1)|
+--------------------+--------+
|              Russia|       1|
|            Anguilla|       1|
|             Senegal|       1|
|              Sweden|       1|
|            Kiribati|       1|
|              Guyana|       1|
|         Philippines|       1|
|           Singapore|       1|
|            Malaysia|       1|
|                Fiji|       1|
|              Turkey|       1|
|             Germany|       1|
|         Afghanistan|       1|
|              Jordan|       1|
|               Palau|       1|
|Turks and Caicos ...|       1|
|              France|       1|
|              Greece|       1|
|British Virgin Is...|       1|
|              Taiwan|       1|
+--------------------+--------+
only showing top 20 rows

