# Basic Structured Operations

## Schemas

In [0]:
// Load the 2015 flight summary JSON file into a DataFrame.
// No schema is passed to the reader here.
val df = spark.read.format("json").load("/datasets/flight-data/json/2015-summary.json")

// Print the DataFrame's schema as a tree.
df.printSchema()

In [0]:
// Read the same JSON file and return its schema object (rather than printing it).
spark.read.format("json").load("/datasets/flight-data/json/2015-summary.json").schema

In [0]:
// Exploring Spark's complex types
import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}
import org.apache.spark.sql.types.Metadata

// Declare the schema by hand: two nullable string columns and a non-nullable long
// column that carries a small piece of JSON metadata.
val myManualSchema = StructType(Array(
  StructField("DEST_COUNTRY_NAME", StringType, nullable = true),
  StructField("ORIGIN_COUNTRY_NAME", StringType, nullable = true),
  StructField(
    "count",
    LongType,
    nullable = false,
    metadata = Metadata.fromJson("{\"hello\":\"world\"}")
  )
))

// Read the JSON file with the hand-written schema applied.
val df = spark.read
  .format("json")
  .schema(myManualSchema)
  .load("/datasets/flight-data/json/2015-summary.json")

## Columns

In [0]:
import org.apache.spark.sql.functions.{col, column}
// Build a Column from a column name; `col` and `column` are called the same way here.
col("someColumnName")
column("someColumnName")

### Columns as expressions

In [0]:
import org.apache.spark.sql.functions.expr
// Build a Column from a SQL-style expression string.
expr("(((someCol + 5) * 200) - 6) < otherCol")

## Creating Rows

In [0]:
import org.apache.spark.sql.Row
// Build a Row from four positional values (a String, a null, an Int and a Boolean).
val myRow = Row("Hello", null, 1, false)

// Values are read back by position.
myRow(0) // untyped access: the result has type Any
myRow(0).asInstanceOf[String] // cast the untyped value to String
myRow.getString(0) // typed getter: String
myRow.getInt(2) // typed getter: Int

## Creating DataFrames

In [0]:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}

In [0]:
// Reload the flight summary data.
val df = spark.read.format("json").load("/datasets/flight-data/json/2015-summary.json")

// Register the DataFrame as a *global* temporary view named "dfTable".
// NOTE(review): global temp views are looked up as `global_temp.dfTable` in SQL; if a plain
// `dfTable` reference is intended, `createOrReplaceTempView` is the usual call — confirm.
df.createOrReplaceGlobalTempView("dfTable")

In [0]:
// Schema for a hand-built DataFrame: two nullable strings and a non-nullable long.
val myManualSchema = StructType(Array(
  StructField("some", StringType, nullable = true),
  StructField("col", StringType, nullable = true),
  StructField("names", LongType, nullable = false)
))

// Rows whose values line up positionally with the schema above.
val myRows = Seq(
  Row("Hello", null, 1L),
  Row("Amaznf", "sooper", 1L)
)
// Distribute the rows as an RDD and pair them with the schema.
val myRDD = spark.sparkContext.parallelize(myRows)
val myDF = spark.createDataFrame(myRDD, myManualSchema)

myDF.show()

In [0]:
// Build a one-row DataFrame straight from a Scala Seq of tuples, naming the three columns.
val myDF = Seq(("Hello", 2, 1L))
  .toDF("col1", "col2", "col3")

myDF.show()

## select and selectExpr

In [0]:
// Project a single column by name and print the first five rows.
df.select("DEST_COUNTRY_NAME")
  .show(5)

In [0]:
// Project two columns by name and print the first five rows.
df.select(
  "DEST_COUNTRY_NAME",
  "ORIGIN_COUNTRY_NAME"
).show(5)

In [0]:
import org.apache.spark.sql.functions.{expr, col, column}
// Select the same column six times, once per way of referring to a column.
df.select(
  df.col("DEST_COUNTRY_NAME"), // column looked up on this DataFrame
  col("DEST_COUNTRY_NAME"),    // functions.col
  column("DEST_COUNTRY_NAME"), // functions.column
  'DEST_COUNTRY_NAME,          // Scala symbol literal
  $"DEST_COUNTRY_NAME",        // $-interpolator
  expr("DEST_COUNTRY_NAME")    // expression string
).show(5)

In [0]:
// Rename the column inside the expression string with `AS`.
df.select(
  expr("DEST_COUNTRY_NAME AS destination")
).show(5)

In [0]:
// Alias the column to "destination" in the expression, then apply a second alias
// ("DEST_COUNTRY_NAME") on the resulting Column.
df.select(
  expr("DEST_COUNTRY_NAME AS destination").alias("DEST_COUNTRY_NAME")
).show(5)

In [0]:
// selectExpr takes expression strings directly: the column under a new name, plus the original.
df.selectExpr(
  "DEST_COUNTRY_NAME as newColumnName",
  "DEST_COUNTRY_NAME"
).show(5)

In [0]:
// Keep every column and add one that compares destination and origin country names.
df.selectExpr(
  "*", // include all original columns
  "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS withinCountry"
).show(5)

In [0]:
// Aggregate expressions: the average of `count` and the number of distinct destination names.
df.selectExpr(
  "AVG(count)",
  "COUNT(DISTINCT(DEST_COUNTRY_NAME))"
).show(5)

## Converting to Spark Types

In [0]:
import org.apache.spark.sql.functions.lit

// Keep every existing column and append the literal value 1 as a column named "One".
df.select(
  expr("*"),
  lit(1).as("One")
).show(5)

## Adding Columns

In [0]:
// Add a column named "numberOne" holding the literal 1.
df.withColumn("numberOne", lit(1))
  .show(5)

In [0]:
// Add a column holding the result of comparing origin and destination country names.
df.withColumn(
  "withinCountry",
  expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")
).show(5)

## Renaming Columns

In [0]:
// Rename DEST_COUNTRY_NAME to "dest" and list the resulting column names.
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest")
  .columns

## Reserved Characters and Keywords

In [0]:
import org.apache.spark.sql.functions.expr

// Add a copy of ORIGIN_COUNTRY_NAME under a name containing spaces and a dash.
val dfWithLongColName = df.withColumn("This Long Column-Name", expr("ORIGIN_COUNTRY_NAME"))

// Inside expression strings the awkward name (and the new alias) is wrapped in backticks.
dfWithLongColName.selectExpr(
  "`This Long Column-Name`",
  "`This Long Column-Name` as `new col`"
).show(5)

In [0]:
// Referring to the column through `col` with the plain name string (no backticks here).
dfWithLongColName
  .select(col("This Long Column-Name"))
  .columns

## Removing Columns

In [0]:
// Drop a single column and list what remains.
df.drop("ORIGIN_COUNTRY_NAME")
  .columns

// Drop several columns in one call by passing multiple names.
dfWithLongColName.drop(
  "ORIGIN_COUNTRY_NAME",
  "DEST_COUNTRY_NAME"
)

## Changing a Column's Type

In [0]:
// Add "count2": the `count` column cast to the long type.
df.withColumn(
  "count2",
  col("count").cast("long")
)

## Filtering Rows

In [0]:
// Keep rows whose count is below 2, using a Column predicate.
df.filter(col("count") < 2)
  .show(5)

In [0]:
// The same condition written as an expression string passed to `where`.
df.where("count < 2")
  .show(5)

In [0]:
// Chain two filters: count below 2, and origin country not equal to "Croatia".
df.where(col("count") < 2)
  .where(col("ORIGIN_COUNTRY_NAME") =!= "Croatia")
  .show(5)

## Getting Unique Rows

In [0]:
// Count the distinct (origin, destination) pairs.
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME")
  .distinct()
  .count()

In [0]:
// Count the distinct origin countries.
df.select("ORIGIN_COUNTRY_NAME")
  .distinct()
  .count()

## Random Samples

In [0]:
// Sampling parameters, declared in the order `sample` takes them.
val withReplacement = false
val fraction = 0.5
val seed = 5 // also reused by the random split that follows

// Draw a random sample of the rows and count it.
df.sample(withReplacement, fraction, seed)
  .count()

## Random Splits

In [0]:
// Split the DataFrame into two random parts with weights 0.25 and 0.75, reusing `seed`.
val dataFrames = df.randomSplit(Array(0.25, 0.75), seed)
// Compare the sizes of the two parts; with these weights the result is expected to be false.
dataFrames(0).count() > dataFrames(1).count()

## Concatenating and Appending Rows (Union)

In [0]:
import org.apache.spark.sql.Row

// Reuse the existing DataFrame's schema so the new rows can be unioned with it.
val schema = df.schema
val newRows = Seq(
  Row("New Country 1", "Other Country 1", 5L),
  Row("New Country 2", "Other Country 3", 1L)
)
val parallelizedRows = spark.sparkContext.parallelize(newRows)
val newDF = spark.createDataFrame(parallelizedRows, schema)

// Append the new rows, then keep rows with count = 1 whose origin is not the United States.
df.union(newDF)
  .where("count = 1")
  .where($"ORIGIN_COUNTRY_NAME" =!= "United States")
  .show()

## Sorting Rows

In [0]:
// Sort by a single column name.
df.sort("count")
  .show(5)

// Sort by two columns, given as names ...
df.orderBy("count", "DEST_COUNTRY_NAME")
  .show(5)

// ... and the same two columns given as Column objects.
df.orderBy(col("count"), col("DEST_COUNTRY_NAME"))
  .show(5)

In [0]:
import org.apache.spark.sql.functions.{desc, asc}

// Sort by flight count, largest first.
// Do not write this as `expr("count desc")`: the expression parser reads `desc` as an
// alias for the `count` column rather than a sort direction, so the rows come back in
// ascending order. Use the `desc` function (or `Column.desc`) to get a descending sort.
df.orderBy(desc("count")).show(5)

// Sort by count descending, breaking ties by destination name ascending.
df.orderBy(desc("count"), asc("DEST_COUNTRY_NAME")).show(5)

In [0]:
// Read every "*-summary.json" file and sort the rows by count within each partition.
spark.read.format("json")
  .load("/datasets/flight-data/json/*-summary.json")
  .sortWithinPartitions("count")

## Limit

In [0]:
// Restrict the DataFrame to five rows and print them.
df.limit(5)
  .show()

In [0]:
// Top five rows by flight count.
// `expr("count desc")` would parse `desc` as an alias for `count` and sort ascending,
// so the descending order is requested on the Column itself.
df.orderBy(col("count").desc)
  .limit(5)
  .show()

## Repartition and Coalesce

In [0]:
// Uncomment to inspect the current number of partitions:
//df.rdd.getNumPartitions

// Repartition into 5 partitions.
df.repartition(5)

// Repartition by the values of a column.
df.repartition(col("DEST_COUNTRY_NAME"))

// Repartition by a column into 5 partitions.
df.repartition(5, col("DEST_COUNTRY_NAME"))

In [0]:
// Repartition by destination into 5 partitions, then coalesce down to 2.
df.repartition(5, col("DEST_COUNTRY_NAME"))
  .coalesce(2)

## Collecting Rows to the Driver

In [0]:
// Work on a ten-row slice so that pulling rows to the driver stays small.
val collectDF = df.limit(10)

collectDF.take(5)                    // first five rows, returned to the driver
collectDF.show()                     // print the rows as a formatted table
collectDF.show(5, truncate = false)  // print five rows without truncating values
collectDF.collect()                  // return every row of the slice to the driver

# End