# almond-spark

* Based on [ammonite-spark](https://github.com/alexarchambault/ammonite-spark)
* Works for any Spark version >= 2.0
* Currently supports local, standalone and YARN clusters

In [8]:
import $ivy.`org.apache.spark::spark-sql:2.4.3` // Or use any other 2.x version here
import $ivy.`sh.almond::almond-spark:0.5.0`

import org.apache.spark.sql._, org.apache.log4j.{Level, Logger}
// Switch off log4j output for the "org" logger so the notebook stays readable
Logger.getLogger("org").setLevel(Level.OFF)

// Session obtained through almond-spark's NotebookSparkSession builder, using a local master
val spark = NotebookSparkSession
  .builder()
  .master("local[*]")
  .getOrCreate()

// Shorthand for the session's SparkContext
def sc = spark.sparkContext

Getting spark JARs
Creating SparkSession


import $ivy.$                                   // Or use any other 2.x version here

import $ivy.$


import org.apache.spark.sql._, org.apache.log4j.{Level, Logger}

spark: SparkSession = org.apache.spark.sql.SparkSession@2c757b23
defined function sc

## Load a DataFrame

In [9]:
import spark.implicits._
// Load titanic.csv: the first line supplies the column names and column
// types are inferred rather than declared.
val titanic = {
  val csvReader = spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
  csvReader.load("titanic.csv")
}

import spark.implicits._

titanic: DataFrame = [Survived: int, Pclass: int ... 6 more fields]

## Show as text

In [10]:
// Print the DataFrame as a plain-text table; long values are cut short with "..."
// (visible in the Name column of the output below)
titanic.show()

+--------+------+--------------------+------+----+-----------------------+-----------------------+-------+
|Survived|Pclass|                Name|   Sex| Age|Siblings/Spouses Aboard|Parents/Children Aboard|   Fare|
+--------+------+--------------------+------+----+-----------------------+-----------------------+-------+
|       0|     3|Mr. Owen Harris B...|  male|22.0|                      1|                      0|   7.25|
|       1|     1|Mrs. John Bradley...|female|38.0|                      1|                      0|71.2833|
|       1|     3|Miss. Laina Heikk...|female|26.0|                      0|                      0|  7.925|
|       1|     1|Mrs. Jacques Heat...|female|35.0|                      1|                      0|   53.1|
|       0|     3|Mr. William Henry...|  male|35.0|                      0|                      0|   8.05|
|       0|     3|     Mr. James Moran|  male|27.0|                      0|                      0| 8.4583|
|       0|     1|Mr. Timothy J McC...

## Let's make the output a bit nicer

In [11]:
/** Extension methods for rendering a DataFrame in the notebook. */
implicit class RichDF(val df: DataFrame) {

  /**
   * Publishes up to `limit` rows of the wrapped DataFrame as an HTML table.
   *
   * Column names and cell values are XML-escaped before being placed in the markup.
   *
   * @param limit maximum number of rows fetched with `take` (defaults to 20)
   */
  def showHTML(limit:Int = 20) = {
    import xml.Utility.escape

    // Text form of a single cell: nulls are spelled out, byte arrays become
    // space-separated two-digit hex, other arrays and sequences become
    // comma-separated lists, and anything else uses its own toString.
    def cellText(value: Any): String = value match {
      case null => "null"
      case bytes: Array[Byte] => bytes.map(b => "%02X".format(b)).mkString("[", " ", "]")
      case elems: Array[_] => elems.mkString("[", ", ", "]")
      case elems: Seq[_] => elems.mkString("[", ", ", "]")
      case other => other.toString
    }

    val fetched = df.take(limit)
    val columnNames = df.schema.fieldNames.toSeq
    val textRows: Seq[Seq[String]] = fetched.map(row => row.toSeq.map(cellText))

    // Markup fragments are built up front so the template below stays readable
    val headerCells = columnNames.map(name => s"<th>${escape(name)}</th>").mkString
    val bodyRows = textRows
      .map(cells => cells.map(c => s"<td>${escape(c)}</td>").mkString("<tr>", "", "</tr>"))
      .mkString

    publish.html(s"""
    <div>
      <table border="1" class="dataframe">
      <thead>
        <tr>
        $headerCells
        </tr>
         </thead>
         <tbody>
        $bodyRows
        </tbody>
      </table>
    </div>""")
  }
}

defined [32mclass[39m [36mRichDF[39m

In [12]:
// Render up to 8 rows as an HTML table through the showHTML extension provided by RichDF
titanic.showHTML(8)

Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,Mrs. John Bradley (Florence Briggs Thayer) Cumings,female,38.0,1,0,71.2833
1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
0,3,Mr. William Henry Allen,male,35.0,0,0,8.05
0,3,Mr. James Moran,male,27.0,0,0,8.4583
0,1,Mr. Timothy J McCarthy,male,54.0,0,0,51.8625
0,3,Master. Gosta Leonard Palsson,male,2.0,3,1,21.075


## Let's try some visualization

In [13]:
import $ivy.`org.vegas-viz::vegas-spark:0.3.11`
import vegas._, vegas.data.External._, vegas.sparkExt._

// Bar chart built from the titanic DataFrame:
//   Y      - count aggregate over "*", axis titled "Number of People", no grid
//   column - Pclass (ordinal), axis placed at the bottom
//   X      - Survived (nominal), axis hidden
//   color  - Survived (nominal), using the range "red", "green"
Vegas("Titanic Survivors")
  .withDataFrame(titanic)
  .mark(Bar)
  .encodeY("*", aggregate = AggOps.Count, axis = Axis(title = "Number of People", grid = false))
  .encodeColumn("Pclass", Ord, scale = Scale(padding = 10.0), axis = Axis(orient = Orient.Bottom, axisWidth = 1.0, offset = -8.0))
  .encodeX("Survived", Nominal, scale = Scale(bandSize = 16.0), hideAxis = true)
  .encodeColor("Survived", Nominal, scale = Scale(rangeNominals = List("red", "green")))
  .configFacet(cell = CellConfig(strokeWidth = 0))
  .configCell(height = 400)
  .show

import $ivy.$

import vegas._, vegas.data.External._, vegas.sparkExt._

