In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.count
// Obtain the SparkSession used by the rest of the notebook. getOrCreate()
// hands back the session that is already active when there is one, and only
// builds a new one (application name "ScalaMnMCount") otherwise.
val spark = SparkSession.builder().appName("ScalaMnMCount").getOrCreate()
// Get the M&M data set

Intitializing Scala interpreter ...

Spark Web UI available at http://L2108017.bosonit.local:4041
SparkContext available as 'sc' (version = 3.0.3, master = local[*], app id = local-1634196002505)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.count
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@368960df


In [2]:

    // Location of the M&M CSV data set on the local disk
    val mnmFile = "C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/chapter2/py/src/data/mnm_dataset.csv"
    // Load the CSV into a DataFrame: the first line supplies the column
    // names and the column types are inferred from the data
    val mnmDF = spark.read
      .options(Map("header" -> "true", "inferSchema" -> "true"))
      .csv(mnmFile)
    // Print the first 5 rows without truncating cell contents
    mnmDF.show(numRows = 5, truncate = false)


+-----+------+-----+
|State|Color |Count|
+-----+------+-----+
|TX   |Red   |20   |
|NV   |Blue  |66   |
|CO   |Blue  |79   |
|OR   |Blue  |71   |
|WA   |Yellow|93   |
+-----+------+-----+
only showing top 5 rows



mnmFile: String = C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/chapter2/py/src/data/mnm_dataset.csv
mnmDF: org.apache.spark.sql.DataFrame = [State: string, Color: string ... 1 more field]


In [3]:
// Register the DataFrame as a temporary view named "mnmDFView" so that it
// can be queried through spark.sql
mnmDF.createOrReplaceTempView("mnmDFView")

In [4]:
    // Count the rows for every (State, Color) pair and sort the pairs from
    // the largest count to the smallest
    val countMnMDF = {
      val byStateAndColor = mnmDF
        .select("State", "Color", "Count")
        .groupBy("State", "Color")
      byStateAndColor
        .agg(count("Count").as("Total"))
        .sort(desc("Total"))
    }

    // Print up to 60 rows of the aggregation over states and colors,
    // followed by the number of rows in the aggregated result
    countMnMDF.show(60)
    println(s"Total Rows = ${countMnMDF.count()}")
    println()

+-----+------+-----+
|State| Color|Total|
+-----+------+-----+
|   CA|Yellow| 1807|
|   WA| Green| 1779|
|   OR|Orange| 1743|
|   TX| Green| 1737|
|   TX|   Red| 1725|
|   CA| Green| 1723|
|   CO|Yellow| 1721|
|   CA| Brown| 1718|
|   CO| Green| 1713|
|   NV|Orange| 1712|
|   TX|Yellow| 1703|
|   NV| Green| 1698|
|   AZ| Brown| 1698|
|   CO|  Blue| 1695|
|   WY| Green| 1695|
|   NM|   Red| 1690|
|   AZ|Orange| 1689|
|   NM|Yellow| 1688|
|   NM| Brown| 1687|
|   UT|Orange| 1684|
|   NM| Green| 1682|
|   UT|   Red| 1680|
|   AZ| Green| 1676|
|   NV|Yellow| 1675|
|   NV|  Blue| 1673|
|   WA|   Red| 1671|
|   WY|   Red| 1670|
|   WA| Brown| 1669|
|   NM|Orange| 1665|
|   WY|  Blue| 1664|
|   WA|Yellow| 1663|
|   WA|Orange| 1658|
|   CA|Orange| 1657|
|   NV| Brown| 1657|
|   CA|   Red| 1656|
|   CO| Brown| 1656|
|   UT|  Blue| 1655|
|   AZ|Yellow| 1654|
|   TX|Orange| 1652|
|   AZ|   Red| 1648|
|   OR|  Blue| 1646|
|   UT|Yellow| 1645|
|   OR|   Red| 1645|
|   CO|Orange| 1642|
|   TX| Brown

countMnMDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [State: string, Color: string ... 1 more field]


In [5]:
// Same row count per (State, Color), expressed as Spark SQL over the
// "mnmDFView" temporary view; show() prints the first 20 rows
spark.sql("""SELECT State, Color, count(*) as Total
             FROM mnmDFView
             GROUP BY State, Color
             ORDER BY Total desc""").show()

+-----+------+-----+
|State| Color|Total|
+-----+------+-----+
|   CA|Yellow| 1807|
|   WA| Green| 1779|
|   OR|Orange| 1743|
|   TX| Green| 1737|
|   TX|   Red| 1725|
|   CA| Green| 1723|
|   CO|Yellow| 1721|
|   CA| Brown| 1718|
|   CO| Green| 1713|
|   NV|Orange| 1712|
|   TX|Yellow| 1703|
|   NV| Green| 1698|
|   AZ| Brown| 1698|
|   WY| Green| 1695|
|   CO|  Blue| 1695|
|   NM|   Red| 1690|
|   AZ|Orange| 1689|
|   NM|Yellow| 1688|
|   NM| Brown| 1687|
|   UT|Orange| 1684|
+-----+------+-----+
only showing top 20 rows



In [6]:
// Total the Count column for every (State, Color) pair, largest total first.
// The aggregated column keeps Spark's default name, "sum(Count)".
val sumMnMDF = {
  val byStateAndColor = mnmDF
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
  byStateAndColor.sum("Count").sort(desc("sum(Count)"))
}

// Print the 10 largest totals without truncating cell contents
sumMnMDF.show(numRows = 10, truncate = false)

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|CA   |Yellow|100956    |
|WA   |Green |96486     |
|CA   |Brown |95762     |
|TX   |Green |95753     |
|TX   |Red   |95404     |
|CO   |Yellow|95038     |
|NM   |Red   |94699     |
|OR   |Orange|94514     |
|WY   |Green |94339     |
|NV   |Orange|93929     |
+-----+------+----------+
only showing top 10 rows



sumMnMDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [State: string, Color: string ... 1 more field]


In [8]:
// Same sum of Count per (State, Color), expressed as Spark SQL over the
// "mnmDFView" temporary view; the total is exposed under the alias "Suma"
spark.sql("""SELECT State, Color, sum(Count) as Suma
             FROM mnmDFView
             GROUP BY State, Color
             ORDER BY sum(Count) desc""").show(10,false)

+-----+------+------+
|State|Color |Suma  |
+-----+------+------+
|CA   |Yellow|100956|
|WA   |Green |96486 |
|CA   |Brown |95762 |
|TX   |Green |95753 |
|TX   |Red   |95404 |
|CO   |Yellow|95038 |
|NM   |Red   |94699 |
|OR   |Orange|94514 |
|WY   |Green |94339 |
|NV   |Orange|93929 |
+-----+------+------+
only showing top 10 rows



In [27]:
    // Count the rows for each color, restricted to the rows whose State is "CA"
    val caCountMnNDF = {
      val californiaRows = mnmDF
        .select("State", "Color", "Count")
        .filter($"State" === "CA")
      californiaRows
        .groupBy("State", "Color")
        .agg(count("Count").as("Total"))
        .sort(desc("Total"))
    }

    // Print up to 10 rows of the California aggregation
    caCountMnNDF.show(10)

+-----+------+-----+
|State| Color|Total|
+-----+------+-----+
|   CA|Yellow| 1807|
|   CA| Green| 1723|
|   CA| Brown| 1718|
|   CA|Orange| 1657|
|   CA|   Red| 1656|
|   CA|  Blue| 1603|
+-----+------+-----+



caCountMnNDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [State: string, Color: string ... 1 more field]


In [9]:
// Same California-only row count per color, expressed as Spark SQL over the
// "mnmDFView" temporary view
spark.sql("""SELECT State, Color, count(*) as Total
             FROM mnmDFView
             WHERE State== 'CA'
             GROUP BY State, Color
             ORDER BY Total desc""").show(10)

+-----+------+-----+
|State| Color|Total|
+-----+------+-----+
|   CA|Yellow| 1807|
|   CA| Green| 1723|
|   CA| Brown| 1718|
|   CA|Orange| 1657|
|   CA|   Red| 1656|
|   CA|  Blue| 1603|
+-----+------+-----+



In [10]:
    // Total the Count column for each color, restricted to the rows whose
    // State is "CA", largest total first
    val caSumMnNDF = {
      val californiaRows = mnmDF.select("*").filter(col("State") === "CA")
      californiaRows
        .groupBy("State", "Color")
        .sum("Count")
        .sort(desc("sum(Count)"))
    }

    // Print up to 10 rows without truncating cell contents
    caSumMnNDF.show(numRows = 10, truncate = false)

+-----+------+----------+
|State|Color |sum(Count)|
+-----+------+----------+
|CA   |Yellow|100956    |
|CA   |Brown |95762     |
|CA   |Green |93505     |
|CA   |Red   |91527     |
|CA   |Orange|90311     |
|CA   |Blue  |89123     |
+-----+------+----------+



caSumMnNDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [State: string, Color: string ... 1 more field]


In [12]:
// Same California-only sum of Count per color, expressed as Spark SQL over
// the "mnmDFView" temporary view; the total is exposed under the alias "Suma"
spark.sql("""SELECT State, Color, sum(Count) as Suma
             FROM mnmDFView
             WHERE State == 'CA'
             GROUP BY State, Color
             ORDER BY Suma desc""").show(10, false)

+-----+------+------+
|State|Color |Suma  |
+-----+------+------+
|CA   |Yellow|100956|
|CA   |Brown |95762 |
|CA   |Green |93505 |
|CA   |Red   |91527 |
|CA   |Orange|90311 |
|CA   |Blue  |89123 |
+-----+------+------+



In [16]:
     // Statistics of the Count column over the whole data set: maximum,
     // minimum, average and number of rows, returned as a single row.
     // NOTE(review): despite the "ca" prefix in the name, no State filter is
     // applied here; consider a more accurate name.
     val caCountMnNDF = mnmDF.agg(
       max("Count").as("Max"),
       min("Count").as("Min"),
       avg("Count").as("Avg"),
       count("*").as("Cuenta")
     )

     caCountMnNDF.show(10)

+---+---+-----------------+------+
|Max|Min|              Avg|Cuenta|
+---+---+-----------------+------+
|100| 10|55.00090000900009| 99999|
+---+---+-----------------+------+



caCountMnNDF: org.apache.spark.sql.DataFrame = [Max: int, Min: int ... 2 more fields]


In [13]:
// Same max / min / average / row-count statistics of Count, expressed as
// Spark SQL over the "mnmDFView" temporary view
spark.sql("""SELECT max(Count) as Max, min(Count) as Min, avg(Count) as Avg, count(*) as Cuenta
             FROM mnmDFView""").show(10)

+---+---+-----------------+------+
|Max|Min|              Avg|Cuenta|
+---+---+-----------------+------+
|100| 10|55.00090000900009| 99999|
+---+---+-----------------+------+



In [30]:
// Path of the plain-text copy of "El Quijote" on the local disk
val quijote="C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/el_quijote.txt"
// Load the file with the text source: every line of the file becomes one row
// in a single string column named "value". No "header" / "inferSchema"
// options are set because they belong to the CSV source and the text source
// does not use them.
val qj_df = spark.read.text(quijote)
// Print the first 5 lines without truncating them
qj_df.show(5,false)

+----------------------------------------------------------------------------------------------+
|value                                                                                         |
+----------------------------------------------------------------------------------------------+
|DON QUIJOTE DE LA MANCHA                                                                      |
|Miguel de Cervantes Saavedra                                                                  |
|                                                                                              |
|PRIMERA PARTE                                                                                 |
|CAPI?TULO 1: Que trata de la condicio?n y ejercicio del famoso hidalgo D. Quijote de la Mancha|
+----------------------------------------------------------------------------------------------+
only showing top 5 rows



quijote: String = C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/el_quijote.txt
qj_df: org.apache.spark.sql.DataFrame = [value: string]


In [8]:
// With no arguments, show() prints at most 20 rows and shortens long values
qj_df.show()

+--------------------+
|               value|
+--------------------+
|DON QUIJOTE DE LA...|
|Miguel de Cervant...|
|                    |
|       PRIMERA PARTE|
|CAPI?TULO 1: Que ...|
|En un lugar de la...|
|Tuvo muchas veces...|
|En resolucio?n, e...|
|historia ma?s cie...|
|Deci?a e?l, que e...|
|En efecto, remata...|
|Imagina?base el p...|
|linaje y patria, ...|
|Limpias, pues, su...|
|Capi?tulo 2: Que ...|
|Hechas, pues, est...|
|Estos pensamiento...|
|Con estos iba ens...|
|Autores hay que d...|
|muertos de hambre...|
+--------------------+
only showing top 20 rows



In [31]:
// Print only the first 5 rows; long values are still shortened
qj_df.show(5)

+--------------------+
|               value|
+--------------------+
|DON QUIJOTE DE LA...|
|Miguel de Cervant...|
|                    |
|       PRIMERA PARTE|
|CAPI?TULO 1: Que ...|
+--------------------+
only showing top 5 rows



In [4]:
// Number of rows in the DataFrame (the text source yields one row per line)
qj_df.count()

res3: Long = 2186


In [5]:
qj_df.first()

// Returns the first row, i.e. the first line of the text

res4: org.apache.spark.sql.Row = [DON QUIJOTE DE LA MANCHA]


In [6]:
qj_df.head()

// Also returns the first row (same result as first()); it is not a header

res5: org.apache.spark.sql.Row = [DON QUIJOTE DE LA MANCHA]


In [7]:
qj_df.take(5)

// Returns the first 5 rows as an Array[Row]

res6: Array[org.apache.spark.sql.Row] = Array([DON QUIJOTE DE LA MANCHA], [Miguel de Cervantes Saavedra], [], [PRIMERA PARTE], [CAPI?TULO 1: Que trata de la condicio?n y ejercicio del famoso hidalgo D. Quijote de la Mancha])
