## Ejemplo M&Ms

#### Realizamos los imports y definimos el fichero .csv

In [0]:
# NOTE(review): the wildcard import brings every pyspark.sql.functions name into the notebook
# namespace, including min, max and sum, which replace the Python builtins of the same name.
from pyspark.sql.functions import *

# Location of the M&M CSV dataset that the notebook loads.
# NOTE(review): the path is hard-coded and embeds a user e-mail address; presumably a
# Databricks FileStore upload - confirm before sharing or running elsewhere.
file = "/FileStore/shared_uploads/giovanni.rodriguez@bosonit.com/mnm_dataset.csv"

#### Leemos del fichero e inferimos el esquema

In [0]:
# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the data instead of using a declared schema.
mnmDf = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(file)
)
# Mark the DataFrame for caching because every later cell aggregates it again.
# Kept as the last expression so the cell still echoes the DataFrame schema.
mnmDf.persist()

Out[2]: DataFrame[State: string, Color: string, Count: int]

#### Contamos el valor de los colores agrupándolos por estado y color, para luego ordenarlos de forma descendente

In [0]:
# Per (State, Color) pair, count the non-null "Count" entries and sort the
# result so the largest totals come first.
# Note: count() tallies rows; it does not add up the values in "Count".
countMnmDf = (
    mnmDf
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .orderBy(desc("Total"))
)
# show() prints only the first 20 rows by default.
countMnmDf.show()

+-----+------+-----+
|State| Color|Total|
+-----+------+-----+
|   CA|Yellow| 1807|
|   WA| Green| 1779|
|   OR|Orange| 1743|
|   TX| Green| 1737|
|   TX|   Red| 1725|
|   CA| Green| 1723|
|   CO|Yellow| 1721|
|   CA| Brown| 1718|
|   CO| Green| 1713|
|   NV|Orange| 1712|
|   TX|Yellow| 1703|
|   NV| Green| 1698|
|   AZ| Brown| 1698|
|   WY| Green| 1695|
|   CO|  Blue| 1695|
|   NM|   Red| 1690|
|   AZ|Orange| 1689|
|   NM|Yellow| 1688|
|   NM| Brown| 1687|
|   UT|Orange| 1684|
+-----+------+-----+
only showing top 20 rows



#### Contamos el valor de los colores agrupándolos por estado y color, sacamos la media de estos totales por color, y por último los ordenamos de forma ascendente

In [0]:
# Two-stage aggregation:
#   1. count the non-null "Count" entries for every (State, Color) pair;
#   2. average those per-state totals for each Color.
# The result is sorted ascending, i.e. the smallest average comes first.
avgMnmColorDf = (
    mnmDf
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .groupBy("Color")
    .agg(avg("Total").alias("Average"))
    .orderBy(asc("Average"))
)
avgMnmColorDf.show()

+------+-------+
| Color|Average|
+------+-------+
|  Blue| 1644.9|
| Brown| 1651.0|
|   Red| 1661.9|
|Orange| 1669.7|
|Yellow| 1679.6|
| Green| 1692.8|
+------+-------+



#### Contamos el valor de los colores agrupándolos por estado, filtrando solo los estados cuyo código empieza por vocal

In [0]:
# Keep only the rows whose State value begins with an uppercase vowel
# (regex anchored at the start of the string), then count the non-null
# "Count" entries for each remaining state.
vowelCountMnmDf = (
    mnmDf
    .select("State", "Count")
    .where(col("State").rlike("^[AEIOU]"))
    .groupBy("State")
    .agg(count("Count").alias("Total"))
)
vowelCountMnmDf.show()

+-----+-----+
|State|Total|
+-----+-----+
|   AZ|10001|
|   OR| 9903|
|   UT| 9886|
+-----+-----+



#### Conteo de los colores por estado y color; después, agrupando por color, calculamos el mínimo, máximo, media y suma de esos conteos

In [0]:
# First count the non-null "Count" entries per (State, Color) pair, then
# summarise those per-state totals for each Color.
# min, max and sum here are the Spark column functions brought in by the
# wildcard import, not the Python builtins. The aggregates are left
# un-aliased, so Spark names the columns "min(Total)", "max(Total)", etc.
statsMnmDf = (
    mnmDf
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .groupBy("Color")
    .agg(
        min("Total"),
        max("Total"),
        avg("Total"),
        sum("Total"),
    )
)
statsMnmDf.show()

+------+----------+----------+----------+----------+
| Color|min(Total)|max(Total)|avg(Total)|sum(Total)|
+------+----------+----------+----------+----------+
|Orange|      1595|      1743|    1669.7|     16697|
| Green|      1591|      1779|    1692.8|     16928|
|  Blue|      1603|      1695|    1644.9|     16449|
| Brown|      1532|      1718|    1651.0|     16510|
|Yellow|      1614|      1807|    1679.6|     16796|
|   Red|      1610|      1725|    1661.9|     16619|
+------+----------+----------+----------+----------+



#### Creación de una vista temporal

In [0]:
# Register the per-color statistics as the temporary view "stats",
# replacing any existing temporary view with that name.
statsMnmDf.createOrReplaceTempView("stats")

# Read the view back by name and print all of its columns to confirm
# that the registration worked.
(
    spark.table("stats")
    .select("*")
    .show()
)

+------+----------+----------+----------+----------+
| Color|min(Total)|max(Total)|avg(Total)|sum(Total)|
+------+----------+----------+----------+----------+
|Orange|      1595|      1743|    1669.7|     16697|
| Green|      1591|      1779|    1692.8|     16928|
|  Blue|      1603|      1695|    1644.9|     16449|
| Brown|      1532|      1718|    1651.0|     16510|
|Yellow|      1614|      1807|    1679.6|     16796|
|   Red|      1610|      1725|    1661.9|     16619|
+------+----------+----------+----------+----------+

