# RETO BIG DATA

En este reto haremos varias consultas (queries) con PySpark.
Trabajaremos con el conjunto de datos Iris.

In [1]:
import findspark
findspark.init()  # run before importing pyspark, as in the original cell order

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# One configuration object shared by the context and the session, so the
# master URL and application name cannot disagree between the two.
conf = SparkConf().setMaster('local').setAppName('myApp')

# getOrCreate() reuses an already-running context instead of raising when
# this cell is executed a second time in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)

# The session is built on top of the same configuration / running context.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

In [2]:
import pandas as pd
from pyspark.sql import Row
from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, StructField, StructType
from pyspark.sql import functions as F

# Raw GitHub gist URL of iris.csv, pinned to a specific revision hash.
url_file = (
    "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/"
    "raw/d546eaee765268bf2f487608c537c05e22e4b221/iris.csv"
)

# pandas fetches and parses the CSV; the resulting frame is then handed to
# Spark to build the distributed DataFrame used by every exercise below.
iris_pandas = pd.read_csv(url_file)
iris = spark.createDataFrame(iris_pandas)
iris


DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]


## **NOTA**
```
## Recuerda: para consultar con SQL primero debes registrar el DataFrame como vista temporal
iris.createOrReplaceTempView("iris")

## EJEMPLO QUERY
query = """
SELECT *
FROM iris
"""

spark.sql(query).show()
```

In [3]:
# Register the DataFrame under the name "iris" so the spark.sql() queries
# below can reference it in their FROM clause.
iris.createOrReplaceTempView("iris")

# EJERCICIOS

### 1. ¿Cuántos registros tiene la base de datos?

<p style="color:#288CE1;">Spark SQL</p>

In [4]:
# Total number of rows in the DataFrame.
iris.count()

150

<p style="color:#995BD8;">SQL Query</p>

In [5]:
# Same row count, expressed as SQL against the "iris" temporary view.
query = """
SELECT count(*)
FROM iris
"""

spark.sql(query).show()

+--------+
|count(1)|
+--------+
|     150|
+--------+



### 2. ¿Cuáles son los valores mínimo y máximo de cada una de las variables?

<p style="color:#288CE1;">Spark SQL</p>

In [6]:
# Use the namespaced F.max / F.min (F is pyspark.sql.functions, imported
# above) instead of importing `max` and `min` by name, which would shadow the
# Python builtins for every later cell in the notebook.
attributes = iris.columns

# One small result table per column: its maximum and minimum value.
for attribute in attributes:
    iris.select([F.max(attribute), F.min(attribute)]).show()

+-----------------+-----------------+
|max(sepal_length)|min(sepal_length)|
+-----------------+-----------------+
|              7.9|              4.3|
+-----------------+-----------------+

+----------------+----------------+
|max(sepal_width)|min(sepal_width)|
+----------------+----------------+
|             4.4|             2.0|
+----------------+----------------+

+-----------------+-----------------+
|max(petal_length)|min(petal_length)|
+-----------------+-----------------+
|              6.9|              1.0|
+-----------------+-----------------+

+----------------+----------------+
|max(petal_width)|min(petal_width)|
+----------------+----------------+
|             2.5|             0.1|
+----------------+----------------+

+------------+------------+
|max(species)|min(species)|
+------------+------------+
|   virginica|      setosa|
+------------+------------+



<p style="color:#995BD8;">SQL Query</p>

In [7]:
# Query template; {column} is filled in with each column name of the view.
MIN_MAX_SQL = """
    SELECT MAX({column}), MIN({column})
    FROM iris
    """

attributes = iris.columns

# Issue one MAX/MIN statement per column and print its single-row result.
for attribute in attributes:
    query = MIN_MAX_SQL.format(column=attribute)
    spark.sql(query).show()

+-----------------+-----------------+
|max(sepal_length)|min(sepal_length)|
+-----------------+-----------------+
|              7.9|              4.3|
+-----------------+-----------------+

+----------------+----------------+
|max(sepal_width)|min(sepal_width)|
+----------------+----------------+
|             4.4|             2.0|
+----------------+----------------+

+-----------------+-----------------+
|max(petal_length)|min(petal_length)|
+-----------------+-----------------+
|              6.9|              1.0|
+-----------------+-----------------+

+----------------+----------------+
|max(petal_width)|min(petal_width)|
+----------------+----------------+
|             2.5|             0.1|
+----------------+----------------+

+------------+------------+
|max(species)|min(species)|
+------------+------------+
|   virginica|      setosa|
+------------+------------+



### 3. ¿Cuántos registros tienen un valor de sepal_length mayor a 5.8? ¿Corresponden a una clase en específico?

<p style="color:#703C36;">70 registros tienen un largo de sépalo mayor a 5.8 y pertenecen a dos especies: virginica (44) y versicolor (26).</p>

<p style="color:#288CE1;">Spark SQL</p>

In [8]:
# Count the rows whose sepal length is strictly greater than 5.8.
sepal_above_threshold = F.col("sepal_length") > 5.8
iris.filter(sepal_above_threshold).count()

70

In [9]:
# Take the sepal_length > 5.8 subset and count its rows per species.
long_sepal_rows = iris.filter(F.col("sepal_length") > 5.8)
long_sepal_rows.groupBy("species").count().show()

+----------+-----+
|   species|count|
+----------+-----+
| virginica|   44|
|versicolor|   26|
+----------+-----+



<p style="color:#995BD8;">SQL Query</p>

In [10]:
# SQL version: number of rows with sepal_length strictly above 5.8.
query = """
SELECT COUNT(*)
FROM iris
WHERE sepal_length > 5.8
"""

spark.sql(query).show()

+--------+
|count(1)|
+--------+
|      70|
+--------+



In [11]:
# SQL version: the same sepal_length > 5.8 subset, counted per species.
query = """
SELECT species, COUNT(*)
FROM iris
WHERE sepal_length > 5.8
GROUP BY species
"""

spark.sql(query).show()

+----------+--------+
|   species|count(1)|
+----------+--------+
| virginica|      44|
|versicolor|      26|
+----------+--------+



### 4. Selecciona aquellos registros cuya especie sea virginica

<p style="color:#288CE1;">Spark SQL</p>

In [12]:
# Keep only the rows whose species column equals "virginica".
is_virginica = F.col("species") == "virginica"
iris.filter(is_virginica).show()

+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width|  species|
+------------+-----------+------------+-----------+---------+
|         6.3|        3.3|         6.0|        2.5|virginica|
|         5.8|        2.7|         5.1|        1.9|virginica|
|         7.1|        3.0|         5.9|        2.1|virginica|
|         6.3|        2.9|         5.6|        1.8|virginica|
|         6.5|        3.0|         5.8|        2.2|virginica|
|         7.6|        3.0|         6.6|        2.1|virginica|
|         4.9|        2.5|         4.5|        1.7|virginica|
|         7.3|        2.9|         6.3|        1.8|virginica|
|         6.7|        2.5|         5.8|        1.8|virginica|
|         7.2|        3.6|         6.1|        2.5|virginica|
|         6.5|        3.2|         5.1|        2.0|virginica|
|         6.4|        2.7|         5.3|        1.9|virginica|
|         6.8|        3.0|         5.5|        2.1|virginica|
|       

<p style="color:#995BD8;">SQL Query</p>

In [13]:
# SQL version: rows whose species equals 'virginica'.
# The literal is single-quoted: that is the standard SQL string syntax, it
# matches the other string literal used in this notebook ('set%'), and it
# cannot be mistaken for a column identifier by parsers that treat double
# quotes as identifier quoting.
query = """
SELECT *
FROM iris
WHERE species = 'virginica'
"""

spark.sql(query).show()

+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width|  species|
+------------+-----------+------------+-----------+---------+
|         6.3|        3.3|         6.0|        2.5|virginica|
|         5.8|        2.7|         5.1|        1.9|virginica|
|         7.1|        3.0|         5.9|        2.1|virginica|
|         6.3|        2.9|         5.6|        1.8|virginica|
|         6.5|        3.0|         5.8|        2.2|virginica|
|         7.6|        3.0|         6.6|        2.1|virginica|
|         4.9|        2.5|         4.5|        1.7|virginica|
|         7.3|        2.9|         6.3|        1.8|virginica|
|         6.7|        2.5|         5.8|        1.8|virginica|
|         7.2|        3.6|         6.1|        2.5|virginica|
|         6.5|        3.2|         5.1|        2.0|virginica|
|         6.4|        2.7|         5.3|        1.9|virginica|
|         6.8|        3.0|         5.5|        2.1|virginica|
|       

### 5. ¿Cuáles registros tienen un petal_width menor a 1.5?

<p style="color:#288CE1;">Spark SQL</p>

In [14]:
# Show the rows whose petal width is strictly below 1.5.
narrow_petal = F.col("petal_width") < 1.5
iris.filter(narrow_petal).show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

<p style="color:#995BD8;">SQL Query</p>

In [15]:
# SQL version: rows with petal_width strictly below 1.5.
query = """
SELECT *
FROM iris
WHERE petal_width < 1.5
"""

spark.sql(query).show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

### 6. Encuentra los registros que tengan un valor de petal_length entre 1.0 y 3.0

<p style="color:#288CE1;">Spark SQL</p>

In [16]:
# Rows whose petal length lies in the range 1..3 as defined by Column.between.
petal_length_in_range = F.col("petal_length").between(1, 3)
iris.filter(petal_length_in_range).show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

<p style="color:#995BD8;">SQL Query</p>

In [17]:
# SQL version of the "between 1.0 and 3.0" exercise.
# BETWEEN includes both bounds, so this returns the same rows as the
# DataFrame version (Column.between). The previous strict comparison
# (> 1 AND < 3) silently dropped the rows with petal_length exactly 1.0 or
# 3.0, both of which occur in this dataset.
query = """
SELECT *
FROM iris
WHERE petal_length BETWEEN 1.0 AND 3.0
"""

spark.sql(query).show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

### 7. ¿Cuántos registros hay de cada clase?

<p style="color:#288CE1;">Spark SQL</p>

In [18]:
# Number of rows in each species group.
iris.groupBy("species").count().show()

+----------+-----+
|   species|count|
+----------+-----+
| virginica|   50|
|versicolor|   50|
|    setosa|   50|
+----------+-----+



<p style="color:#995BD8;">SQL Query</p>

In [19]:
# SQL version: rows per species.
# NOTE(review): COUNT(species) counts only non-NULL species values, whereas
# the DataFrame groupBy().count() counts every row; the two agree here
# because the captured output shows 50 rows for each of the three species.
query = """
SELECT species, COUNT(species)
FROM iris
GROUP BY species
"""

spark.sql(query).show()

+----------+--------------+
|   species|count(species)|
+----------+--------------+
| virginica|            50|
|versicolor|            50|
|    setosa|            50|
+----------+--------------+



### 8. Encuentra la clase setosa usando sus tres primeras letras

<p style="color:#288CE1;">Spark SQL</p>

In [20]:
# Select the rows whose species name begins with the prefix "set".
starts_with_set = F.col("species").startswith("set")
iris.filter(starts_with_set).show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

<p style="color:#995BD8;">SQL Query</p>

In [21]:
# SQL version: LIKE 'set%' matches species values that start with "set".
query = """
SELECT *
FROM iris
WHERE species LIKE 'set%'
"""

spark.sql(query).show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

### Extra: Hay una clase que se separa de las demás, ¿comparando sus diferentes variables puedes encontrarla?


<p style="color:#703C36;">Setosa</p>
    
<p style="color:#703C36;">Comparando los valores máximos y mínimos de cada atributo por especie, se nota que el pétalo de la clase setosa es más pequeño, tanto en ancho como en largo, que el de las otras especies: su petal_length máximo (1.9) queda por debajo del mínimo de versicolor (3.0) y de virginica (4.5).</p>



<p style="color:#288CE1;">Spark SQL</p>

In [24]:
# Per-species range of every measurement column.
# - F.max / F.min are used instead of importing `max` / `min` by name, which
#   would shadow the Python builtins.
# - The column list is built here rather than taken from an earlier cell, so
#   this cell does not depend on execution order.
# - "species" is excluded: aggregating the grouping column against itself
#   only echoes the group key.
measurement_columns = [column for column in iris.columns if column != "species"]

for attribute in measurement_columns:
    iris.groupBy("species").agg(F.max(attribute), F.min(attribute)).show()

+----------+-----------------+-----------------+
|   species|max(sepal_length)|min(sepal_length)|
+----------+-----------------+-----------------+
| virginica|              7.9|              4.9|
|versicolor|              7.0|              4.9|
|    setosa|              5.8|              4.3|
+----------+-----------------+-----------------+

+----------+----------------+----------------+
|   species|max(sepal_width)|min(sepal_width)|
+----------+----------------+----------------+
| virginica|             3.8|             2.2|
|versicolor|             3.4|             2.0|
|    setosa|             4.4|             2.3|
+----------+----------------+----------------+

+----------+-----------------+-----------------+
|   species|max(petal_length)|min(petal_length)|
+----------+-----------------+-----------------+
| virginica|              6.9|              4.5|
|versicolor|              5.1|              3.0|
|    setosa|              1.9|              1.0|
+----------+-----------------+--

<p style="color:#995BD8;">SQL Query</p>

In [23]:
attributes = iris.columns

# Only the measurement columns are compared; MAX/MIN of "species" grouped by
# "species" would just repeat the group key.
measurement_columns = [column for column in attributes if column != "species"]

# One GROUP BY statement per measurement column. The statement carries no
# trailing ';', in line with every other query passed to spark.sql() in this
# notebook.
for attribute in measurement_columns:
    query = """
    SELECT species, MAX({column}), MIN({column})
    FROM iris
    GROUP BY species
    """.format(column=attribute)

    spark.sql(query).show()

+----------+-----------------+-----------------+
|   species|max(sepal_length)|min(sepal_length)|
+----------+-----------------+-----------------+
| virginica|              7.9|              4.9|
|versicolor|              7.0|              4.9|
|    setosa|              5.8|              4.3|
+----------+-----------------+-----------------+

+----------+----------------+----------------+
|   species|max(sepal_width)|min(sepal_width)|
+----------+----------------+----------------+
| virginica|             3.8|             2.2|
|versicolor|             3.4|             2.0|
|    setosa|             4.4|             2.3|
+----------+----------------+----------------+

+----------+-----------------+-----------------+
|   species|max(petal_length)|min(petal_length)|
+----------+-----------------+-----------------+
| virginica|              6.9|              4.5|
|versicolor|              5.1|              3.0|
|    setosa|              1.9|              1.0|
+----------+-----------------+--