In [None]:
!apt-get install openjdk-8-jdk -qq > /dev/null

!wget -q http://www-eu.apache.org/dist/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz

!tar xf spark-3.0.2-bin-hadoop2.7.tgz

!pip install -q findspark

In [None]:
import os

# Tell findspark/pyspark where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.2-bin-hadoop2.7",
})

In [None]:
import findspark

# Must run before importing pyspark: findspark locates the Spark installation
# (here through the SPARK_HOME variable set in the previous cell).
findspark.init() # SPARK_HOME

from pyspark.sql import SparkSession

# Local session; "local[*]" is the master URL passed to the builder.
spark = SparkSession.builder.master("local[*]").getOrCreate()



# Spark ML Transformación de Variables

Cargamos un dataset con información sobre cuán seguro es un coche. Con este dataset se estudiarán funciones muy importantes de Spark ML.



### Crear SparkSession
Nota: en DATIO no es necesario crear la SparkSession ya que al iniciar un notebook con el Kernel PySpark Python3 - Spark 2.1.0  se crea automáticamente.

In [None]:
# Answer

from pyspark.sql import SparkSession

# Same builder call as in the setup cell; getOrCreate() hands back the session
# that is already running instead of starting a second one.
spark = SparkSession.builder.master("local[*]").getOrCreate()



### Cargar datos y comprobar schema

El método _read.csv_ tiene un parámetro _inferSchema_. El mismo permite inferir el tipo de las columnas, para ello requiere recorrer una vez más los datos y por defecto es _False_.

In [None]:
# Colab only: mount Google Drive under /content/drive so the dataset can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Answer
DATA_PATH = '/content/drive/MyDrive/2021Q1_DSF_contents/DATA/'

# Semicolon-separated file with a header row; inferSchema asks Spark to derive
# the column types from the data instead of reading everything as string.
cars = (
    spark.read
    .options(sep=';', header=True, inferSchema=True)
    .csv(DATA_PATH + 'automobile.csv')
)

cars.printSchema()

root
 |-- normalized_losses: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- aspiration: string (nullable = true)
 |-- num_of_doors: string (nullable = true)
 |-- body_style: string (nullable = true)
 |-- drive_wheels: string (nullable = true)
 |-- engine_location: string (nullable = true)
 |-- wheel_base: double (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- height: double (nullable = true)
 |-- curb_weight: integer (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- num_of_cylinders: string (nullable = true)
 |-- engine_size: integer (nullable = true)
 |-- fuel_system: string (nullable = true)
 |-- bore: double (nullable = true)
 |-- stroke: double (nullable = true)
 |-- compression_ratio: double (nullable = true)
 |-- horsepower: integer (nullable = true)
 |-- peak_rpm: integer (nullable = true)
 |-- city_mpg: integer (nullable = true)
 |-- highway_mpg: intege



### VectorAssembler



Un _VectorAssembler_ es un transformador de múltiples características ( _features_ ) en una sola columna de tipo vector. Lo construiremos con todas las variables menos con la columna objetivo 'symboling'.

In [None]:
# Schema of every candidate feature column, i.e. everything except the target 'symboling'.
cars.drop('symboling').printSchema()

root
 |-- normalized_losses: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- aspiration: string (nullable = true)
 |-- num_of_doors: string (nullable = true)
 |-- body_style: string (nullable = true)
 |-- drive_wheels: string (nullable = true)
 |-- engine_location: string (nullable = true)
 |-- wheel_base: double (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- height: double (nullable = true)
 |-- curb_weight: integer (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- num_of_cylinders: string (nullable = true)
 |-- engine_size: integer (nullable = true)
 |-- fuel_system: string (nullable = true)
 |-- bore: double (nullable = true)
 |-- stroke: double (nullable = true)
 |-- compression_ratio: double (nullable = true)
 |-- horsepower: integer (nullable = true)
 |-- peak_rpm: integer (nullable = true)
 |-- city_mpg: integer (nullable = true)
 |-- highway_mpg: intege

In [None]:
# Answer

from pyspark.ml.feature import VectorAssembler
from pyspark.sql.utils import IllegalArgumentException

# Every column except the target, string columns included.
assembler = VectorAssembler(inputCols=[element for element in cars.columns if element != 'symboling'], outputCol='assembled_features')

# This attempt is the demonstration of a failure (string columns are rejected).
# The error is caught and displayed so that "Run All" does not stop here.
try:
    cars_assembled = assembler.transform(cars)
    cars_assembled.show()
except IllegalArgumentException as error:
    print('IllegalArgumentException: {}'.format(error))

IllegalArgumentException: ignored



Estudiando el error se lee:
    **IllegalArgumentException: 'Data type StringType is not supported.'**
    
Recordamos que VectorAssembler solo acepta los siguientes tipos de datos:

- numéricos
- booleanos
- vector
    



Estudiamos el tipo de cada una de las variables y hacemos VectorAssembler para todas las variables cuyos tipos sí están permitidos. Es decir el _VectorAssembler_ no debe incluir columnas de tipo _string_.

In [None]:
# Answer

# List of (column name, type name) pairs, used to pick the columns VectorAssembler accepts.
cars.dtypes

[('normalized_losses', 'int'),
 ('make', 'string'),
 ('fuel_type', 'string'),
 ('aspiration', 'string'),
 ('num_of_doors', 'string'),
 ('body_style', 'string'),
 ('drive_wheels', 'string'),
 ('engine_location', 'string'),
 ('wheel_base', 'double'),
 ('length', 'double'),
 ('width', 'double'),
 ('height', 'double'),
 ('curb_weight', 'int'),
 ('engine_type', 'string'),
 ('num_of_cylinders', 'string'),
 ('engine_size', 'int'),
 ('fuel_system', 'string'),
 ('bore', 'double'),
 ('stroke', 'double'),
 ('compression_ratio', 'double'),
 ('horsepower', 'int'),
 ('peak_rpm', 'int'),
 ('city_mpg', 'int'),
 ('highway_mpg', 'int'),
 ('price', 'int'),
 ('symboling', 'int')]

In [None]:
# Answer

from py4j.protocol import Py4JJavaError

# Non-string columns only, leaving out the target column 'symboling'.
columns_assemble = [element[0] for element in cars.dtypes if element[1] != 'string' and element[0] != 'symboling']

assembler = VectorAssembler(inputCols=columns_assemble, outputCol='assembled_features')

cars_assembled = assembler.transform(cars)

# Showing the rows is the step that fails in this demonstration. Catch the
# error so "Run All" can continue, and print only the root-cause lines rather
# than the whole Java stack trace.
try:
    cars_assembled.show()
except Py4JJavaError as error:
    causes = [line for line in str(error).splitlines() if line.startswith('Caused by')]
    print('Py4JJavaError')
    print('\n'.join(causes))

Py4JJavaError: ignored



Ha vuelto a fallar, ¿qué ocurre?

En la versión de Spark 2.1 el mensaje no parece aportar muchos indicios sobre la causa del error. Sin embargo, en la versión de Spark 2.2 el error se describe de la siguiente manera:
    
**Caused by: org.apache.spark.SparkException: Values to assemble cannot be null.**

Así pues, es necesario filtrar correctamente los valores nulos antes de aplicar un VectorAssembler.




Quitaremos todas las filas con nulos:

In [None]:
# Answer

# Start from the cached full dataset and filter it column by column.
cars_no_nulls = cars.cache()

for element in cars.columns:
    # Count the nulls once per column: each count() launches a Spark job, and
    # the two branches are mutually exclusive.
    null_count = cars.where(cars[element].isNull()).count()
    if null_count != 0:
        print('\tThe column "{}" has null values'.format(element))
        # Keep only the rows where this column is populated.
        cars_no_nulls = cars_no_nulls.where(cars[element].isNotNull())
    else:
        print('The column "{}" does not have null values'.format(element))

The column "normalized_losses" does not have null values
	The column "make" has null values
The column "fuel_type" does not have null values
The column "aspiration" does not have null values
The column "num_of_doors" does not have null values
	The column "body_style" has null values
The column "drive_wheels" does not have null values
The column "engine_location" does not have null values
The column "wheel_base" does not have null values
The column "length" does not have null values
The column "width" does not have null values
The column "height" does not have null values
The column "curb_weight" does not have null values
The column "engine_type" does not have null values
The column "num_of_cylinders" does not have null values
The column "engine_size" does not have null values
The column "fuel_system" does not have null values
	The column "bore" has null values
The column "stroke" does not have null values
The column "compression_ratio" does not have null values
	The column "horsepower"

In [None]:
# Answer

# Same assembler definition as before (non-string columns, target excluded).
assembler = VectorAssembler(inputCols=columns_assemble, outputCol='assembled_features')

cars_assembled = assembler.transform(cars_no_nulls) # note: the input is now cars_no_nulls, not cars

cars_assembled.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+--------------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|  assembled_features|
+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+--------------------+
|              113|     mazda|      gas|       std|      [four]|     seda

In [None]:
# Compare the first three assembled input columns with the resulting vector column.
cars_assembled.select(columns_assemble[:3] + ['assembled_features']).show()

+-----------------+----------+------+--------------------+
|normalized_losses|wheel_base|length|  assembled_features|
+-----------------+----------+------+--------------------+
|              113|      93.1| 166.8|[113.0,93.1,166.8...|
|               87|      95.7| 158.7|[87.0,95.7,158.7,...|
|              125|      96.3| 172.4|[125.0,96.3,172.4...|
|              148|      93.7| 157.3|[148.0,93.7,157.3...|
|              150|      95.3| 169.0|[150.0,95.3,169.0...|
|              194|      91.3| 170.7|[194.0,91.3,170.7...|
|                0|     102.0| 191.7|[0.0,102.0,191.7,...|
|              128|     100.4| 181.7|[128.0,100.4,181....|
|              161|     107.9| 186.7|[161.0,107.9,186....|
|              115|      98.8| 177.8|[115.0,98.8,177.8...|
|              192|     101.2| 176.8|[192.0,101.2,176....|
|              168|      95.1| 162.4|[168.0,95.1,162.4...|
|              161|      93.7| 157.3|[161.0,93.7,157.3...|
|              110|     103.3| 174.6|[110.0,103.3,174...



**¡¡Ahora se ha podido crear el VectorAssembler!!**

Pero únicamente se han seleccionado aquellas variables que no son de tipo string. 



### StringIndexer



* Hagamos StringIndexer para la variable 'make' que representa la marca del auto, cogiendo el dataset inicial, `cars`

In [None]:
# Answer

from py4j.protocol import Py4JJavaError
from pyspark.ml.feature import StringIndexer

feature_indexer = StringIndexer(inputCol='make', outputCol='make_indexed')

# This attempt uses the original dataset `cars` and is the demonstration of a
# failure. The error is caught so "Run All" can continue, and only the
# root-cause lines of the Java stack trace are printed.
try:
    feature_indexer_model = feature_indexer.fit(cars)
    cars_indexed = feature_indexer_model.transform(cars)
    cars_indexed.show()
except Py4JJavaError as error:
    causes = [line for line in str(error).splitlines() if line.startswith('Caused by')]
    print('Py4JJavaError')
    print('\n'.join(causes))

Py4JJavaError: ignored



De nuevo se produce un error. En la versión de Spark 2.1 el mensaje no parece aportar muchos indicios acerca del mismo.
En la versión de Spark 2.2 el error dice lo siguiente: **Caused by: org.apache.spark.SparkException: StringIndexer encountered NULL value. To handle or skip NULLS, try setting StringIndexer.handleInvalid.**

Es importante haber tratado correctamente los nulos antes.

¿Qué desventaja tendría utilizar handleInvalid tal como se indica?

In [None]:
# Answer

from pyspark.ml.feature import StringIndexer

# Index the car brand ('make') into the numeric column 'make_indexed'.
feature_indexer = StringIndexer(inputCol='make', outputCol='make_indexed')

feature_indexer_model = feature_indexer.fit(cars_no_nulls) # note: fitted on cars_no_nulls this time

cars_indexed = feature_indexer_model.transform(cars_no_nulls)

cars_indexed.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|make_indexed|
+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+
|              113|     mazda|      gas|       std|      [four]|     sedan|         fwd|         



Si se accede a `feature_indexer_model.labels` se obtiene un vector construido por `StringIndexer`. El vector está ordenado por la frecuencia de los valores, por lo tanto el valor más frecuente tiene índice 0.

In [None]:
# Answer

# Labels learned by the fitted indexer; a label's position in the list is its index.
feature_indexer_model.labels

['toyota',
 'mazda',
 'nissan',
 'honda',
 'mitsubishi',
 'subaru',
 'peugot',
 'volkswagen',
 'volvo',
 'dodge',
 'bmw',
 'mercedes-benz',
 'audi',
 'plymouth',
 'saab',
 'porsche',
 'isuzu',
 'alfa-romero',
 'jaguar',
 'chevrolet',
 'renault',
 'mercury']



¿Qué más variables se pueden transformar con StringIndexer para ser incluidas en los modelos de Machine Learning? Ojo con `num_of_doors`. Transforma todas las restantes y actualiza el dataset sin nulos.

In [None]:
# Current state of the indexed dataset (only 'make_indexed' has been added so far).
cars_indexed.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|make_indexed|
+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+
|              113|     mazda|      gas|       std|      [four]|     sedan|         fwd|         

In [None]:
# Column types, used next to pick the remaining string columns.
cars_indexed.dtypes

[('normalized_losses', 'int'),
 ('make', 'string'),
 ('fuel_type', 'string'),
 ('aspiration', 'string'),
 ('num_of_doors', 'string'),
 ('body_style', 'string'),
 ('drive_wheels', 'string'),
 ('engine_location', 'string'),
 ('wheel_base', 'double'),
 ('length', 'double'),
 ('width', 'double'),
 ('height', 'double'),
 ('curb_weight', 'int'),
 ('engine_type', 'string'),
 ('num_of_cylinders', 'string'),
 ('engine_size', 'int'),
 ('fuel_system', 'string'),
 ('bore', 'double'),
 ('stroke', 'double'),
 ('compression_ratio', 'double'),
 ('horsepower', 'int'),
 ('peak_rpm', 'int'),
 ('city_mpg', 'int'),
 ('highway_mpg', 'int'),
 ('price', 'int'),
 ('symboling', 'int'),
 ('make_indexed', 'double')]

In [None]:
# Columns still to be indexed: those typed 'string' (or 'bool'), skipping
# 'make', which already has its 'make_indexed' counterpart.
# NOTE(review): Spark type names normally read 'boolean', so the 'bool' entry
# probably never matches - confirm before relying on it.
categorical_columns = [
    name
    for name, dtype in cars_indexed.dtypes
    if name != 'make' and dtype in ('string', 'bool')
]
categorical_columns

['fuel_type',
 'aspiration',
 'num_of_doors',
 'body_style',
 'drive_wheels',
 'engine_location',
 'engine_type',
 'num_of_cylinders',
 'fuel_system']

In [None]:
# Add one '<column>_indexed' column per categorical column, accumulating the result in cars_indexed.
for x in categorical_columns:
    print(x)
    feature_indexer = StringIndexer(inputCol=x, outputCol=x+'_indexed')

    feature_index = feature_indexer.fit(cars_indexed) # fit on cars_indexed, which already carries the earlier indexed columns
    cars_indexed = feature_index.transform(cars_indexed)



fuel_type
aspiration
num_of_doors
body_style
drive_wheels
engine_location
engine_type
num_of_cylinders
fuel_system


In [None]:
# First five rows, now including every '*_indexed' column.
cars_indexed.show(5)

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+-----------------+------------------+--------------------+------------------+--------------------+-----------------------+-------------------+------------------------+-------------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|make_indexed|fuel_type_indexed|aspiration_indexed|num_of_doors_indexed|body_style_indexed|drive_wheels_indexed|engine_location_indexed|engine_type_indexed|num_of_cylinders_indexed|fuel_system_indexed|
+-----------------+-----



### CountVectorizer



* Hagamos CountVectorizer para la variable 'num_of_doors'. 

| num_of_doors   |
| -------------: |
| [four]| 
| [two,four]     | 


In [None]:
# Answer

from pyspark.ml.feature import CountVectorizer
from pyspark.sql.utils import IllegalArgumentException

feature_cv = CountVectorizer(inputCol='num_of_doors', outputCol='doors_counter')

# This attempt is the demonstration of a failure ('num_of_doors' is still a
# string column in `cars`). The error is caught so "Run All" can continue.
try:
    model_cv = feature_cv.fit(cars)

    # transform() belongs to the fitted model, not to the estimator.
    cars_cv = model_cv.transform(cars)

    cars_cv.show()
except IllegalArgumentException as error:
    print('IllegalArgumentException: {}'.format(error))

IllegalArgumentException: ignored



Mirando el schema se ve que 'num_of_doors' no tiene el formato correcto (es de tipo _string_). Vamos a convertirlo a _ArrayType(StringType())_

In [None]:
# Check the declared type of 'num_of_doors' in the original dataset.
cars.printSchema()

root
 |-- normalized_losses: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- aspiration: string (nullable = true)
 |-- num_of_doors: string (nullable = true)
 |-- body_style: string (nullable = true)
 |-- drive_wheels: string (nullable = true)
 |-- engine_location: string (nullable = true)
 |-- wheel_base: double (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- height: double (nullable = true)
 |-- curb_weight: integer (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- num_of_cylinders: string (nullable = true)
 |-- engine_size: integer (nullable = true)
 |-- fuel_system: string (nullable = true)
 |-- bore: double (nullable = true)
 |-- stroke: double (nullable = true)
 |-- compression_ratio: double (nullable = true)
 |-- horsepower: integer (nullable = true)
 |-- peak_rpm: integer (nullable = true)
 |-- city_mpg: integer (nullable = true)
 |-- highway_mpg: intege

In [None]:
# Answer

import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType, DoubleType

# Turn the textual list (e.g. "[two,four]") into an array<string> column:
# strip the square brackets, then split on commas. Built-in column functions
# are used instead of a Python UDF, so null values simply stay null.
# The dtype guard keeps the cell re-runnable: once the column is already an
# array there is nothing left to convert.
if dict(cars_no_nulls.dtypes)['num_of_doors'] == 'string':
    cars_no_nulls = cars_no_nulls.withColumn(
        'num_of_doors',
        F.split(F.regexp_replace(F.col('num_of_doors'), r'[\[\]]', ''), ','))

cars_no_nulls.printSchema()

root
 |-- normalized_losses: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- aspiration: string (nullable = true)
 |-- num_of_doors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- body_style: string (nullable = true)
 |-- drive_wheels: string (nullable = true)
 |-- engine_location: string (nullable = true)
 |-- wheel_base: double (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- height: double (nullable = true)
 |-- curb_weight: integer (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- num_of_cylinders: string (nullable = true)
 |-- engine_size: integer (nullable = true)
 |-- fuel_system: string (nullable = true)
 |-- bore: double (nullable = true)
 |-- stroke: double (nullable = true)
 |-- compression_ratio: double (nullable = true)
 |-- horsepower: integer (nullable = true)
 |-- peak_rpm: integer (nullable = true)
 |-- city_mpg: int

In [None]:
# Answer

# Inspect the rows after converting 'num_of_doors' to an array column.
cars_no_nulls.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|
+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+
|              113|     mazda|      gas|       std|      [four]|     sedan|         fwd|          front|      93.1| 166.8| 64.2|  54.1| 



Volvamos a probar otra vez:

In [None]:
# Answer

# Second attempt, now on cars_no_nulls where 'num_of_doors' is an array of strings.
feature_cv = CountVectorizer(inputCol='num_of_doors', outputCol='doors_counter')

# fit() returns the model; the model (not the estimator) performs the transform.
model_cv = feature_cv.fit(cars_no_nulls)

cars_cv = model_cv.transform(cars_no_nulls)

cars_cv.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+-------------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|      doors_counter|
+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+-------------------+
|              113|     mazda|      gas|       std|      [four]|     sedan| 



En la siguiente tabla se puede apreciar la conversión realizada con _CountVectorizer_

| num_of_doors   | doors_counter   |
| -------------: | -------------: |
| [four]| (2,[0],[1.0]) |
| [two,four]     | (2,[0,1],[1.0,1.0])|

La columna *doors_counter* contiene el vector disperso generado por el _CountVectorizerModel_, que se representa con tres campos. El primero indica el tamaño del vector, es decir, la cantidad de valores distintos que tiene la columna *num_of_doors*, en este caso 2. El segundo campo indica los índices del vector donde se han encontrado entradas con un valor diferente de cero. El tercer campo indica qué valores se encuentran en esos índices. Se puede saber con *model_cv.vocabulary* que 'four' corresponde al índice 0 del vector (si solo aparece 'four', el vector denso de 2 posiciones sería [1.0, 0.0]), y que 'two' corresponde al índice 1 del vector (si solo aparece 'two', el vector denso de 2 posiciones sería [0.0, 1.0]).





### OneHotEncoder



* Hagamos OneHotEncoder para la variable 'make' (recordar que contiene las marcas de distintos autos)

In [None]:
# Answer

from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.utils import IllegalArgumentException

feature_ohe = OneHotEncoder(inputCol='make', outputCol='make_onehotencoder')

# This attempt is the demonstration of a failure ('make' is a string column).
# The error is caught and displayed so that "Run All" can continue.
try:
    feature_ohe = feature_ohe.fit(cars_no_nulls)
    cars_ohe = feature_ohe.transform(cars_no_nulls)

    cars_ohe.show()
except IllegalArgumentException as error:
    print('IllegalArgumentException: {}'.format(error))

IllegalArgumentException: ignored



Salta el siguiente error: **IllegalArgumentException: 'requirement failed: Input column must be of type NumericType but got StringType'**

Para hacer un OneHotEncoder, equivalente a crear variables dummy, es necesario pasar antes por _StringIndexer_. Ya hemos realizado esto; por favor, recuerda la columna *make_indexed*.

Reutilizamos el ejemplo anterior:

In [None]:
# Answer

# Reuse the dataset produced in the StringIndexer section.
cars_indexed.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+-----------------+------------------+--------------------+------------------+--------------------+-----------------------+-------------------+------------------------+-------------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|make_indexed|fuel_type_indexed|aspiration_indexed|num_of_doors_indexed|body_style_indexed|drive_wheels_indexed|engine_location_indexed|engine_type_indexed|num_of_cylinders_indexed|fuel_system_indexed|
+-----------------+-----

In [None]:
# Confirm that the '*_indexed' columns are numeric before one-hot encoding.
cars_indexed.dtypes

[('normalized_losses', 'int'),
 ('make', 'string'),
 ('fuel_type', 'string'),
 ('aspiration', 'string'),
 ('num_of_doors', 'string'),
 ('body_style', 'string'),
 ('drive_wheels', 'string'),
 ('engine_location', 'string'),
 ('wheel_base', 'double'),
 ('length', 'double'),
 ('width', 'double'),
 ('height', 'double'),
 ('curb_weight', 'int'),
 ('engine_type', 'string'),
 ('num_of_cylinders', 'string'),
 ('engine_size', 'int'),
 ('fuel_system', 'string'),
 ('bore', 'double'),
 ('stroke', 'double'),
 ('compression_ratio', 'double'),
 ('horsepower', 'int'),
 ('peak_rpm', 'int'),
 ('city_mpg', 'int'),
 ('highway_mpg', 'int'),
 ('price', 'int'),
 ('symboling', 'int'),
 ('make_indexed', 'double'),
 ('fuel_type_indexed', 'double'),
 ('aspiration_indexed', 'double'),
 ('num_of_doors_indexed', 'double'),
 ('body_style_indexed', 'double'),
 ('drive_wheels_indexed', 'double'),
 ('engine_location_indexed', 'double'),
 ('engine_type_indexed', 'double'),
 ('num_of_cylinders_indexed', 'double'),
 ('fuel



Se aprecia que el dataframe `cars_indexed` ya incluye la variable `make_indexed` y es tipo numérica. Empezamos a trabajar a partir de aquí:

In [None]:
# Answer

# The estimator and the fitted model get different names so the cell can be
# re-run: rebinding `encoder` to the model would leave nothing to fit the
# second time. `encoder` still ends up as the fitted model.
make_encoder = OneHotEncoder(inputCol="make_indexed", outputCol="make_onehotencoder")
encoder = make_encoder.fit(cars_indexed)
cars_encoded = encoder.transform(cars_indexed)
cars_encoded.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+-----------------+------------------+--------------------+------------------+--------------------+-----------------------+-------------------+------------------------+-------------------+------------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|make_indexed|fuel_type_indexed|aspiration_indexed|num_of_doors_indexed|body_style_indexed|drive_wheels_indexed|engine_location_indexed|engine_type_indexed|num_of_cylinders_indexed|fuel_system_indexed|make_o



### Pasar resultados a columnas independientes

Tanto al hacer el CountVectorizer como el OneHotEncoder, los resultados se encuentran en un vector en una sola columna. Sería muy útil separar los resultados en columnas distintas.

Veamos cómo hacerlo.



**Para el caso de CountVectorizer**

Un posible ejemplo podría ser generar una columna *doors_four* y una columna *doors_two*.

| num_of_doors   | doors_counter   |doors_four|doors_two|
| -------------: | -------------: | -------------:| -------------:|
| [four]| (2,[0],[1.0]) |1.0|0.0|
| [two,four]     | (2,[0,1],[1.0,1.0])| 1.0|1.0|

Para esto, primero se crea la columna '*activated_index*', transformando *doors_counter* a tipo Vector Array.

In [None]:
# Answer

from pyspark.sql.types import DoubleType, ArrayType
import pyspark.sql.functions as F


@F.udf(returnType=ArrayType(DoubleType()))
def user_function(x):
    """Convert an ML vector into a plain list of floats."""
    return x.toArray().tolist()


# 'activated_index' holds the vector of 'doors_counter' as an array<double> column.
cars_cv = cars_cv.withColumn('activated_index', user_function(F.col('doors_counter')))

cars_cv.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+-------------------+---------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|      doors_counter|activated_index|
+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+-------------------+---------------+
|              113|     mazd



Ahora debemos modificar el vector resultante, *activated_index*, para que cada elemento se encuentre en una columna distinta. También debemos saber los distintos valores/elementos sobre los que se ha hecho el count, esto se puede hacer mediante  *model_cv.vocabulary*

In [None]:
# Answer

# Terms learned by the fitted CountVectorizer model; reused below to name the new columns.
vocab = model_cv.vocabulary

In [None]:
# Display the vocabulary.
vocab

['four', 'two']



Partimos nuestra columna 'activated_index' y renombramos las columnas resultantes con el tipo de evento correspondiente:

In [None]:
# Answer

# One column per vocabulary term: element i of 'activated_index' becomes 'doors_<term>'.
door_columns = [
    F.col("activated_index")[i].alias('doors_' + v)
    for i, v in enumerate(vocab)
]
cars_cv = cars_cv.select('*', *door_columns)

cars_cv.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+-------------------+---------------+----------+---------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|      doors_counter|activated_index|doors_four|doors_two|
+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+-------------------+---



¡Ya está hecho!




**Para el caso OneHotEncoder**

El proceso será equivalente con la diferencia de la procedencia de las distintas categorías.



Primero se crea una columna _ArrayType()_

In [None]:
# Answer

from pyspark.sql.types import DoubleType

# Convert the one-hot vector of 'make' into an array<double> column.
make_vector_to_list = F.udf(lambda x: x.toArray().tolist(), ArrayType(DoubleType()))
cars_encoded = cars_encoded.withColumn(
    'make_activated_index',
    make_vector_to_list(F.col('make_onehotencoder')),
)

cars_encoded.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+-----------------+------------------+--------------------+------------------+--------------------+-----------------------+-------------------+------------------------+-------------------+------------------+--------------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_system|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|symboling|make_indexed|fuel_type_indexed|aspiration_indexed|num_of_doors_indexed|body_style_indexed|drive_wheels_indexed|engine_location_indexed|engine_type_indexed|num_of_cylinders_indexed|fuel_



Modificar el vector resultante, *make_activated_index*, para que cada elemento se encuentre en una columna distinta



Debemos saber las distintas categorías que se han codificado. La diferencia aquí es que se ha aplicado un StringIndexer antes del OneHotEncoder, por lo que se debe volver al modelo del StringIndexer para recuperar las categorías.


In [None]:
# Answer

# Category labels recovered from the fitted indexer model; the printed output shows
# the car makes. NOTE(review): this rebinds `vocab`, replacing the vocabulary that
# was taken from model_cv earlier in the notebook.
vocab = feature_indexer_model.labels
print(vocab)

['toyota', 'mazda', 'nissan', 'honda', 'mitsubishi', 'subaru', 'peugot', 'volkswagen', 'volvo', 'dodge', 'bmw', 'mercedes-benz', 'audi', 'plymouth', 'saab', 'porsche', 'isuzu', 'alfa-romero', 'jaguar', 'chevrolet', 'renault', 'mercury']




Al inspeccionar las categorías observamos que aparecen símbolos no permitidos. Esto se debe a que existen marcas de autos como "mercedes-benz". El guion medio "-" no está permitido en los nombres de las columnas. Tomando esto en cuenta, partimos nuestra columna 'make_activated_index' en porciones y renombramos las columnas resultantes con la marca correspondiente:

In [None]:
# Answer

# Hyphens in make names (e.g. 'mercedes-benz') are swapped for underscores in the column names.
make_columns = [
    F.col("make_activated_index")[position].alias('make_' + brand.replace('-', '_'))
    for position, brand in enumerate(vocab)
]
cars_encoded = cars_encoded.select(cars_encoded.columns + make_columns)

cars_encoded.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+-----------------+------------------+--------------------+------------------+--------------------+-----------------------+-------------------+------------------------+-------------------+------------------+--------------------+-----------+----------+-----------+----------+---------------+-----------+-----------+---------------+----------+----------+--------+------------------+---------+-------------+---------+------------+----------+----------------+-----------+--------------+------------+------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel



* Estudiamos el comportamiento de OneHotEncoder

In [None]:
# Answer

# Display the make labels again to see which category comes last.
vocab

['toyota',
 'mazda',
 'nissan',
 'honda',
 'mitsubishi',
 'subaru',
 'peugot',
 'volkswagen',
 'volvo',
 'dodge',
 'bmw',
 'mercedes-benz',
 'audi',
 'plymouth',
 'saab',
 'porsche',
 'isuzu',
 'alfa-romero',
 'jaguar',
 'chevrolet',
 'renault',
 'mercury']



La última categoría es 'mercury'. Veamos qué pasa:

In [None]:
# Answer

# Look at a single row whose make is 'mercury'.
is_mercury = F.col('make') == 'mercury'
cars_encoded.where(is_mercury).show(1)

+-----------------+-------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+-----------------+------------------+--------------------+------------------+--------------------+-----------------------+-------------------+------------------------+-------------------+------------------+--------------------+-----------+----------+-----------+----------+---------------+-----------+-----------+---------------+----------+----------+--------+------------------+---------+-------------+---------+------------+----------+----------------+-----------+--------------+------------+------------+
|normalized_losses|   make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel_syste



Se aprecia cómo 'make_mercury' toma valor nulo. De hecho, siempre la última columna toma el valor nulo.

In [None]:
# Answer

# List every distinct value stored in 'make_mercury'.
mercury_values = cars_encoded.select('make_mercury').distinct()
mercury_values.show()

+------------+
|make_mercury|
+------------+
|        null|
+------------+





**¿Por qué?**

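Porque OneHotEncoder, por defecto (`dropLast=True`), supone que las categorías vistas son las únicas posibles para esa columna y, por lo tanto, una de ellas es combinación lineal del resto. Por esta razón desestima la última de las categorías: el vector resultante tiene un elemento menos que la lista de categorías y, al acceder a esa última posición, se obtiene un valor nulo.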

Hay situaciones de selección de variables donde todas deben estar presentes. Veamos cómo forzar la aparición de esta categoría también.

In [None]:
# Answer

# Rebuild the indicator for the last make category (vocab[-1], 'mercury' in this dataset),
# which came out as null: 1.0 when `make` equals that category, 0.0 otherwise
# (a null `make` also yields 0.0).
# A native when/otherwise expression replaces the Python UDF, and the target column is
# named explicitly instead of relying on it being the last column of the dataframe.
last_make = vocab[-1]
last_make_column = 'make_' + last_make.replace('-', '_')
cars_encoded = cars_encoded.withColumn(
    last_make_column,
    F.when(F.col('make') == last_make, 1.0).otherwise(0.0),
)


cars_encoded.show()

+-----------------+----------+---------+----------+------------+----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+---------+------------+-----------------+------------------+--------------------+------------------+--------------------+-----------------------+-------------------+------------------------+-------------------+------------------+--------------------+-----------+----------+-----------+----------+---------------+-----------+-----------+---------------+----------+----------+--------+------------------+---------+-------------+---------+------------+----------+----------------+-----------+--------------+------------+------------+
|normalized_losses|      make|fuel_type|aspiration|num_of_doors|body_style|drive_wheels|engine_location|wheel_base|length|width|height|curb_weight|engine_type|num_of_cylinders|engine_size|fuel

In [None]:
# Answer

# Check the distinct values of 'make_mercury' once more.
mercury_indicator = cars_encoded.select('make_mercury')
mercury_indicator.distinct().show()

+------------+
|make_mercury|
+------------+
|         0.0|
|         1.0|
+------------+

