In [None]:
# Name of the sandbox whose data directory is read (left empty here).
SANDBOX_NAME = ''
# Data directory of that sandbox: /data/sandboxes/<SANDBOX_NAME>/data/
DATA_PATH = "".join(["/data/sandboxes/", SANDBOX_NAME, "/data/"])



# VectorAssembler: ejemplo completo

En este notebook resumimos cómo incorporar distintos tipos de variables (numéricas y categóricas) a nuestro modelo mediante `StringIndexer`, `OneHotEncoder` y `VectorAssembler`.



## Carga del dataset

In [4]:
# Read the ';'-separated automobile CSV (first row is the header) and let
# Spark infer each column's type.
cars = spark.read.csv(
    DATA_PATH + 'automobile.csv',
    sep=';',
    header=True,
    inferSchema=True
)

# Show the inferred schema.
cars.printSchema()

root
 |-- normalized_losses: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- aspiration: string (nullable = true)
 |-- num_of_doors: string (nullable = true)
 |-- body_style: string (nullable = true)
 |-- drive_wheels: string (nullable = true)
 |-- engine_location: string (nullable = true)
 |-- wheel_base: double (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- height: double (nullable = true)
 |-- curb_weight: integer (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- num_of_cylinders: string (nullable = true)
 |-- engine_size: integer (nullable = true)
 |-- fuel_system: string (nullable = true)
 |-- bore: double (nullable = true)
 |-- stroke: double (nullable = true)
 |-- compression_ratio: double (nullable = true)
 |-- horsepower: integer (nullable = true)
 |-- peak_rpm: integer (nullable = true)
 |-- city_mpg: integer (nullable = true)
 |-- highway_mpg: integer (nullable = true)



## Extraer columnas numéricas

In [5]:
# Columns that are already numeric and can go straight into the assembler.
# DataFrame.dtypes spells integer widths 'tinyint' / 'smallint' / 'int' /
# 'bigint', so all of them are accepted; checking only 'int' would silently
# skip columns inferred as long. 'symboling' is kept out of the feature list.
numerical_cols = [
    name for name, dtype in cars.dtypes
    if dtype in ('double', 'float', 'int', 'bigint', 'smallint', 'tinyint')
    and name != 'symboling'
]



## Eliminar nulos

In [6]:
# Remove every row that has a null in at least one column (how='any').
cars = cars.dropna(how='any')



## StringIndexer para columnas de tipo string o boolean

In [7]:
from pyspark.ml.feature import StringIndexer

# Columns to run through StringIndexer: those whose dtype is 'string' or
# 'bool', leaving out 'num_of_doors' and 'num_of_cylinders'.
# NOTE(review): Spark's DataFrame.dtypes normally reports booleans as
# 'boolean', so the 'bool' entry probably never matches - confirm.
string_columns = [
    name
    for name, dtype in cars.dtypes
    if dtype in ['string', 'bool']
    and name not in ['num_of_doors', 'num_of_cylinders']
]

# Fitted StringIndexer models, keyed by source column name; filled in by
# column_indexer.
string_indexer = {}

def column_indexer(df, string_columns):
    """Fit and apply one StringIndexer per column in ``string_columns``.

    Each column ``c`` gains a companion ``c_indexed`` column. Every fitted
    model is stored under its source column name in the module-level
    ``string_indexer`` dict, and that same dict is returned.

    Args:
        df: DataFrame holding the columns to index.
        string_columns: names of the columns to index.

    Returns:
        Tuple ``(indexed_df, string_indexer)``.
    """
    indexed_df = df
    for column_name in string_columns:
        # Fit on the frame as transformed so far, then keep the model before
        # applying it.
        model = StringIndexer(
            inputCol=column_name,
            outputCol=column_name+'_indexed'
        ).fit(indexed_df)
        string_indexer[column_name] = model
        indexed_df = model.transform(indexed_df)
    return indexed_df, string_indexer

# Index the selected columns and keep the fitted models by column name.
cars, string_indexer_dict = column_indexer(
    df=cars,
    string_columns=string_columns
)

In [8]:
# Columns produced by the StringIndexer step. The suffix is matched (not a
# substring) so that re-running this cell after the one-hot step does not
# also pick up the '*_indexed_encoded' columns.
indexed_columns = [name for name in cars.columns if name.endswith('_indexed')]
indexed_columns

['make_indexed',
 'fuel_type_indexed',
 'aspiration_indexed',
 'body_style_indexed',
 'drive_wheels_indexed',
 'engine_location_indexed',
 'engine_type_indexed',
 'fuel_system_indexed']



## OneHotEncoder para columnas de tipo string o boolean

In [9]:
from pyspark.ml.feature import OneHotEncoder

def column_encoder(df, indexed_columns):
    """One-hot encode each indexed column into a ``<column>_encoded`` column.

    Works with both OneHotEncoder flavours: when the encoder object exposes
    ``fit`` (an Estimator, as in Spark 3.x) it is fitted on the current frame
    and the resulting model is applied; otherwise (a plain Transformer, as in
    Spark 2.x) it is applied directly.

    Args:
        df: DataFrame holding the indexed columns.
        indexed_columns: names of the index columns to encode.

    Returns:
        The DataFrame with one extra ``<column>_encoded`` column per input
        column; ``df`` unchanged when ``indexed_columns`` is empty.
    """
    for column_name in indexed_columns:
        encoder = OneHotEncoder(inputCol=column_name,
                                outputCol=column_name+'_encoded')
        # An Estimator has no transform() of its own: fit it first and use
        # the fitted model instead.
        if hasattr(encoder, 'fit'):
            encoder = encoder.fit(df)
        df = encoder.transform(df)
    return df

# One-hot encode every indexed column.
cars = column_encoder(
    df=cars,
    indexed_columns=indexed_columns
)

In [10]:
# Columns produced by the one-hot step. The suffix is matched (not a
# substring) so only names that actually end in '_encoded' are selected.
encoded_columns = [name for name in cars.columns if name.endswith('_encoded')]
encoded_columns

['make_indexed_encoded',
 'fuel_type_indexed_encoded',
 'aspiration_indexed_encoded',
 'body_style_indexed_encoded',
 'drive_wheels_indexed_encoded',
 'engine_location_indexed_encoded',
 'engine_type_indexed_encoded',
 'fuel_system_indexed_encoded']



## VectorAssembler con todas las columnas transformadas

In [11]:
# List of count-vectorized feature columns; empty here, so it contributes
# nothing when concatenated into the assembler inputs.
count_vectorizer = []

In [12]:
# Assembler inputs, in order: one-hot columns, count-vectorized columns,
# then the raw numeric columns.
to_vector_assembler = []
for column_group in (encoded_columns, count_vectorizer, numerical_cols):
    to_vector_assembler = to_vector_assembler + column_group

In [13]:
from pyspark.ml.feature import VectorAssembler
# Combine every selected input column into one 'assembled_features' column.
vectorassembler = VectorAssembler(
    inputCols=to_vector_assembler,
    outputCol='assembled_features'
)
cars = vectorassembler.transform(cars)



## Eliminar columnas previas ya añadidas con VectorAssembler

In [14]:
import pyspark.sql.functions as F
# Drop the intermediate '*_indexed' columns in one call: DataFrame.drop
# accepts several column names, so there is no need to loop and rebuild the
# DataFrame once per column.
cars = cars.drop(*indexed_columns)
cars.columns

['normalized_losses',
 'make',
 'fuel_type',
 'aspiration',
 'num_of_doors',
 'body_style',
 'drive_wheels',
 'engine_location',
 'wheel_base',
 'length',
 'width',
 'height',
 'curb_weight',
 'engine_type',
 'num_of_cylinders',
 'engine_size',
 'fuel_system',
 'bore',
 'stroke',
 'compression_ratio',
 'horsepower',
 'peak_rpm',
 'city_mpg',
 'highway_mpg',
 'price',
 'symboling',
 'make_indexed_encoded',
 'fuel_type_indexed_encoded',
 'aspiration_indexed_encoded',
 'body_style_indexed_encoded',
 'drive_wheels_indexed_encoded',
 'engine_location_indexed_encoded',
 'engine_type_indexed_encoded',
 'fuel_system_indexed_encoded',
 'assembled_features']

In [15]:
import pyspark.sql.functions as F
# Drop the intermediate '*_encoded' columns in one call: DataFrame.drop
# accepts several column names, so there is no need to loop and rebuild the
# DataFrame once per column.
cars = cars.drop(*encoded_columns)
cars.columns

['normalized_losses',
 'make',
 'fuel_type',
 'aspiration',
 'num_of_doors',
 'body_style',
 'drive_wheels',
 'engine_location',
 'wheel_base',
 'length',
 'width',
 'height',
 'curb_weight',
 'engine_type',
 'num_of_cylinders',
 'engine_size',
 'fuel_system',
 'bore',
 'stroke',
 'compression_ratio',
 'horsepower',
 'peak_rpm',
 'city_mpg',
 'highway_mpg',
 'price',
 'symboling',
 'assembled_features']