In [1]:
# Name of the sandbox whose data directory this notebook reads from.
SANDBOX_NAME = ''
# Base data directory of that sandbox; later cells append file paths to it.
DATA_PATH = "/data/sandboxes/{}/data/".format(SANDBOX_NAME)



# Spark ML Estandarización

Cargamos un dataset con información sobre cirugías por cáncer torácico. Este dataset tiene como variable objetivo el fallecimiento o supervivencia de los pacientes tras un año de la operación en cuestión.



### Crear SparkSession

In [2]:
# Answer

from pyspark.sql import SparkSession

# Reuse the session that is already active, or start a new one with the
# builder's default configuration.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()



### Cargar datos y comprobar schema

In [3]:
# Answer

# Location of the thoracic-surgery CSV relative to the sandbox data directory.
surgery_csv_path = DATA_PATH + 'data/thoracic_cancer.csv'

# Comma-separated file with a header row; column types are inferred by Spark
# instead of being read as plain strings.
surgery = spark.read.csv(
    surgery_csv_path,
    sep=',',
    header=True,
    inferSchema=True,
)

surgery.printSchema()

root
 |-- DGN: string (nullable = true)
 |-- PRE4: double (nullable = true)
 |-- PRE5: double (nullable = true)
 |-- PRE6: string (nullable = true)
 |-- PRE7: string (nullable = true)
 |-- PRE8: string (nullable = true)
 |-- PRE9: string (nullable = true)
 |-- PRE10: string (nullable = true)
 |-- PRE11: string (nullable = true)
 |-- PRE14: string (nullable = true)
 |-- PRE17: string (nullable = true)
 |-- PRE19: string (nullable = true)
 |-- PRE25: string (nullable = true)
 |-- PRE30: string (nullable = true)
 |-- PRE32: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- Risk1Yr: integer (nullable = true)



In [4]:
# Answer

# Preview the first rows; these are the defaults of DataFrame.show written out.
surgery.show(n=20, truncate=True)

+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+---+-------+
| DGN|PRE4|PRE5|PRE6|PRE7|PRE8|PRE9|PRE10|PRE11|PRE14|PRE17|PRE19|PRE25|PRE30|PRE32|AGE|Risk1Yr|
+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+---+-------+
|DGN2|2.88|2.16|PRZ1|   F|   F|   F|    T|    T| OC14|    F|    F|    F|    T|    F| 60|      0|
|DGN3| 3.4|1.88|PRZ0|   F|   F|   F|    F|    F| OC12|    F|    F|    F|    T|    F| 51|      0|
|DGN3|2.76|2.08|PRZ1|   F|   F|   F|    T|    F| OC11|    F|    F|    F|    T|    F| 59|      0|
|DGN3|3.68|3.04|PRZ0|   F|   F|   F|    F|    F| OC11|    F|    F|    F|    F|    F| 54|      0|
|DGN3|2.44|0.96|PRZ2|   F|   T|   F|    T|    T| OC11|    F|    F|    F|    T|    F| 73|      1|
|DGN3|2.48|1.88|PRZ1|   F|   F|   F|    T|    F| OC11|    F|    F|    F|    F|    F| 51|      0|
|DGN3|4.36|3.28|PRZ1|   F|   F|   F|    T|    F| OC12|    T|    F|    F|    T|    F| 59|      1|
|DGN2|3.19| 2.5|PRZ1|   F|   F



Al ver el tipo de variables de este dataset, se aprecia que la mayoría son categóricas. Sin embargo, sí hay algunas de tipo numérico y se deberían estandarizar.



### StandardScaler

Primero se crea el VectorAssembler con las variables numéricas

In [5]:
# Answer

from pyspark.ml.feature import VectorAssembler

# Feature columns: every non-string column except the target ('Risk1Yr').
# The output column itself is excluded as well, so that re-running this cell
# (when `surgery` already carries 'assembled') does not feed the previous
# vector back in as an input.
feature_cols = [
    name
    for name, dtype in surgery.dtypes
    if dtype != 'string' and name not in ('Risk1Yr', 'assembled')
]

vectorassembler = VectorAssembler(inputCols=feature_cols, outputCol='assembled')

# Drop any stale 'assembled' column before transforming: VectorAssembler
# refuses to run when its output column already exists, which made this cell
# fail on a second execution. Dropping a column that is absent is a no-op, so
# the first run behaves exactly as before.
surgery = vectorassembler.transform(surgery.drop('assembled'))


In [6]:
# Answer

from pyspark.ml.feature import StandardScaler

# Scaler over the assembled feature vector. `withMean` is not passed here, so
# the values are scaled without being centred (contrast with the
# `withMean=True` variant used later in the notebook).
standardscaler = StandardScaler(
    inputCol=vectorassembler.getOutputCol(),
    outputCol='standarized_zscore',
)

# Fit on the data, then append the scaled vector as a new column.
standardscaler_model = standardscaler.fit(surgery)
surgery_zscore = standardscaler_model.transform(surgery)

surgery_zscore.show()

+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+---+-------+----------------+--------------------+
| DGN|PRE4|PRE5|PRE6|PRE7|PRE8|PRE9|PRE10|PRE11|PRE14|PRE17|PRE19|PRE25|PRE30|PRE32|AGE|Risk1Yr|       assembled|  standarized_zscore|
+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+---+-------+----------------+--------------------+
|DGN2|2.88|2.16|PRZ1|   F|   F|   F|    T|    T| OC14|    F|    F|    F|    T|    F| 60|      0|[2.88,2.16,60.0]|[3.30504525563893...|
|DGN3| 3.4|1.88|PRZ0|   F|   F|   F|    F|    F| OC12|    F|    F|    F|    T|    F| 51|      0| [3.4,1.88,51.0]|[3.90178953790707...|
|DGN3|2.76|2.08|PRZ1|   F|   F|   F|    T|    F| OC11|    F|    F|    F|    T|    F| 59|      0|[2.76,2.08,59.0]|[3.16733503665397...|
|DGN3|3.68|3.04|PRZ0|   F|   F|   F|    F|    F| OC11|    F|    F|    F|    F|    F| 54|      0|[3.68,3.04,54.0]|[4.22311338220530...|
|DGN3|2.44|0.96|PRZ2|   F|   T|   F|    T|    T| OC11| 



A continuación se aplica la estandarización con el parámetro _withMean=True_, cuya fórmula es la siguiente:

\begin{equation*}
z = \frac{x - \mu} {\sigma}
\end{equation*}

In [7]:
# Answer

# Same scaler as before, but with `withMean=True` so each feature is centred
# on its mean before being divided by its standard deviation.
standardscaler = StandardScaler(
    inputCol=vectorassembler.getOutputCol(),
    outputCol='standarized_zscore_mean',
    withMean=True,
)

# Fit on the data, then append the centred and scaled vector as a new column.
standardscaler_model = standardscaler.fit(surgery)
surgery_zscore = standardscaler_model.transform(surgery)

surgery_zscore.show()

+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+---+-------+----------------+-----------------------+
| DGN|PRE4|PRE5|PRE6|PRE7|PRE8|PRE9|PRE10|PRE11|PRE14|PRE17|PRE19|PRE25|PRE30|PRE32|AGE|Risk1Yr|       assembled|standarized_zscore_mean|
+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+---+-------+----------------+-----------------------+
|DGN2|2.88|2.16|PRZ1|   F|   F|   F|    T|    T| OC14|    F|    F|    F|    T|    F| 60|      0|[2.88,2.16,60.0]|   [-0.4609141496062...|
|DGN3| 3.4|1.88|PRZ0|   F|   F|   F|    F|    F| OC12|    F|    F|    F|    T|    F| 51|      0| [3.4,1.88,51.0]|   [0.13583013266193...|
|DGN3|2.76|2.08|PRZ1|   F|   F|   F|    T|    F| OC11|    F|    F|    F|    T|    F| 59|      0|[2.76,2.08,59.0]|   [-0.5986243685911...|
|DGN3|3.68|3.04|PRZ0|   F|   F|   F|    F|    F| OC11|    F|    F|    F|    F|    F| 54|      0|[3.68,3.04,54.0]|   [0.45715397696016...|
|DGN3|2.44|0.96|PRZ2|   F|   T|   



Se aprecian en los resultados las diferencias entre _withMean=False_ y _withMean=True_. Examinemos la columna "PRE4".

In [8]:
# Answer

# Summary statistics (count, mean, stddev, min, max) for PRE4, used to check
# the scaled values by hand.
pre4_summary = surgery.describe('PRE4')
pre4_summary.show()

+-------+------------------+
|summary|              PRE4|
+-------+------------------+
|  count|               470|
|   mean|3.2816382978723415|
| stddev|0.8713950270684679|
|    min|              1.44|
|    max|               6.3|
+-------+------------------+





En la siguiente tabla se resumen las cuentas realizadas (puede haber diferencias en los decimales).

|PRE4|  standarized_zscore |standarized_zscore_mean|
|:---:|:---:|:-------:|
|2.88|\begin{equation*} \frac{2.88} {0.87}= 3.30 \end{equation*}|\begin{equation*} \frac{2.88 - 3.28} {0.87}= -0.46 \end{equation*}|
|3.4|\begin{equation*} \frac{3.4} {0.87}= 3.90 \end{equation*}|\begin{equation*} \frac{3.4 - 3.28} {0.87}= 0.13 \end{equation*}|
|2.76|\begin{equation*} \frac{2.76} {0.87}= 3.16 \end{equation*}|\begin{equation*} \frac{2.76 - 3.28} {0.87}= -0.59 \end{equation*}|




### MinMaxScaler

En este caso se aplica la siguiente fórmula:

\begin{equation*}
x_{min\_max} = \frac{x - x_{min}} {(x_{max} - x_{min})}
\end{equation*}

In [9]:
# Answer

from pyspark.ml.feature import MinMaxScaler

# The VectorAssembler from the StandardScaler section is reused, so the
# scaler reads the same assembled feature vector.
minmaxscaler = MinMaxScaler(
    inputCol=vectorassembler.getOutputCol(),
    outputCol='standarized_minmax',
)

# Fit on the data, then append the rescaled vector as a new column.
minmaxscaler_model = minmaxscaler.fit(surgery)
surgery_minmax = minmaxscaler_model.transform(surgery)

surgery_minmax.show()

+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+---+-------+----------------+--------------------+
| DGN|PRE4|PRE5|PRE6|PRE7|PRE8|PRE9|PRE10|PRE11|PRE14|PRE17|PRE19|PRE25|PRE30|PRE32|AGE|Risk1Yr|       assembled|  standarized_minmax|
+----+----+----+----+----+----+----+-----+-----+-----+-----+-----+-----+-----+-----+---+-------+----------------+--------------------+
|DGN2|2.88|2.16|PRZ1|   F|   F|   F|    T|    T| OC14|    F|    F|    F|    T|    F| 60|      0|[2.88,2.16,60.0]|[0.29629629629629...|
|DGN3| 3.4|1.88|PRZ0|   F|   F|   F|    F|    F| OC12|    F|    F|    F|    T|    F| 51|      0| [3.4,1.88,51.0]|[0.40329218106995...|
|DGN3|2.76|2.08|PRZ1|   F|   F|   F|    T|    F| OC11|    F|    F|    F|    T|    F| 59|      0|[2.76,2.08,59.0]|[0.27160493827160...|
|DGN3|3.68|3.04|PRZ0|   F|   F|   F|    F|    F| OC11|    F|    F|    F|    F|    F| 54|      0|[3.68,3.04,54.0]|[0.46090534979423...|
|DGN3|2.44|0.96|PRZ2|   F|   T|   F|    T|    T| OC11| 