In [1]:
# Global data variables
DATA_PATH = "/Users/luis/Documents/Work/Telefonica/Courses/DATA/" 



# Valores Ausentes

Los valores ausentes en _pyspark_ están identificados como _null_. El método `isNull` permite idenficar los registros nulos y `isNotNull` los no nulos.

In [2]:
from pyspark.sql import functions as F

In [3]:
vancouver_df = spark.read.csv(DATA_PATH + 'crime_in_vancouver.csv', sep=',', header=True, inferSchema=True)

In [6]:
vancouver_df.select(F.col('TYPE')).distinct().show(20, False)

+------------------------------------------------------+
|TYPE                                                  |
+------------------------------------------------------+
|Vehicle Collision or Pedestrian Struck (with Injury)  |
|Other Theft                                           |
|Homicide                                              |
|Vehicle Collision or Pedestrian Struck (with Fatality)|
|Mischief                                              |
|Offence Against a Person                              |
|Theft of Bicycle                                      |
|Break and Enter Commercial                            |
|Theft from Vehicle                                    |
|Break and Enter Residential/Other                     |
|Theft of Vehicle                                      |
+------------------------------------------------------+



In [9]:
vancouver_df.filter(F.col('NEIGHBOURHOOD').isNotNull() &
                    F.col('HOUR').isNull()).show(20, False)

+----+----+-----+---+----+------+-------------+-------------+---+---+
|TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|HUNDRED_BLOCK|NEIGHBOURHOOD|X  |Y  |
+----+----+-----+---+----+------+-------------+-------------+---+---+
+----+----+-----+---+----+------+-------------+-------------+---+---+



In [10]:
vancouver_df.filter(F.col('NEIGHBOURHOOD').isNotNull()).show(4)

+--------------------+----+-----+---+----+------+--------------------+--------------------+---------+----------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|       NEIGHBOURHOOD|        X|         Y|
+--------------------+----+-----+---+----+------+--------------------+--------------------+---------+----------+
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|            West End|490503.48|5459766.67|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST| Renfrew-Collingwood|496145.89|5453740.68|
|Break and Enter R...|2003|    3|  8|   4|    15|     19XX E 12TH AVE|Kensington-Cedar ...|495302.97|5456313.79|
|  Theft from Vehicle|2003|   10|  9|  16|     0|     16XX CHARLES ST|  Grandview-Woodland|494877.89| 5457816.4|
+--------------------+----+-----+---+----+------+--------------------+--------------------+---------+----------+
only showing top 4 rows



 

## Conteo de valores nulos

In [11]:
vancouver_df.filter(F.col('NEIGHBOURHOOD').isNull()).count()

58423

In [12]:
vancouver_df.filter(F.col('TYPE').isNull()).count()

0



### Porcentaje de ausentes por columna

El primer método es menos eficiente que el segundo ya que requiere ejecutar una acción por cada columna. Como norma general en Spark hay que intentar realizar el número mínimo de acciones.

In [13]:
n_rows_vancouver = vancouver_df.count()



__Método 1:__

In [14]:
%%time

for col in vancouver_df.columns:
    
    n_missing = vancouver_df.filter(F.col(col).isNull()).count()
    perc_missing = 100 * n_missing / n_rows_vancouver
    
    print(col, round(perc_missing, 2))

TYPE 0.0
YEAR 0.0
MONTH 0.0
DAY 0.0
HOUR 10.16
MINUTE 10.16
HUNDRED_BLOCK 0.0
NEIGHBOURHOOD 10.58
X 0.0
Y 0.0
CPU times: user 12.8 ms, sys: 4.44 ms, total: 17.2 ms
Wall time: 2.09 s




__Método 2:__

Para una única columna

In [15]:
vancouver_df.select(F.round(F.sum(F.col('NEIGHBOURHOOD').isNull().cast('int')) * 100 / n_rows_vancouver, 2)\
                      .alias('NEIGHBOURHOOD')).show()

+-------------+
|NEIGHBOURHOOD|
+-------------+
|        10.58|
+-------------+





Todas las columnas

In [17]:
%%time 

missing_ops = [F.round(F.sum(F.col(c).isNull().cast('int')) * 100 / n_rows_vancouver, 2).alias(c) 
               for c in vancouver_df.columns]

vancouver_df.select(missing_ops).show()

+----+----+-----+---+-----+------+-------------+-------------+---+---+
|TYPE|YEAR|MONTH|DAY| HOUR|MINUTE|HUNDRED_BLOCK|NEIGHBOURHOOD|  X|  Y|
+----+----+-----+---+-----+------+-------------+-------------+---+---+
| 0.0| 0.0|  0.0|0.0|10.16| 10.16|          0.0|        10.58|0.0|0.0|
+----+----+-----+---+-----+------+-------------+-------------+---+---+

CPU times: user 17.4 ms, sys: 5.28 ms, total: 22.7 ms
Wall time: 570 ms


 

## Eliminación registros nulos

El método `dropna` se utiliza para eliminar registros nulos. Con el parámetro `subset` se indican sobre qué columnas buscar nulos y el parámetro `how` selecciona con qué condición se elimina un registro. Por defecto, `how` está a 'any'.

In [18]:
vancouver_df.dropna(how='all').count()

552055

In [19]:
n_rows_vancouver

552055

In [21]:
vancouver_df.dropna().count()

493619

In [22]:
vancouver_no_missing_df = vancouver_df.dropna(subset=['HOUR', 'MINUTE'])

In [23]:
vancouver_no_missing_df.select(missing_ops).show()

+----+----+-----+---+----+------+-------------+-------------+---+---+
|TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|HUNDRED_BLOCK|NEIGHBOURHOOD|  X|  Y|
+----+----+-----+---+----+------+-------------+-------------+---+---+
| 0.0| 0.0|  0.0|0.0| 0.0|   0.0|          0.0|         0.42|0.0|0.0|
+----+----+-----+---+----+------+-------------+-------------+---+---+





## Imputación de valores nulos

`fillna` imputa los valores nulos de las columnas a un valor fijo elegido.

In [24]:
vancouver_df.show(3)

+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|      NEIGHBOURHOOD|        X|         Y|
+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|               null|      0.0|       0.0|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|           West End|490503.48|5459766.67|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST|Renfrew-Collingwood|496145.89|5453740.68|
+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
only showing top 3 rows





Imputa los valores nulos de las columnas `HOUR` y `MINUTE` por el valor 0, y los de la columna `NEIGHBOURHOOD` por 'Unknown'.

In [28]:
(vancouver_df
 .fillna(0, subset=['HOUR'])
 .fillna(1, subset=['MINUTE'])
).show(3)

+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|      NEIGHBOURHOOD|        X|         Y|
+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
|Offence Against a...|2003|    5| 17|   0|     1|OFFSET TO PROTECT...|               null|      0.0|       0.0|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|           West End|490503.48|5459766.67|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST|Renfrew-Collingwood|496145.89|5453740.68|
+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
only showing top 3 rows



In [26]:
vancouver_df.fillna('Unknown', subset=['NEIGHBOURHOOD']).show(3)

+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|      NEIGHBOURHOOD|        X|         Y|
+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|            Unknown|      0.0|       0.0|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|           West End|490503.48|5459766.67|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST|Renfrew-Collingwood|496145.89|5453740.68|
+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
only showing top 3 rows





## Ejercicio 1



Usando el siguiente dataframe

In [None]:
vancouver_df = spark.read.csv(DATA_PATH + 'crime_in_vancouver.csv', sep=',', header=True, inferSchema=True)



- a. Determine que columna(s) tiene(n) el mayor número de nulos
- b. Elimine los registros con mayor número de nulos
- c.1 Complete las variables categóricas con nulos con el valor mayoritario
- c.2 Complete las variables numericas con nulos con la mediana para esa columna. 


In [47]:
list(zip(vancouver_df.columns, 
        list(vancouver_df.select([F.sum(F.col(c).isNull().cast('int')).alias(c) for c in vancouver_df.columns]).first())))

[('TYPE', 0),
 ('YEAR', 0),
 ('MONTH', 0),
 ('DAY', 0),
 ('HOUR', 56113),
 ('MINUTE', 56113),
 ('HUNDRED_BLOCK', 13),
 ('NEIGHBOURHOOD', 58423),
 ('X', 0),
 ('Y', 0)]

In [45]:
max(zip(vancouver_df.columns, 
        list(vancouver_df.select([F.sum(F.col(c).isNull().cast('int')).alias(c) for c in vancouver_df.columns]).first())), 
    key=lambda x: x[1])


('NEIGHBOURHOOD', 58423)

In [51]:
max_nulls = (vancouver_df
 .withColumn('missing', 
             sum(vancouver_df[c].isNull().cast('int') for c in vancouver_df.columns))
 .select(F.max(F.col('missing')).alias('max_null_values'))
).first()[0]

In [52]:
# vancouver_df = vancouver_df.filter(F.col('missing') == 3)
not_max_nulls = vancouver_df.dropna(thresh=len(vancouver_df.columns) - max_nulls + 1)

In [53]:
(not_max_nulls
 .withColumn('missing', 
             sum(not_max_nulls[c].isNull().cast('int') for c in not_max_nulls.columns))
 .select(F.max(F.col('missing')).alias('max_null_values'))
).first()[0]

1

In [64]:
for c, t in vancouver_df.dtypes:
    print(c, t)
    if t == 'string':
        input_value = (vancouver_df
                       .groupBy(F.col(c))
                       .agg(F.count('*').alias('conteo'))
                       .orderBy(F.col('conteo').desc())).first()[c]
    else: 
        input_value = vancouver_df.approxQuantile(c, [.5], .001)[0]
    print('INPUT VALUE: ')
    print(input_value)
    vancouver_df = vancouver_df.fillna(input_value, subset=[c])

TYPE string
INPUT VALUE: 
Theft from Vehicle
YEAR int
INPUT VALUE: 
2009.0
MONTH int
INPUT VALUE: 
7.0
DAY int
INPUT VALUE: 
15.0
HOUR int
INPUT VALUE: 
15.0
MINUTE int
INPUT VALUE: 
10.0
HUNDRED_BLOCK string
INPUT VALUE: 
OFFSET TO PROTECT PRIVACY
NEIGHBOURHOOD string
INPUT VALUE: 
Central Business District
X double
INPUT VALUE: 
491516.63
Y double
INPUT VALUE: 
5456851.2


In [60]:
vancouver_df.approxQuantile('X', [.5], .001)[0]

491516.63



## Ejercicio 2

Fuente de los datos: https://www.kaggle.com/abhinav89/telecom-customer

1) Obtener un diccionario de las variables con el valor del porcentaje de nulos que contengan. Ordenarlo, de alguna forma aunque la salida no sea un diccionario, de mayor a menor porcentaje de nulos.

2) Realiza el tratamiento que consideres para los datos nulos, en función del significado de negocio que consideres para cada caso y la cantidad de datos nulos que contenga la columna. Imputar al menos cinco columnas a modo de ejemplo, justificando los valores sustituidos a nivel de negocio.

Hint: consideraremos que la columna no aporta valor si contiene más del 40% de sus valores nulos


In [None]:
df = spark.read.csv(DATA_PATH + 'telecom_customer_churn.csv', sep=',', header=True, inferSchema=True)

In [None]:
df.count()



1) Obtener un diccionario de las variables con el valor del porcentaje de nulos que contengan. Ordenarlo, de alguna forma aunque la salida no sea un diccionario, de mayor a menor porcentaje de nulos.

In [None]:
# Respuesta aqui

In [None]:
# Respuesta aqui

In [None]:
# Respuesta aqui

In [None]:
# Respuesta aqui



2) Realiza el tratamiento que consideres para los datos nulos, en función del significado de negocio que consideres para cada caso y la cantidad de datos nulos que contenga la columna. Imputar al menos cinco columnas a modo de ejemplo, justificando los valores sustituidos a nivel de negocio.

Hint: consideraremos que la columna no aporta valor si contiene más del 40% de sus valores nulos

In [None]:
# Respuesta aqui

In [None]:
# Respuesta aqui