In [None]:
# Global data variables
SANDBOX_NAME = '' # Sandbox Name
DATA_PATH = "/data/sandboxes/" + SANDBOX_NAME + "/data/" 

In [1]:
!apt-get install openjdk-8-jdk -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
import findspark
from google.colab import drive, auth
findspark.init("spark-3.0.2-bin-hadoop2.7") # SPARK_HOME
from pyspark.sql import SparkSession
spark =  SparkSession.builder.master("local[*]").getOrCreate()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
DATA_PATH='/content/'

In [7]:
from pyspark.sql import functions as F



# Creación de variables dummy



## Leemos el dataframe de pokemon y renombramos las columnas para seguir las buenas prácticas

In [11]:
pokemon_df = spark.read.csv(DATA_PATH + 'pokemon.csv', sep=',', header=True, inferSchema=True)

In [12]:
pokemon_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Type 1: string (nullable = true)
 |-- Type 2: string (nullable = true)
 |-- HP: integer (nullable = true)
 |-- Attack: integer (nullable = true)
 |-- Defense: integer (nullable = true)
 |-- Sp. Atk: integer (nullable = true)
 |-- Sp. Def: integer (nullable = true)
 |-- Speed: integer (nullable = true)
 |-- Generation: integer (nullable = true)
 |-- Legendary: boolean (nullable = true)



In [13]:
pokemon_df = pokemon_df.withColumnRenamed('Sp. Atk', 'sp_atk').withColumnRenamed('Sp. Def', 'sp_def')

In [14]:
pokemon_df = pokemon_df.select([F.col(c).alias(c.lower().replace('\. ', '_').replace(' ', '_')) for c in pokemon_df.columns])

In [15]:
pokemon_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- type_1: string (nullable = true)
 |-- type_2: string (nullable = true)
 |-- hp: integer (nullable = true)
 |-- attack: integer (nullable = true)
 |-- defense: integer (nullable = true)
 |-- sp_atk: integer (nullable = true)
 |-- sp_def: integer (nullable = true)
 |-- speed: integer (nullable = true)
 |-- generation: integer (nullable = true)
 |-- legendary: boolean (nullable = true)





## Eliminamos también la columna '#' ya que no aporta información

In [16]:
pokemon_df = pokemon_df.drop('#')

In [17]:
pokemon_df.show(5)

+-------------+------+------+---+------+-------+------+------+-----+----------+---------+
|         name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+-------------+------+------+---+------+-------+------+------+-----+----------+---------+
|    Bulbasaur| Grass|Poison| 45|    49|     49|    65|    65|   45|         1|    false|
|      Ivysaur| Grass|Poison| 60|    62|     63|    80|    80|   60|         1|    false|
|     Venusaur| Grass|Poison| 80|    82|     83|   100|   100|   80|         1|    false|
|Mega Venusaur| Grass|Poison| 80|   100|    123|   122|   120|   80|         1|    false|
|   Charmander|  Fire|  null| 39|    52|     43|    60|    50|   65|         1|    false|
+-------------+------+------+---+------+-------+------+------+-----+----------+---------+
only showing top 5 rows





## Creamos una variable con las columnas de tipo string y otra con las que vamos a convertir a dummy

In [18]:
pokemon_df.dtypes

[('name', 'string'),
 ('type_1', 'string'),
 ('type_2', 'string'),
 ('hp', 'int'),
 ('attack', 'int'),
 ('defense', 'int'),
 ('sp_atk', 'int'),
 ('sp_def', 'int'),
 ('speed', 'int'),
 ('generation', 'int'),
 ('legendary', 'boolean')]

In [19]:
string_cols = [c for c,t in pokemon_df.dtypes if t in ['string']]
dummy_cols = [c for c in string_cols if c != 'name']

In [20]:
dummy_cols

['type_1', 'type_2']



## Las siguientes librerías son necesarias para la conversión de variables a dummy

In [21]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, ArrayType, DateType

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import re



## Antes de convertir las variables string a dummy, es necesario eliminar los nulos (requisito de las funciones de Spark ML que vamos a utilizar)

In [22]:
pokemon_df_no_nulls = pokemon_df

for element in pokemon_df_no_nulls.columns:
    if pokemon_df_no_nulls.where(pokemon_df_no_nulls[element].isNull()).count() != 0:
        print('\tThe column "{}" has null values'.format(element))
        pokemon_df_no_nulls = pokemon_df_no_nulls.where(pokemon_df_no_nulls[element].isNotNull())
    if pokemon_df_no_nulls.where(pokemon_df_no_nulls[element].isNull()).count() == 0:
        print('The column "{}" does not have null values'.format(element))

	The column "name" has null values
The column "name" does not have null values
The column "type_1" does not have null values
	The column "type_2" has null values
The column "type_2" does not have null values
The column "hp" does not have null values
The column "attack" does not have null values
The column "defense" does not have null values
The column "sp_atk" does not have null values
The column "sp_def" does not have null values
The column "speed" does not have null values
The column "generation" does not have null values
The column "legendary" does not have null values


In [23]:
pokemon_df.where(pokemon_df['type_2'].isNull()).count()

386



## Empezamos usando StringIndexer + OneHotEncoder por separado para ver qué hace cada uno

 

### StringIndexer

StringIndexer codifica una columna de tipo string (categórica) en una columna de índices. Estos índices se encuentran en el intervalo [0, número categorías) y se
ordenan por frecuencia de aparición. La categoría de mayor frecuencia toma el valor 0.

Por ejemplo, si tenemos una columna `color` con dos categorías ('rojo' y 'azul'), donde 'rojo' aparece en más registros que 'azul', al aplicar StringIndexer sobre ella obtendremos una nueva columna `color_indexed` donde los registros con 'rojo' tomarán el valor 0.0 y los de 'azul' tomarán el valor 1.0.

In [24]:
from pyspark.ml.feature import StringIndexer

pokemon_df_no_nulls_indexed = pokemon_df_no_nulls

for element in dummy_cols:
    feature_indexer = StringIndexer(inputCol=element, outputCol=element+'_indexed')
    feature_indexer_model = feature_indexer.fit(pokemon_df_no_nulls)
    pokemon_df_no_nulls_indexed = feature_indexer_model.transform(pokemon_df_no_nulls_indexed)

pokemon_df_no_nulls_indexed.show(2)

+---------+------+------+---+------+-------+------+------+-----+----------+---------+--------------+--------------+
|     name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|type_1_indexed|type_2_indexed|
+---------+------+------+---+------+-------+------+------+-----+----------+---------+--------------+--------------+
|Bulbasaur| Grass|Poison| 45|    49|     49|    65|    65|   45|         1|    false|           2.0|           2.0|
|  Ivysaur| Grass|Poison| 60|    62|     63|    80|    80|   60|         1|    false|           2.0|           2.0|
+---------+------+------+---+------+-------+------+------+-----+----------+---------+--------------+--------------+
only showing top 2 rows





### OneHotEncoder

OneHotEncoder mapea una columna de índices a una columna de vectores binarios, donde en cada vector hay como máximo hay un valor 1. Esta codificación permite incorporar el estudio de variables categóricas en un modelo. Sirve para la creación de variables dummy.

Por ejemplo, sobre nuestra columna `color_indexed`, donde 'rojo' tomaba valor 1.0 y 'azul' tomaba valor 0.0, al aplicar OneHotEncoder obtendremos una columna `color_dummy` cada valor será una tupla con los siguientes campos:

- Campo 1: número total de categorías de la columna
- Campo 2: array que indica los índices del vector donde se han encontrado entradas con un valor diferente de cero
- Campo 3: array que indica qué números se encuentran en esos índices

'rojo'  -> 1 --> (1,[0],[1.0])

'azul'  -> 0 --> (1,[],[]) (en este formato los ceros de un vector no se imprimen)

In [26]:
from pyspark.ml.feature import OneHotEncoder

pokemon_df_no_nulls_ohe = pokemon_df_no_nulls_indexed

for element in dummy_cols:
    feature_ohe = OneHotEncoder(inputCol=element+'_indexed', outputCol=element+'_dummy')
    feature_ohe = feature_ohe.fit(pokemon_df_no_nulls_indexed)
    pokemon_df_no_nulls_ohe = feature_ohe.transform(pokemon_df_no_nulls_ohe)

pokemon_df_no_nulls_ohe.show()

+----------------+------+------+---+------+-------+------+------+-----+----------+---------+--------------+--------------+---------------+--------------+
|            name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|type_1_indexed|type_2_indexed|   type_1_dummy|  type_2_dummy|
+----------------+------+------+---+------+-------+------+------+-----+----------+---------+--------------+--------------+---------------+--------------+
|       Bulbasaur| Grass|Poison| 45|    49|     49|    65|    65|   45|         1|    false|           2.0|           2.0| (17,[2],[1.0])|(17,[2],[1.0])|
|         Ivysaur| Grass|Poison| 60|    62|     63|    80|    80|   60|         1|    false|           2.0|           2.0| (17,[2],[1.0])|(17,[2],[1.0])|
|        Venusaur| Grass|Poison| 80|    82|     83|   100|   100|   80|         1|    false|           2.0|           2.0| (17,[2],[1.0])|(17,[2],[1.0])|
|   Mega Venusaur| Grass|Poison| 80|   100|    123|   122|   120|   80|     



## Ahora vamos a aplicar StringIndexer y OneHotEncoder en dos pasos usando un Pipeline, y posteriormente separaremos el resultado de OneHotEncoder en diferentes columnas:

Aplicamos el Pipeline de StringIndexer + OneHotEncoder

In [27]:
dictionaries = []

for element in dummy_cols:
    string_indexer = StringIndexer(inputCol=element, outputCol=element+'_category')
    onehotencoder = OneHotEncoder(dropLast=False, inputCol= string_indexer.getOutputCol(), outputCol=element+'_dummy')
    pipeline = Pipeline(stages=[string_indexer, onehotencoder])
    pipeline_model = pipeline.fit(pokemon_df_no_nulls)
    dictionaries.append((element, list(enumerate(pipeline_model.stages[0].labels)), pipeline_model.stages[0]))
    pokemon_df_no_nulls = pipeline_model.transform(pokemon_df_no_nulls)
    pokemon_df_no_nulls = pokemon_df_no_nulls.drop(string_indexer.getOutputCol())

In [28]:
pokemon_df_no_nulls.show(4)

+-------------+------+------+---+------+-------+------+------+-----+----------+---------+--------------+--------------+
|         name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|  type_1_dummy|  type_2_dummy|
+-------------+------+------+---+------+-------+------+------+-----+----------+---------+--------------+--------------+
|    Bulbasaur| Grass|Poison| 45|    49|     49|    65|    65|   45|         1|    false|(18,[2],[1.0])|(18,[2],[1.0])|
|      Ivysaur| Grass|Poison| 60|    62|     63|    80|    80|   60|         1|    false|(18,[2],[1.0])|(18,[2],[1.0])|
|     Venusaur| Grass|Poison| 80|    82|     83|   100|   100|   80|         1|    false|(18,[2],[1.0])|(18,[2],[1.0])|
|Mega Venusaur| Grass|Poison| 80|   100|    123|   122|   120|   80|         1|    false|(18,[2],[1.0])|(18,[2],[1.0])|
+-------------+------+------+---+------+-------+------+------+-----+----------+---------+--------------+--------------+
only showing top 4 rows



In [29]:
dictionaries

[('type_1',
  [(0, 'Water'),
   (1, 'Bug'),
   (2, 'Grass'),
   (3, 'Normal'),
   (4, 'Rock'),
   (5, 'Fire'),
   (6, 'Ghost'),
   (7, 'Steel'),
   (8, 'Dark'),
   (9, 'Dragon'),
   (10, 'Ground'),
   (11, 'Psychic'),
   (12, 'Electric'),
   (13, 'Poison'),
   (14, 'Ice'),
   (15, 'Fighting'),
   (16, 'Fairy'),
   (17, 'Flying')],
  StringIndexerModel: uid=StringIndexer_ef83a0e5a95e, handleInvalid=error),
 ('type_2',
  [(0, 'Flying'),
   (1, 'Ground'),
   (2, 'Poison'),
   (3, 'Psychic'),
   (4, 'Fighting'),
   (5, 'Grass'),
   (6, 'Fairy'),
   (7, 'Steel'),
   (8, 'Dark'),
   (9, 'Dragon'),
   (10, 'Ghost'),
   (11, 'Ice'),
   (12, 'Rock'),
   (13, 'Water'),
   (14, 'Fire'),
   (15, 'Electric'),
   (16, 'Normal'),
   (17, 'Bug')],
  StringIndexerModel: uid=StringIndexer_8aebaa9f5da9, handleInvalid=error)]



Después, sobre el resultado de OneHotEncoder (un vector) vamos a tomar cada elemento y separarlo en diferentes columnas para crear las variables dummy

In [30]:
for element in dictionaries:
    pokemon_df_no_nulls = (pokemon_df_no_nulls.withColumn('activated_indices'+element[0], F.udf(lambda x: x.toArray().tolist(), ArrayType(DoubleType()))
                        (F.col(element[0]+'_dummy'))))
    
    vocab = [re.sub(r'\W', '_', value) for value in element[-1].labels]
    pokemon_df_no_nulls = pokemon_df_no_nulls.select(pokemon_df_no_nulls.columns + [F.col("activated_indices"+element[0])[i] for i in range(len(vocab))])
    
    dictionary = {"activated_indices"+element[0]+"[{0}]".format(x): element[0]+'_'+vocab[x] for x in range(len(vocab))}

    # Renombramos las columnas con el elemento que sea
    pokemon_df_no_nulls = pokemon_df_no_nulls.selectExpr(["{0} as {1}".format(x, x) if x not in dictionary else "{0} as {1}".format(x, dictionary[x]) 
                                  for x in pokemon_df_no_nulls.columns])
    pokemon_df_no_nulls = pokemon_df_no_nulls.drop('activated_indices'+element[0], element[0]+'_dummy')

In [31]:
pokemon_df_no_nulls.show(5)

+-------------+------+------+---+------+-------+------+------+-----+----------+---------+------------+----------+------------+-------------+-----------+-----------+------------+------------+-----------+-------------+-------------+--------------+---------------+-------------+----------+---------------+------------+-------------+-------------+-------------+-------------+--------------+---------------+------------+------------+------------+-----------+-------------+------------+----------+-----------+------------+-----------+---------------+-------------+----------+
|         name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|type_1_Water|type_1_Bug|type_1_Grass|type_1_Normal|type_1_Rock|type_1_Fire|type_1_Ghost|type_1_Steel|type_1_Dark|type_1_Dragon|type_1_Ground|type_1_Psychic|type_1_Electric|type_1_Poison|type_1_Ice|type_1_Fighting|type_1_Fairy|type_1_Flying|type_2_Flying|type_2_Ground|type_2_Poison|type_2_Psychic|type_2_Fighting|type_2_Grass|type_2_Fairy|ty

In [32]:
pokemon_df_no_nulls.toPandas()[['name'] + [col for col in pokemon_df_no_nulls.toPandas().columns if col.startswith('type_2')]]

Unnamed: 0,name,type_2,type_2_Flying,type_2_Ground,type_2_Poison,type_2_Psychic,type_2_Fighting,type_2_Grass,type_2_Fairy,type_2_Steel,type_2_Dark,type_2_Dragon,type_2_Ghost,type_2_Ice,type_2_Rock,type_2_Water,type_2_Fire,type_2_Electric,type_2_Normal,type_2_Bug
0,Bulbasaur,Poison,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ivysaur,Poison,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Venusaur,Poison,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mega Venusaur,Poison,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Charizard,Flying,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409,Diancie,Fairy,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
410,Mega Diancie,Fairy,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411,Hoopa Confined,Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
412,Hoopa Unbound,Dark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0




## Funciones para quitar nulos + comprobar nulos + StringIndexer  + OneHotEncoder + separación de salida de OneHotEncoder en diferentes columnas

In [33]:
def remove_nulls(df):
    df_no_nulls = df

    for element in df_no_nulls.columns:
        if df_no_nulls.where(df_no_nulls[element].isNull()).count() != 0:
            print('\tThe column "{}" has null values'.format(element))
            df_no_nulls = df_no_nulls.where(df_no_nulls[element].isNotNull())
        if df_no_nulls.where(df_no_nulls[element].isNull()).count() == 0:
            print('The column "{}" does not have null values'.format(element))
            
    return df_no_nulls
    
def check_nulls(df):
    
    existing_nulls = False
    
    for element in df.columns:
        if df.where(df[element].isNull()).count() != 0:
            print('\tThe column "{}" has null values'.format(element))
            existing_nulls = True
            break
        if df.where(df[element].isNull()).count() == 0:
            print('The column "{}" does not have null values'.format(element))
            
    return existing_nulls


def get_dummies_spark(df, dummy_cols):   
     
    # check nulls
    
    existing_nulls = check_nulls(df)
    
    if existing_nulls == False:
            
        # StringIndexer + OneHotEncoder

        dictionaries = []

        for element in dummy_cols:
            print("StringIndexer + OneHotEncoder for column ", element)

            string_indexer = StringIndexer(inputCol=element, outputCol=element+'_category')
            onehotencoder = OneHotEncoder(dropLast=False, inputCol= string_indexer.getOutputCol(), outputCol=element+'_dummy')
            pipeline = Pipeline(stages=[string_indexer, onehotencoder])
            pipeline_model = pipeline.fit(df)
            dictionaries.append((element, list(enumerate(pipeline_model.stages[0].labels)), pipeline_model.stages[0]))
            df = pipeline_model.transform(df)
            df = df.drop(string_indexer.getOutputCol())

        # divide OneHotEncoder output in different columns

        for element in dictionaries:
            print("Divide OneHotEncoder output in several columns for original column ", element[0])

            df = (df.withColumn('activated_indices'+element[0], F.udf(lambda x: x.toArray().tolist(), ArrayType(DoubleType()))
                                (F.col(element[0]+'_dummy'))))

            vocab = [re.sub(r'\W', '_', value) for value in element[-1].labels]
            df = df.select(df.columns + [F.col("activated_indices"+element[0])[i] for i in range(len(vocab))])

            dictionary = {"activated_indices"+element[0]+"[{0}]".format(x): element[0]+'_'+vocab[x] for x in range(len(vocab))}

            # rename columns
            df = df.selectExpr(["{0} as {1}".format(x, x) if x not in dictionary else "{0} as {1}".format(x, dictionary[x]) 
                                          for x in df.columns])
            df = df.drop('activated_indices'+element[0], element[0]+'_dummy')
    
    else:
        print("There are nulls in your dataframe, please remove them or fill them before creating dummy features")
        
        
    return df

In [34]:
pokemon_df_no_nulls = remove_nulls(pokemon_df)

	The column "name" has null values
The column "name" does not have null values
The column "type_1" does not have null values
	The column "type_2" has null values
The column "type_2" does not have null values
The column "hp" does not have null values
The column "attack" does not have null values
The column "defense" does not have null values
The column "sp_atk" does not have null values
The column "sp_def" does not have null values
The column "speed" does not have null values
The column "generation" does not have null values
The column "legendary" does not have null values


In [35]:
pokemon_df_no_nulls_dummies = get_dummies_spark(pokemon_df_no_nulls, dummy_cols)

The column "name" does not have null values
The column "type_1" does not have null values
The column "type_2" does not have null values
The column "hp" does not have null values
The column "attack" does not have null values
The column "defense" does not have null values
The column "sp_atk" does not have null values
The column "sp_def" does not have null values
The column "speed" does not have null values
The column "generation" does not have null values
The column "legendary" does not have null values
StringIndexer + OneHotEncoder for column  type_1
StringIndexer + OneHotEncoder for column  type_2
Divide OneHotEncoder output in several columns for original column  type_1
Divide OneHotEncoder output in several columns for original column  type_2


In [36]:
pokemon_df_no_nulls_dummies.show(4)

+-------------+------+------+---+------+-------+------+------+-----+----------+---------+------------+----------+------------+-------------+-----------+-----------+------------+------------+-----------+-------------+-------------+--------------+---------------+-------------+----------+---------------+------------+-------------+-------------+-------------+-------------+--------------+---------------+------------+------------+------------+-----------+-------------+------------+----------+-----------+------------+-----------+---------------+-------------+----------+
|         name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|type_1_Water|type_1_Bug|type_1_Grass|type_1_Normal|type_1_Rock|type_1_Fire|type_1_Ghost|type_1_Steel|type_1_Dark|type_1_Dragon|type_1_Ground|type_1_Psychic|type_1_Electric|type_1_Poison|type_1_Ice|type_1_Fighting|type_1_Fairy|type_1_Flying|type_2_Flying|type_2_Ground|type_2_Poison|type_2_Psychic|type_2_Fighting|type_2_Grass|type_2_Fairy|ty

EJERCICIO


*   Carguen el dataset crime_in_vancouver.
*   Extrae las variables categóricas (no todas las variables string son categóricas como el nomnbre en el dataser pokemon)
*   Limpien columnas categóricas de valores nulos.
*   Apliquen StringIndexer en columnas categóricas.
*   Aplique OneHotEncoder.
*   Graficar el top 10 de vecindarios más peligrosos (gráfico de barras).
*   Grafique los vecindatios más peligrosos





In [49]:
DATA_PATH='/content/'

In [50]:
vancouver_df = spark.read.csv(DATA_PATH + 'crime_in_vancouver.csv', sep=',', header=True, inferSchema=True)

In [41]:
vancouver_df.show(3)

+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|      NEIGHBOURHOOD|        X|         Y|
+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|               null|      0.0|       0.0|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|           West End|490503.48|5459766.67|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST|Renfrew-Collingwood|496145.89|5453740.68|
+--------------------+----+-----+---+----+------+--------------------+-------------------+---------+----------+
only showing top 3 rows



In [51]:
vancouver_df.count()

552055

In [42]:
vancouver_df.dtypes

[('TYPE', 'string'),
 ('YEAR', 'int'),
 ('MONTH', 'int'),
 ('DAY', 'int'),
 ('HOUR', 'int'),
 ('MINUTE', 'int'),
 ('HUNDRED_BLOCK', 'string'),
 ('NEIGHBOURHOOD', 'string'),
 ('X', 'double'),
 ('Y', 'double')]

In [65]:
vancouver_df.groupBy('TYPE').agg(F.count('TYPE')).orderBy().show(100)

+--------------------+-----------+
|                TYPE|count(TYPE)|
+--------------------+-----------+
|Vehicle Collision...|      22702|
|         Other Theft|      55184|
|            Homicide|        228|
|Vehicle Collision...|        262|
|            Mischief|      73569|
|Offence Against a...|      55885|
|    Theft of Bicycle|      26981|
|Break and Enter C...|      35042|
|  Theft from Vehicle|     180705|
|Break and Enter R...|      62254|
|    Theft of Vehicle|      39243|
+--------------------+-----------+



In [62]:
check_nulls(vancouver_df.select("TYPE", "NEIGHBOURHOOD"))

The column "TYPE" does not have null values
	The column "NEIGHBOURHOOD" has null values


True

In [63]:
vancouver_df.where(vancouver_df["NEIGHBOURHOOD"].isNull()).count()

58423

In [None]:
vancouver_df.fillna('Unknown', subset=['NEIGHBOURHOOD']).show(3)

In [56]:
string_cols = [c for c,t in vancouver_df.dtypes if t in ['string']]
dummy_cols = [c for c in string_cols if c != 'name']

In [59]:
dummy_cols = [:0,2]

SyntaxError: ignored