In [61]:
!pip install pyspark


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Importando dados e criando sessão

In [62]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('semana2')
    .getOrCreate()
)

# Load the raw dataset from the parquet file in the working directory.
df = spark.read.parquet(
    'part-00000-00341ba7-0a7c-4fef-a81e-1066725a64b1-c000.snappy.parquet'
)

# Seleção de features

In [63]:
# Columns removed from the raw frame before any further processing.
discarded_columns = ['area_total', 'tipo_anuncio', 'tipo_uso', 'tipo', 'tipo_unidade']

df_parquet = df.drop(*discarded_columns)
df_parquet.show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+----+------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|iptu| valor|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+----+------+
|03a386b6-7ab8-4ef...|    0|       43|        1|[Churrasqueira, A...|      2|  null|   1|            Realengo|  Zona Oeste|       285|null| 22999|
|1fe78d41-b8e0-4d2...|    0|       44|        1|                  []|      2|     0|   0|               Irajá|  Zona Norte|       170|   0|110000|
|1fa1c1e5-e98c-433...|    4|       55|        1|                  []|      2|     0|   1|              Cosmos|  Zona Oeste|      null|null|115000|
|a6ab01ae-3d40-40e...|    2|       55|        1|                  []|      2|     0|   0|        Tomás Coelho|  Zona N

# Convertendo tipo de dados

In [64]:
# Target Spark type for each column that is cast. The insertion order matches
# the order in which the casts are applied.
target_types = {
    'andar': IntegerType(),
    'banheiros': IntegerType(),
    'suites': IntegerType(),
    'quartos': IntegerType(),
    'vaga': IntegerType(),
    'area_util': DoubleType(),
    'condominio': DoubleType(),
    'iptu': DoubleType(),
    'valor': DoubleType(),
}

df_parquet_ = df_parquet
for column_name, spark_type in target_types.items():
    # withColumn with an existing name replaces that column in place, so the
    # column order of the frame is unchanged.
    df_parquet_ = df_parquet_.withColumn(column_name, f.col(column_name).cast(spark_type))

# Confirm the resulting schema.
df_parquet_.printSchema()

root
 |-- id: string (nullable = true)
 |-- andar: integer (nullable = true)
 |-- area_util: double (nullable = true)
 |-- banheiros: integer (nullable = true)
 |-- caracteristicas: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- quartos: integer (nullable = true)
 |-- suites: integer (nullable = true)
 |-- vaga: integer (nullable = true)
 |-- bairro: string (nullable = true)
 |-- zona: string (nullable = true)
 |-- condominio: double (nullable = true)
 |-- iptu: double (nullable = true)
 |-- valor: double (nullable = true)



# Transformando lista vazia em nulo na coluna caracteristicas



In [65]:
# Replace empty `caracteristicas` arrays with null; every other value is kept as is.
caracteristicas_col = f.col('caracteristicas')
is_empty_list = f.size(caracteristicas_col) == 0

df_caracteristicas = df_parquet_.withColumn(
    'caracteristicas',
    f.when(is_empty_list, f.lit(None)).otherwise(caracteristicas_col),
)
df_caracteristicas.select('caracteristicas').show()

+--------------------+
|     caracteristicas|
+--------------------+
|[Churrasqueira, A...|
|                null|
|                null|
|                null|
|                null|
|[Condomínio fecha...|
|[Churrasqueira, C...|
|[Churrasqueira, P...|
|[Churrasqueira, E...|
|   [Salão de festas]|
|[Condomínio fecha...|
|[Playground, Chur...|
|                null|
|[Condomínio fecha...|
|[Academia, Churra...|
|[Academia, Condom...|
|[Academia, Condom...|
|                null|
|[Salão de festas,...|
|[Animais permitidos]|
+--------------------+
only showing top 20 rows



# Preenchendo com 0 os valores numéricos nulos

In [66]:
# Numeric columns whose nulls are replaced by 0.
zero_filled_columns = ['banheiros', 'quartos', 'suites', 'vaga', 'condominio', 'iptu']

df_caracteristicas_ = df_caracteristicas.na.fill(0, subset=zero_filled_columns)
df_caracteristicas_.show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|  iptu|   valor|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Churrasqueira, A...|      2|     0|   1|            Realengo|  Zona Oeste|     285.0|   0.0| 22999.0|
|1fe78d41-b8e0-4d2...|    0|     44.0|        1|                null|      2|     0|   0|               Irajá|  Zona Norte|     170.0|   0.0|110000.0|
|1fa1c1e5-e98c-433...|    4|     55.0|        1|                null|      2|     0|   1|              Cosmos|  Zona Oeste|       0.0|   0.0|115000.0|
|a6ab01ae-3d40-40e...|    2|     55.0|        1|                null|      2|     0|   0|     

# Removendo linhas com valores nulos nas colunas id, bairro e zona

In [67]:
# Drop every row that has a null in any of the listed columns.
# NOTE(review): the subset previously listed 'zona' twice. The duplicate had no
# effect and was removed; confirm whether a different column was intended.
df_drop = df_caracteristicas_.na.drop(subset=['id', 'bairro', 'zona'])
df_drop.show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|  iptu|   valor|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+------+--------+
|03a386b6-7ab8-4ef...|    0|     43.0|        1|[Churrasqueira, A...|      2|     0|   1|            Realengo|  Zona Oeste|     285.0|   0.0| 22999.0|
|1fe78d41-b8e0-4d2...|    0|     44.0|        1|                null|      2|     0|   0|               Irajá|  Zona Norte|     170.0|   0.0|110000.0|
|1fa1c1e5-e98c-433...|    4|     55.0|        1|                null|      2|     0|   1|              Cosmos|  Zona Oeste|       0.0|   0.0|115000.0|
|a6ab01ae-3d40-40e...|    2|     55.0|        1|                null|      2|     0|   0|     

# Dropando string vazia no campo zona

In [68]:
# Keep only rows whose `zona` is non-empty after trimming whitespace.
zona_is_filled = f.trim(f.col('zona')) != ''
df_drop_ = df_drop.filter(zona_is_filled)

# Row count per remaining zone, as a sanity check.
df_drop_.groupBy('zona').count().show()

+------------+-----+
|        zona|count|
+------------+-----+
|  Zona Norte|11897|
|  Zona Oeste|32979|
|Zona Central| 1144|
|    Zona Sul|20531|
+------------+-----+



# Variáveis dummy (one-hot encoding) na coluna zona

In [69]:
# Build one indicator column per distinct `zona` value: pivoting on `zona`
# yields 1 where the id has that zone and null elsewhere, and the nulls are
# then turned into 0.
df_dummy_zona = (
    df_drop_
    .groupBy('id')
    .pivot('zona')
    .agg(f.lit(1))
    .fillna(0)
)

# Attach the indicator columns back to the full rows.
# NOTE(review): assumes `id` is unique in df_drop_; duplicated ids would
# multiply rows in this inner join - confirm.
df_join = df_drop_.join(df_dummy_zona, on='id', how='inner')
df_join.show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+-------+---------+------------+----------+----------+--------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|   iptu|    valor|Zona Central|Zona Norte|Zona Oeste|Zona Sul|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+-------+---------+------------+----------+----------+--------+
|4e47e4d4-3326-4eb...|    0|     90.0|        2|          [Elevador]|      3|     1|   0|          Copacabana|    Zona Sul|     950.0| 2677.0| 949020.0|           0|         0|         0|       1|
|02fba6ef-a691-442...|    3|     64.0|        1|[Academia, Churra...|      2|     2|   1|         Jacarepaguá|  Zona Oeste|     784.0|   80.0| 380000.0|           0|         0|         1|       0|
|fc03c1a9-8bbb-

# Variáveis dummy (one-hot encoding) na coluna caracteristicas

In [82]:
# Show every distinct value that occurs inside the `caracteristicas` arrays.
exploded_values = f.explode(f.col('caracteristicas'))
df_join.select(exploded_values).distinct().show()

+------------------+
|               col|
+------------------+
|Condomínio fechado|
|        Playground|
| Portão eletrônico|
|           Piscina|
|Animais permitidos|
|      Portaria 24h|
|          Elevador|
|          Academia|
|   Salão de festas|
|     Churrasqueira|
+------------------+



In [84]:
# Output column name -> value looked up inside the `caracteristicas` array.
# The insertion order defines the order of the new columns.
amenity_flags = {
    'condominio_fechado': 'Condomínio fechado',
    'playground': 'Playground',
    'portao_eletronico': 'Portão eletrônico',
    'piscina': 'Piscina',
    'animais_permitidos': 'Animais permitidos',
    'portaria_24h': 'Portaria 24h',
    'elevador': 'Elevador',
    'academia': 'Academia',
    'salao_de_festas': 'Salão de festas',
    'churrasqueira': 'Churrasqueira',
}

df_array = df_join
for flag_column, amenity in amenity_flags.items():
    # array_contains already returns a boolean column, so it is used directly
    # as the condition instead of being compared to the string 'true'.
    # A null array yields a null condition, which falls through to otherwise(0).
    has_amenity = f.array_contains(f.col('caracteristicas'), amenity)
    df_array = df_array.withColumn(flag_column, f.when(has_amenity, f.lit(1)).otherwise(0))
df_array.show()

+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+-------+---------+------------+----------+----------+--------+------------------+----------+-----------------+-------+------------------+------------+--------+--------+---------------+-------------+
|                  id|andar|area_util|banheiros|     caracteristicas|quartos|suites|vaga|              bairro|        zona|condominio|   iptu|    valor|Zona Central|Zona Norte|Zona Oeste|Zona Sul|condominio_fechado|playground|portao_eletronico|piscina|animais_permitidos|portaria_24h|elevador|academia|salao_de_festas|churrasqueira|
+--------------------+-----+---------+---------+--------------------+-------+------+----+--------------------+------------+----------+-------+---------+------------+----------+----------+--------+------------------+----------+-----------------+-------+------------------+------------+--------+--------+---------------+-------------+
|

# Dropando últimas colunas

In [85]:
# Remove the source columns that have already been expanded into indicator
# columns (`caracteristicas`, `zona`), together with `bairro`.
columns_to_remove = ['caracteristicas', 'bairro', 'zona']

df_final = df_array.drop(*columns_to_remove)
df_final.show()

+--------------------+-----+---------+---------+-------+------+----+----------+-------+---------+------------+----------+----------+--------+------------------+----------+-----------------+-------+------------------+------------+--------+--------+---------------+-------------+
|                  id|andar|area_util|banheiros|quartos|suites|vaga|condominio|   iptu|    valor|Zona Central|Zona Norte|Zona Oeste|Zona Sul|condominio_fechado|playground|portao_eletronico|piscina|animais_permitidos|portaria_24h|elevador|academia|salao_de_festas|churrasqueira|
+--------------------+-----+---------+---------+-------+------+----+----------+-------+---------+------------+----------+----------+--------+------------------+----------+-----------------+-------+------------------+------------+--------+--------+---------------+-------------+
|4e47e4d4-3326-4eb...|    0|     90.0|        2|      3|     1|   0|     950.0| 2677.0| 949020.0|           0|         0|         0|       1|                 0|      

# Vetorização dos dados

In [114]:
from pyspark.ml.feature import VectorAssembler

# Input columns assembled into the `features` vector. `id` and `valor` are
# left out of the vector.
# 'condominio_fechado' was missing from this list even though it is built
# together with the other amenity flags, so it is now included.
# NOTE(review): confirm the omission was not intentional.
x = ['andar',
 'area_util',
 'banheiros',
 'quartos',
 'suites',
 'vaga',
 'condominio',
 'iptu',
 'Zona Central',
 'Zona Norte',
 'Zona Oeste',
 'Zona Sul',
 'condominio_fechado',
 'playground',
 'portao_eletronico',
 'piscina',
 'animais_permitidos',
 'portaria_24h',
 'elevador',
 'academia',
 'salao_de_festas',
 'churrasqueira']

# NOTE(review): no handleInvalid is passed, so the assembler's default applies;
# 'andar' and 'area_util' are not null-filled in the visible cells - confirm
# they contain no nulls.
vect_features = VectorAssembler(inputCols = x, outputCol = 'features')
imoves_vect = vect_features.transform(df_final)

imoves_vect.show(truncate=False)



+------------------------------------+-----+---------+---------+-------+------+----+----------+-------+---------+------------+----------+----------+--------+------------------+----------+-----------------+-------+------------------+------------+--------+--------+---------------+-------------+------------------------------------------------------------------------------------------------+
|id                                  |andar|area_util|banheiros|quartos|suites|vaga|condominio|iptu   |valor    |Zona Central|Zona Norte|Zona Oeste|Zona Sul|condominio_fechado|playground|portao_eletronico|piscina|animais_permitidos|portaria_24h|elevador|academia|salao_de_festas|churrasqueira|features                                                                                        |
+------------------------------------+-----+---------+---------+-------+------+----+----------+-------+---------+------------+----------+----------+--------+------------------+----------+-----------------+-------+-----