REAL STATE 
==========

A partir de um dataset com dados de imóveis (metragem, cidade, bairro, quartos, etc), queremos prever o valor de cada registro usando redes neurais.

O dataset foi obtido a partir de _webscrapping_ em sites de publicação de anúncios 

## Imports

In [142]:
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from keras.layers import Dense, Activation, Dropout, Flatten
from keras.models import Sequential
from keras.callbacks import EarlyStopping

## Config

In [143]:
# Evita mostrar números em formato científico (para melhor visualização)
np.set_printoptions(suppress=True)

# tamanhos padrão e visualização de plots
PLOT_WIDE = (14,7)
PLOT_MEDIUM = (10,8)
sns.set_style("darkgrid")

## Load data

In [144]:
data = pd.read_csv('./../../../../../python/other_python/zap-project/results/imoveis_data.csv')

In [145]:
data.shape

(20709, 12)

In [146]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,size,rooms,wc,garage_spots,price,neighborhood,city,update_time,price_sqm,link
0,0.0,2528927815,130,3,3,2,2550000,Cabo Branco,João Pessoa,2023-08-16 19:33:46,19615.384615,
1,1.0,2641054378,102,3,4,0,777695,Altiplano Cabo Branco,João Pessoa,2023-08-16 19:33:46,7624.460784,
2,2.0,2617664208,66,3,1,1,755000,Ponta do Seixas,João Pessoa,2023-08-16 19:33:46,11439.393939,
3,3.0,2612089563,108,3,3,1,375000,Tambaú,João Pessoa,2023-08-16 19:33:46,3472.222222,
4,4.0,2530653710,57,2,2,1,178000,Ernesto Geisel,João Pessoa,2023-08-16 19:33:46,3122.807018,


## Preprocessing

In [147]:
# Limpar colunas desnecessárias 
data.drop(data.columns[[0, 1, 9, 10, 11]], axis=1, inplace=True)

In [148]:
data.head()

Unnamed: 0,size,rooms,wc,garage_spots,price,neighborhood,city
0,130,3,3,2,2550000,Cabo Branco,João Pessoa
1,102,3,4,0,777695,Altiplano Cabo Branco,João Pessoa
2,66,3,1,1,755000,Ponta do Seixas,João Pessoa
3,108,3,3,1,375000,Tambaú,João Pessoa
4,57,2,2,1,178000,Ernesto Geisel,João Pessoa


In [149]:
# missing values
data.isna().sum()

size            0
rooms           0
wc              0
garage_spots    0
price           0
neighborhood    0
city            0
dtype: int64

In [150]:
# Duplicates
data.drop_duplicates(keep='first', inplace=True)
data.duplicated().sum()

0

In [151]:

# nas características rooms, wc e garage_spots temos alguns valores não numéricos que indicam, por exemplo 2-3 \
# vamos rodar uma função para calcular a média desses valores e ter um resultado mais acurado 
data.rooms.unique()

array(['3', '2', '4', '20', '1', '5', '17', '6', '1 - 2', '2 - 3', '8',
       '13', '3 - 4', '1 - 3'], dtype=object)

In [152]:
# Tratamento de valores numéricos que estão constando como str
# por exemplo: rooms nem sempre é número: 1 - 2

def update_data(i):
    output = None
    try:
        output = int(i)
    except:
        output = np.array(i.split(' - ')).astype(int).mean()
    
    return output

assert update_data('1') == 1, 'Não foi possível castear to int'
assert update_data('2 - 3') == 2.5, 'Não foi possível castear to int'


In [153]:
# Atualizar 3 colunas
for col in ['rooms', 'wc', 'garage_spots']:
    data[col] = data[col].map(lambda x : update_data(x)).astype(float)

data.dtypes, data.head()

(size              int64
 rooms           float64
 wc              float64
 garage_spots    float64
 price             int64
 neighborhood     object
 city             object
 dtype: object,
    size  rooms   wc  garage_spots    price           neighborhood         city
 0   130    3.0  3.0           2.0  2550000            Cabo Branco  João Pessoa
 1   102    3.0  4.0           0.0   777695  Altiplano Cabo Branco  João Pessoa
 2    66    3.0  1.0           1.0   755000        Ponta do Seixas  João Pessoa
 3   108    3.0  3.0           1.0   375000                 Tambaú  João Pessoa
 4    57    2.0  2.0           1.0   178000         Ernesto Geisel  João Pessoa)

In [154]:
# label encoder para cidade e bairro

encode = LabelEncoder()
for col in data.select_dtypes('object'):
    print(col)
    data[col] = encode.fit_transform(data[col])

data.head()

neighborhood
city


Unnamed: 0,size,rooms,wc,garage_spots,price,neighborhood,city
0,130,3.0,3.0,2.0,2550000,23,2
1,102,3.0,4.0,0.0,777695,4,2
2,66,3.0,1.0,1.0,755000,115,2
3,108,3.0,3.0,1.0,375000,137,2
4,57,2.0,2.0,1.0,178000,53,2


## Particionamento da base

In [155]:
y = data['price']
X = data.drop('price', axis=1)

In [156]:
y[:5], X[:5]

  y[:5], X[:5]


(0    2550000
 1     777695
 2     755000
 3     375000
 4     178000
 Name: price, dtype: int64,
    size  rooms   wc  garage_spots  neighborhood  city
 0   130    3.0  3.0           2.0            23     2
 1   102    3.0  4.0           0.0             4     2
 2    66    3.0  1.0           1.0           115     2
 3   108    3.0  3.0           1.0           137     2
 4    57    2.0  2.0           1.0            53     2)

In [157]:
# Normalizar
y /= np.max(y)
y

0        0.017347
1        0.005290
2        0.005136
3        0.002551
4        0.001211
           ...   
20702    0.000471
20705    0.002347
20706    0.003558
20707    0.003197
20708    0.002551
Name: price, Length: 17188, dtype: float64

In [158]:
X /= np.max(X, axis=0)
X

Unnamed: 0,size,rooms,wc,garage_spots,neighborhood,city
0,0.147895,0.15,0.166667,0.04,0.152318,0.333333
1,0.116041,0.15,0.222222,0.00,0.026490,0.333333
2,0.075085,0.15,0.055556,0.02,0.761589,0.333333
3,0.122867,0.15,0.166667,0.02,0.907285,0.333333
4,0.064846,0.10,0.111111,0.02,0.350993,0.333333
...,...,...,...,...,...,...
20702,0.011377,0.10,0.055556,0.02,0.629139,0.333333
20705,0.093288,0.15,0.111111,0.02,0.476821,0.833333
20706,0.068259,0.15,0.166667,0.04,0.582781,0.833333
20707,0.071672,0.15,0.055556,0.02,0.192053,0.833333


In [159]:
# split train, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [160]:
print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

X_train:  (13750, 6)
X_test:  (3438, 6)
y_train:  (13750,)
y_test:  (3438,)


In [161]:
# split train, validate
# do conjunto de treino, selecionar 20% para validação
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

In [162]:
# Reshaping y
# y_train = np.array(y_train).reshape(-1,1)
# y_val = np.array(y_val).reshape(-1,1)
# y_test = np.array(y_test).reshape(-1,1)

In [163]:
print('X_train: ', X_train.shape)
print('X_val: ', X_val.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_val: ', y_val.shape)
print('y_test: ', y_test.shape)


X_train:  (11000, 6)
X_val:  (2750, 6)
X_test:  (3438, 6)
y_train:  (11000,)
y_val:  (2750,)
y_test:  (3438,)


## Treinamento, definição de arquitetura

In [164]:
model = Sequential()

model.add(Flatten())
model.add(Dense(units=6, activation='relu', input_dim=12))
model.add(Dropout(0.2))
model.add(Dense(units=6, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=6, activation='sigmoid'))

## Definição de otimizadores

In [167]:
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'Adam', metrics=[keras.metrics.MeanSquaredError()])

In [168]:
es = EarlyStopping(monitor='val_loss', min_delta=0.001, \
                   patience=10, verbose=1, mode='auto')

historico = model.fit(X_train,
                      y_train,
                      epochs=100,
                      verbose=1,
                      validation_data=(X_val, y_val),
                      callbacks=[es])


Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 13: early stopping


In [169]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_7 (Flatten)         (None, 6)                 0         
                                                                 
 dense_21 (Dense)            (None, 6)                 42        
                                                                 
 dropout_12 (Dropout)        (None, 6)                 0         
                                                                 
 dense_22 (Dense)            (None, 6)                 42        
                                                                 
 dropout_13 (Dropout)        (None, 6)                 0         
                                                                 
 dense_23 (Dense)            (None, 6)                 42        
                                                                 
Total params: 126 (504.00 Byte)
Trainable params: 126 

In [173]:
y_pred = model.predict(X_test)
y_pred, y_test

  1/108 [..............................] - ETA: 2s



(array([[0.999998  , 0.00000513, 0.00000121, 0.00000223, 0.00000712,
         0.00006001],
        [0.9999998 , 0.00000057, 0.0000001 , 0.00000021, 0.00000087,
         0.00001081],
        [0.99999917, 0.00000217, 0.00000046, 0.00000089, 0.0000031 ,
         0.00003059],
        ...,
        [0.9999984 , 0.00000404, 0.00000093, 0.00000172, 0.00000564,
         0.00004959],
        [1.        , 0.00000001, 0.        , 0.        , 0.00000001,
         0.00000028],
        [1.        , 0.00000007, 0.00000001, 0.00000002, 0.00000011,
         0.00000203]], dtype=float32),
 655      0.002803
 3930     0.003054
 14779    0.003264
 5702     0.002687
 14199    0.001492
            ...   
 13786    0.002650
 13947    0.003912
 2478     0.002003
 5436     0.009905
 18929    0.007211
 Name: price, Length: 3438, dtype: float64)

## Avaliação do modelo

## Regressão