# ZAP Challenge - Final predictor

In [0]:
# Mount Google Drive into the Colab filesystem; the dataset, the saved models
# and the output CSV used below all live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import pandas as pd
import numpy as np
import json
import re

from geopy.geocoders import Nominatim

from pandas.io.json import json_normalize

from sklearn.preprocessing import PolynomialFeatures

from scipy import stats

from joblib import dump, load

In [4]:
# Load the test listings. The source file holds one JSON object per line,
# so every line is parsed independently.
houses = []

# `with` guarantees the file handle is closed even if parsing fails; the
# explicit encoding keeps accented text (e.g. "São Paulo") intact regardless
# of the platform default (assumes the file is UTF-8 — TODO confirm).
with open('/content/drive/My Drive/zap_challenge/data/source-4-ds-test.json', 'r', encoding='utf-8') as source_file:
    for line in source_file:
        if line.strip():  # tolerate blank lines such as a trailing newline
            houses.append(json.loads(line))

# Flatten nested objects into dotted column names (e.g. 'address.city').
data = json_normalize(houses)
data.head()

Unnamed: 0,address.city,address.country,address.district,address.geoLocation.location.lat,address.geoLocation.location.lon,address.geoLocation.precision,address.locationId,address.neighborhood,address.state,address.street,address.streetNumber,address.unitNumber,address.zipCode,address.zone,bathrooms,bedrooms,createdAt,description,id,images,listingStatus,owner,parkingSpaces,pricingInfos.businessType,pricingInfos.monthlyCondoFee,pricingInfos.period,pricingInfos.price,pricingInfos.rentalTotalPrice,pricingInfos.yearlyIptu,publicationType,publisherId,suites,title,totalAreas,unitTypes,updatedAt,usableAreas
0,São Paulo,BR,,-23.557225,-46.662765,GEOMETRIC_CENTER,BR>Sao Paulo>NULL>Sao Paulo>Centro>Consolacao,Consolação,São Paulo,Rua Bela Cintra,,,1415000,Centro,1.0,1,2015-10-20T20:52:41Z,Apartamentos de 1 dormitório na Rua Bela Cintr...,89224365f8,[https://s3-sa-east-1.amazonaws.com/vr.images....,ACTIVE,False,1.0,SALE,,,,,,STANDARD,967d57ce20,0.0,Apartamento Bela Cintra,47.0,APARTMENT,2018-11-08T15:02:53.953Z,47.0
1,São Paulo,BR,,-23.592852,-46.581879,ROOFTOP,BR>Sao Paulo>NULL>Sao Paulo>Zona Leste>Quinta ...,Quinta da Paineira,São Paulo,Rua Bruno Cavalcanti Feder,100.0,,3152155,Zona Leste,0.0,2,2018-07-31T06:10:07.427Z,"Ótima localização, próximo ao shopping Central...",363731333f,[http://images.ingaiasites.com.br/AolwiwJLLpET...,ACTIVE,False,1.0,SALE,0.0,,,,0.0,STANDARD,bddebf057a,0.0,"Apartamento residencial à venda, Quinta da Pai...",55.0,APARTMENT,2018-11-08T16:10:49.374Z,55.0
2,São Paulo,,,-23.493609,-46.638456,ROOFTOP,BR>Sao Paulo>NULL>Sao Paulo>Zona Norte>Santa T...,Chora Menino,São Paulo,Rua Copacabana,313.0,,2461000,,3.0,3,2018-01-25T13:57:14.203Z,Apartamento maravilhoso com ampla sala ( abriu...,6e6283378a,[https://ssl-w08cnn0135.websiteseguro.com/mira...,ACTIVE,False,2.0,SALE,686.0,,,,,STANDARD,d7190e8f4c,1.0,"Apartamento em Santa Terezinha - São Paulo, SP",,APARTMENT,2019-02-12T18:29:26.933Z,92.0
3,São Paulo,,,-23.607981,-46.68618,GEOMETRIC_CENTER,BR>Sao Paulo>NULL>Sao Paulo>Zona Sul>Brooklin,Brooklin Paulista,São Paulo,Rua Guararapes,,,4561003,,4.0,3,2018-04-27T11:33:34.096Z,"Apartamento residencial à venda, Brooklin Paul...",4c29a27f44,[http://images.ingaiasites.com.br/3hLtXgzu4KQj...,ACTIVE,False,3.0,SALE,770.0,,,,368.0,STANDARD,316d75f06f,3.0,Apartamento reformado com varanda gourmet no B...,145.0,APARTMENT,2019-02-24T23:34:29.306Z,145.0
4,São Paulo,,,-23.540604,-46.715088,ROOFTOP,BR>Sao Paulo>NULL>Sao Paulo>Zona Oeste>Alto de...,Alto de Pinheiros,São Paulo,Rua Pio XI,2174.0,,5468140,,2.0,2,2018-09-16T00:58:01.666Z,,7b16cf224b,[http://cdn1.valuegaia.com.br/watermark/agenci...,ACTIVE,False,1.0,SALE,472.0,,,,79.0,STANDARD,295eddde1a,1.0,"Apartamento Residencial à venda, Alto de Pinhe...",76.0,APARTMENT,2019-02-23T00:33:46.237Z,76.0


### Remoção de features não utilizadas

In [0]:
# Columns that are not fed to the model: identifiers, free text, timestamps,
# address parts other than neighborhood/street/coordinates, and pricing fields
# other than the monthly condo fee.
unused_columns = [
    'id',
    'images',
    'title',
    'description',
    'pricingInfos.price',
    'pricingInfos.period',
    'pricingInfos.yearlyIptu',
    'updatedAt',
    'createdAt',
    'address.country',
    'address.city',
    'address.geoLocation.precision',
    'address.locationId',
    'address.state',
    'address.streetNumber',
    'address.unitNumber',
    'address.zipCode',
    'address.zone',
    'publisherId',
    'publicationType',
    'address.district',
    'listingStatus',
    'owner',
    'pricingInfos.rentalTotalPrice',
]

# `drop` returns a new frame, so the raw `data` stays available for later cells.
dataset = data.drop(columns=unused_columns)

### Preenchimento de campos nulos

In [0]:
# Default value for each column when it is missing.
# The condo fee defaults to 1 (not 0) because its logarithm is taken later;
# coordinates default to 0, which the next step treats as "missing".
null_defaults = {
    'suites': 0,
    'parkingSpaces': 0,
    'bedrooms': 0,
    'bathrooms': 0,
    'pricingInfos.monthlyCondoFee': 1,
    'address.geoLocation.location.lat': 0,
    'address.geoLocation.location.lon': 0,
}

# Reassign instead of `dataset[col].fillna(..., inplace=True)`: the chained
# in-place form operates on a temporary column object and is not guaranteed to
# write back to `dataset` (it warns in pandas 2.x and is a no-op under
# Copy-on-Write).
dataset = dataset.fillna(value=null_defaults)

# A condo fee of exactly 0 is also mapped to 1 so that log(fee) is defined.
dataset['pricingInfos.monthlyCondoFee'] = dataset['pricingInfos.monthlyCondoFee'].replace(0, 1)

### Preenchimento de coordenadas geográficas em branco

In [0]:
LAT_COL = 'address.geoLocation.location.lat'
LON_COL = 'address.geoLocation.location.lon'
NEIGHBORHOOD_COL = 'address.neighborhood'

# A longitude of 0 is the placeholder for "coordinates unknown".
missing_coords = dataset[LON_COL] == 0

# Mean position of each neighborhood, computed ONLY from rows with known
# coordinates. Including the 0 placeholders (as a plain mean over the whole
# neighborhood would) drags the imputed position toward (0, 0).
neighborhood_means = (
    dataset.loc[~missing_coords]
    .groupby(NEIGHBORHOOD_COL)[[LAT_COL, LON_COL]]
    .mean()
)

for coord_col in (LAT_COL, LON_COL):
    imputed = dataset.loc[missing_coords, NEIGHBORHOOD_COL].map(neighborhood_means[coord_col])
    # Neighborhoods with no known coordinates at all (or a missing
    # neighborhood) cannot be imputed; keep the 0 placeholder for those rows.
    dataset.loc[missing_coords, coord_col] = imputed.fillna(0)

# The address text was only needed for the imputation above.
dataset = dataset.drop(columns=['address.neighborhood', 'address.street'])

### Preenchimento de áreas em branco

In [0]:
def fill_zero(row):
  """Return `row` with a zero 'usableAreas' replaced by its 'totalAreas'.

  The input row is never modified: a copy is made before writing, because
  mutating the object handed over by `DataFrame.apply` is unsupported.
  Retained as a row-level helper; the frame-level fill below is vectorized.
  """
  if row['usableAreas'] == 0:
    row = row.copy()
    row['usableAreas'] = row['totalAreas']

  return row

# Usable area falls back to total area when it is missing or zero.
usable_areas = dataset['usableAreas'].fillna(dataset['totalAreas'])
usable_areas = usable_areas.mask(usable_areas == 0, dataset['totalAreas'])

# If neither area is known, use 1 so that log(usableAreas) stays defined.
usable_areas = usable_areas.fillna(1).replace(0, 1)

# Reassign rather than mutate in place: chained `inplace=True` calls on a
# column are not guaranteed to write back to the frame.
dataset = dataset.drop(columns=['totalAreas'])
dataset['usableAreas'] = usable_areas

### Variáveis categóricas

In [9]:
# One-hot encode the remaining non-numeric columns. Only categories that occur
# in this file get a column, so the result can be narrower than the training
# feature set and is widened to the training layout further below.
dataset = pd.get_dummies(dataset)
dataset.head()

Unnamed: 0,address.geoLocation.location.lat,address.geoLocation.location.lon,bathrooms,bedrooms,parkingSpaces,pricingInfos.monthlyCondoFee,suites,usableAreas,pricingInfos.businessType_SALE,unitTypes_APARTMENT
0,-23.557225,-46.662765,1.0,1,1.0,1.0,0.0,47.0,1,1
1,-23.592852,-46.581879,0.0,2,1.0,1.0,0.0,55.0,1,1
2,-23.493609,-46.638456,3.0,3,2.0,686.0,1.0,92.0,1,1
3,-23.607981,-46.68618,4.0,3,3.0,770.0,3.0,145.0,1,1
4,-23.540604,-46.715088,2.0,2,1.0,472.0,1.0,76.0,1,1


### Variáveis contínuas

In [10]:
# Continuous features that are expanded into degree-2 polynomial terms.
columns = ['usableAreas', 'suites', 'parkingSpaces', 'bedrooms', 'bathrooms', 'pricingInfos.monthlyCondoFee']

poly = PolynomialFeatures(2)
cols = dataset[columns]

xp = poly.fit_transform(cols)

# scikit-learn renamed `get_feature_names` to `get_feature_names_out` and
# removed the old name in 1.2; support both so the cell runs on either side
# of that change. Both produce names such as 'usableAreas^2'.
if hasattr(poly, 'get_feature_names_out'):
  pcols = list(poly.get_feature_names_out(cols.columns))
else:
  pcols = list(poly.get_feature_names(cols.columns))

polynomials = pd.DataFrame(xp, columns=pcols)

# Log terms are computed from the degree-1 columns before those are dropped.
polynomials['usableAreas_log'] = np.log(polynomials['usableAreas'])
polynomials['pricingInfos.monthlyCondoFee_log'] = np.log(polynomials['pricingInfos.monthlyCondoFee'])

# Drop the degree-1 columns here; they are appended again below together with
# every other column of `dataset`, so they end up after the polynomial terms.
polynomials = polynomials.drop(columns=columns)

# `.values` copies by position, ignoring index labels.
for col in dataset.columns.values:
  polynomials[col] = dataset[col].values

polynomials.head()

Unnamed: 0,1,usableAreas^2,usableAreas suites,usableAreas parkingSpaces,usableAreas bedrooms,usableAreas bathrooms,usableAreas pricingInfos.monthlyCondoFee,suites^2,suites parkingSpaces,suites bedrooms,suites bathrooms,suites pricingInfos.monthlyCondoFee,parkingSpaces^2,parkingSpaces bedrooms,parkingSpaces bathrooms,parkingSpaces pricingInfos.monthlyCondoFee,bedrooms^2,bedrooms bathrooms,bedrooms pricingInfos.monthlyCondoFee,bathrooms^2,bathrooms pricingInfos.monthlyCondoFee,pricingInfos.monthlyCondoFee^2,usableAreas_log,pricingInfos.monthlyCondoFee_log,address.geoLocation.location.lat,address.geoLocation.location.lon,bathrooms,bedrooms,parkingSpaces,pricingInfos.monthlyCondoFee,suites,usableAreas,pricingInfos.businessType_SALE,unitTypes_APARTMENT
0,1.0,2209.0,0.0,47.0,47.0,47.0,47.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.850148,0.0,-23.557225,-46.662765,1.0,1,1.0,1.0,0.0,47.0,1,1
1,1.0,3025.0,0.0,55.0,110.0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,4.0,0.0,2.0,0.0,0.0,1.0,4.007333,0.0,-23.592852,-46.581879,0.0,2,1.0,1.0,0.0,55.0,1,1
2,1.0,8464.0,92.0,184.0,276.0,276.0,63112.0,1.0,2.0,3.0,3.0,686.0,4.0,6.0,6.0,1372.0,9.0,9.0,2058.0,9.0,2058.0,470596.0,4.521789,6.530878,-23.493609,-46.638456,3.0,3,2.0,686.0,1.0,92.0,1,1
3,1.0,21025.0,435.0,435.0,435.0,580.0,111650.0,9.0,9.0,9.0,12.0,2310.0,9.0,9.0,12.0,2310.0,9.0,12.0,2310.0,16.0,3080.0,592900.0,4.976734,6.646391,-23.607981,-46.68618,4.0,3,3.0,770.0,3.0,145.0,1,1
4,1.0,5776.0,76.0,76.0,152.0,152.0,35872.0,1.0,1.0,2.0,2.0,472.0,1.0,2.0,2.0,472.0,4.0,4.0,944.0,4.0,944.0,222784.0,4.330733,6.156979,-23.540604,-46.715088,2.0,2,1.0,472.0,1.0,76.0,1,1


### Adaptação das features

Os dados do dataset de teste não têm muitas das características categóricas presentes no dataset de treino que foi usado para gerar o modelo; neste caso, será necessário completar o shape do input para utilizar o modelo.

In [11]:
# Feature layout the models expect, in order.
# NOTE(review): 'pricingInfos.price' (the prediction target) is listed among
# the inputs and ends up zero-filled here — confirm against the training
# notebook that this matches how the models were fitted.
train_columns = ['1', 'usableAreas^2', 'usableAreas suites', 'usableAreas parkingSpaces',
       'usableAreas bedrooms', 'usableAreas bathrooms',
       'usableAreas pricingInfos.monthlyCondoFee', 'suites^2',
       'suites parkingSpaces', 'suites bedrooms', 'suites bathrooms',
       'suites pricingInfos.monthlyCondoFee', 'parkingSpaces^2',
       'parkingSpaces bedrooms', 'parkingSpaces bathrooms',
       'parkingSpaces pricingInfos.monthlyCondoFee', 'bedrooms^2',
       'bedrooms bathrooms', 'bedrooms pricingInfos.monthlyCondoFee',
       'bathrooms^2', 'bathrooms pricingInfos.monthlyCondoFee',
       'pricingInfos.monthlyCondoFee^2', 'usableAreas_log',
       'pricingInfos.monthlyCondoFee_log', 'address.geoLocation.location.lat',
       'address.geoLocation.location.lon', 'bathrooms', 'bedrooms',
       'parkingSpaces', 'pricingInfos.monthlyCondoFee', 'pricingInfos.price',
       'suites', 'usableAreas', 'pricingInfos.businessType_SALE',
       'unitTypes_APARTMENT', 'unitTypes_BUSINESS', 'unitTypes_CLINIC',
       'unitTypes_COMMERCIAL_ALLOTMENT_LAND', 'unitTypes_COMMERCIAL_PROPERTY',
       'unitTypes_CONDOMINIUM', 'unitTypes_COUNTRY_HOUSE', 'unitTypes_FARM',
       'unitTypes_FLAT', 'unitTypes_HOME', 'unitTypes_KITNET',
       'unitTypes_OFFICE', 'unitTypes_PENTHOUSE',
       'unitTypes_RESIDENTIAL_ALLOTMENT_LAND',
       'unitTypes_RESIDENTIAL_BUILDING', 'unitTypes_SHED_DEPOSIT_WAREHOUSE',
       'unitTypes_STORE', 'unitTypes_TWO_STORY_HOUSE']

# Project the features onto the training layout in a single step: columns
# missing from this file are created as NaN and then zero-filled, and columns
# not in `train_columns` are discarded. This replaces growing an empty frame
# one row at a time, which re-allocates on every insertion and leaves the
# columns with object dtype.
test_data = polynomials.reindex(columns=train_columns).fillna(0).astype(float)
test_data.head()

Unnamed: 0,1,usableAreas^2,usableAreas suites,usableAreas parkingSpaces,usableAreas bedrooms,usableAreas bathrooms,usableAreas pricingInfos.monthlyCondoFee,suites^2,suites parkingSpaces,suites bedrooms,suites bathrooms,suites pricingInfos.monthlyCondoFee,parkingSpaces^2,parkingSpaces bedrooms,parkingSpaces bathrooms,parkingSpaces pricingInfos.monthlyCondoFee,bedrooms^2,bedrooms bathrooms,bedrooms pricingInfos.monthlyCondoFee,bathrooms^2,bathrooms pricingInfos.monthlyCondoFee,pricingInfos.monthlyCondoFee^2,usableAreas_log,pricingInfos.monthlyCondoFee_log,address.geoLocation.location.lat,address.geoLocation.location.lon,bathrooms,bedrooms,parkingSpaces,pricingInfos.monthlyCondoFee,pricingInfos.price,suites,usableAreas,pricingInfos.businessType_SALE,unitTypes_APARTMENT,unitTypes_BUSINESS,unitTypes_CLINIC,unitTypes_COMMERCIAL_ALLOTMENT_LAND,unitTypes_COMMERCIAL_PROPERTY,unitTypes_CONDOMINIUM,unitTypes_COUNTRY_HOUSE,unitTypes_FARM,unitTypes_FLAT,unitTypes_HOME,unitTypes_KITNET,unitTypes_OFFICE,unitTypes_PENTHOUSE,unitTypes_RESIDENTIAL_ALLOTMENT_LAND,unitTypes_RESIDENTIAL_BUILDING,unitTypes_SHED_DEPOSIT_WAREHOUSE,unitTypes_STORE,unitTypes_TWO_STORY_HOUSE
0,1.0,2209.0,0.0,47.0,47.0,47.0,47.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.850148,0.0,-23.557225,-46.662765,1.0,1.0,1.0,1.0,0.0,0.0,47.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,3025.0,0.0,55.0,110.0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,4.0,0.0,2.0,0.0,0.0,1.0,4.007333,0.0,-23.592852,-46.581879,0.0,2.0,1.0,1.0,0.0,0.0,55.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,8464.0,92.0,184.0,276.0,276.0,63112.0,1.0,2.0,3.0,3.0,686.0,4.0,6.0,6.0,1372.0,9.0,9.0,2058.0,9.0,2058.0,470596.0,4.521789,6.530878,-23.493609,-46.638456,3.0,3.0,2.0,686.0,0.0,1.0,92.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,21025.0,435.0,435.0,435.0,580.0,111650.0,9.0,9.0,9.0,12.0,2310.0,9.0,9.0,12.0,2310.0,9.0,12.0,2310.0,16.0,3080.0,592900.0,4.976734,6.646391,-23.607981,-46.68618,4.0,3.0,3.0,770.0,0.0,3.0,145.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,5776.0,76.0,76.0,152.0,152.0,35872.0,1.0,1.0,2.0,2.0,472.0,1.0,2.0,2.0,472.0,4.0,4.0,944.0,4.0,944.0,222784.0,4.330733,6.156979,-23.540604,-46.715088,2.0,2.0,1.0,472.0,0.0,1.0,76.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Carregamento do modelo gerado

In [0]:
# Locations of the persisted regressors on the mounted Drive.
MODEL_GBR_PATH = '/content/drive/My Drive/zap_challenge/models/gbr.joblib'
MODEL_LASSO_PATH = '/content/drive/My Drive/zap_challenge/models/lasso.joblib'
MODEL_ENET_PATH = '/content/drive/My Drive/zap_challenge/models/enet.joblib'

# Deserialize the three models in the same order as the paths above.
# joblib files are pickle-based, so only load artifacts from a trusted source.
gbr, lasso, enet = [
    load(model_path)
    for model_path in (MODEL_GBR_PATH, MODEL_LASSO_PATH, MODEL_ENET_PATH)
]

### Teste interativo

In [44]:
# Row of the test set to inspect; change and re-run to look at another listing.
i = 4

# The models expect a 2-D input, hence the single row wrapped in a list.
x = [test_data.values[i]]
y_gbr = gbr.predict(x)
y_lasso = lasso.predict(x)
y_enet = enet.predict(x)

print(dataset.iloc[i])
print()
# `predict` returns a one-element array; take element 0 explicitly, because
# converting an array with ndim > 0 to a Python scalar is deprecated in NumPy.
# The predictions are exponentiated before display (the models appear to have
# been fitted on log(price) — TODO confirm against the training notebook).
print('Pred from gbr:\t\t{}'.format(int(np.exp(y_gbr[0]))))
print('Pred from lasso:\t{}'.format(int(np.exp(y_lasso[0]))))
print('Pred from enet:\t\t{}'.format(int(np.exp(y_enet[0]))))

address.geoLocation.location.lat    -23.540604
address.geoLocation.location.lon    -46.715088
bathrooms                             2.000000
bedrooms                              2.000000
parkingSpaces                         1.000000
pricingInfos.monthlyCondoFee        472.000000
suites                                1.000000
usableAreas                          76.000000
pricingInfos.businessType_SALE        1.000000
unitTypes_APARTMENT                   1.000000
Name: 4, dtype: float64

Pred from gbr:		12825
Pred from lasso:	390944
Pred from enet:		141353


### Por que gosto do teste interativo

Os resultados do modelo de Gradient Boosting Regressor pareciam muito promissores durante seu treino e validação, entretanto, após o teste interativo no dataset de teste, o modelo Lasso mostrou predições que parecem fazer mais sentido que as predições do modelo de Boosting. Utilizarei os resultados do preditor Lasso para a submissão. 

### Predição dos dados de teste

In [0]:
# Predict with the Lasso model for every test listing and undo the log scale
# with exp before export.
lasso_outputs = lasso.predict(test_data.values)

# Pair each listing id (taken from the raw frame, where it was not dropped)
# with its predicted price; row order is the same in `data` and `test_data`.
predictions = pd.DataFrame({
    'id': data['id'],
    'price': np.exp(lasso_outputs),
})

# Submission file: two columns, no index column.
predictions.to_csv('/content/drive/My Drive/zap_challenge/predictions.csv', index=False)