# Desafio IA SERPRO 2020 - MP

Robson de Sousa Martins
________________________________________________________________________________________________________________________

**Página do Desafio:** [https://www.kaggle.com/c/desafioiamp2020/overview](https://www.kaggle.com/c/desafioiamp2020/overview)

# Bibliotecas Utilizadas

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import hashlib
import re
import geopy

from geopy import distance
from math import sqrt
from io import StringIO
from unicodedata import normalize
from datetime import datetime, timedelta
from time import sleep

from sklearn.metrics import mean_squared_error, r2_score, make_scorer

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

In [42]:
# Estimadores
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import LassoLarsIC
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import GammaRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.neural_network import MLPRegressor
from sklearn.cross_decomposition import CCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import LinearSVR

from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor

In [43]:
# Desliga warnings desnecessários
pd.set_option('mode.chained_assignment',None)

# Funções

In [44]:
# Remove os atributos especificados na lista [attrs]
def drop_attr(df_pre, attrs):
    df_pre = df_pre.copy()
    df_pre.drop(attrs,axis=1,inplace=True)
    return df_pre    

In [45]:
# Remove linhas que contém ND
def drop_nd_rows(df_pre):
    df_pre = df_pre.copy()
    for column in df_pre.columns:
        if str(df_pre[column].dtype) not in ['float64', 'int64']:
            df_pre = df_pre[df_pre[column] != 'ND']
    return df_pre

In [46]:
# Expande coluna "amenities"
def expand_amenities(df_pre,df_test):
    df_pre = df_pre.copy()
    df_test = df_test.copy()
    # cria dataframes para as novas colunas
    df_amenities = pd.DataFrame(index=df_pre.index)
    df_test_amenities = pd.DataFrame(index=df_test.index)
    # varre dataframe de treino
    for i in df_pre.index:
        # lê cada linha da coluna "amenities" como uma string csv
        reader = csv.reader(StringIO((df_pre.amenities[i])[1:-1]), delimiter=',')
        for row in reader:
            for item in row:
                column = re.sub('[^A-Za-z0-9]+', '_', item)
                # inclui uma nova coluna no dataframe "amenities", valor default para rows=0
                if column not in df_amenities.columns:
                    df_amenities[column] = 0
                if column not in df_test_amenities.columns:
                    df_test_amenities[column] = 0
                # coloca 1 na row porque a 'amenity' está na linha lida
                df_amenities[column][i] = 1        
    # varre dataframe de teste
    for i in df_test.index:
        # lê cada linha da coluna "amenities" como uma string csv
        reader = csv.reader(StringIO((df_test.amenities[i])[1:-1]), delimiter=',')
        for row in reader:
            for item in row:
                column = re.sub('[^A-Za-z0-9]+', '_', item)
                # inclui uma nova coluna no dataframe "amenities", valor default para rows=0
                if column not in df_amenities.columns:
                    df_amenities[column] = 0
                if column not in df_test_amenities.columns:
                    df_test_amenities[column] = 0
                # coloca 1 na row porque a 'amenity' está na linha lida
                df_test_amenities[column][i] = 1        

    # elimina coluna "amenities" original
    df_pre.drop(['amenities'],axis=1,inplace=True)            
    df_test.drop(['amenities'],axis=1,inplace=True)            
    # concatena novas colunas "amenities" expandidas
    df_pre = pd.concat([df_pre,df_amenities],axis=1)
    df_test = pd.concat([df_test,df_test_amenities],axis=1)
    return df_pre, df_test

In [47]:
# Remove acentos
def str_normalize(s):
    return normalize('NFKD',str(s)).encode('ASCII','ignore').decode('ASCII')

# calcula hash (int64) de string
def md5_hash(x):
    encoder = hashlib.md5(str(x).encode('utf-8'))
    return int(encoder.hexdigest()[16:],16) # apenas 64 bits (16 caracteres hex)
    
# Converte os atributos categóricos em quantitativos discretos
# Usa metade de hash md5 como retorno
def cat_to_discrete_by_hash(df_pre,df_test):
    df_pre = df_pre.copy()
    df_test = df_test.copy()
    for column in df_pre.columns:
        if str(df_pre[column].dtype) not in ['float64', 'int64']:
            df_pre[column] = df_pre[column].apply(lambda x: md5_hash(str_normalize(x)))
    for column in df_test.columns:
        if str(df_test[column].dtype) not in ['float64', 'int64']:
            df_test[column] = df_test[column].apply(lambda x: md5_hash(str_normalize(x)))
    return df_pre, df_test


# Converte os atributos categóricos em quantitativos discretos
# Usa LabelEncode como conversor
def cat_to_discrete_by_labelencode(df_pre,df_test):
    df_pre = df_pre.copy()
    df_test = df_test.copy()
    encoder = LabelEncoder()
    df_all = pd.concat([df_pre,df_test],axis=0,ignore_index=True)
    for column in df_all.columns:
        if str(df_all[column].dtype) not in ['float64', 'int64']:
            df_all[column] = df_all[column].apply(lambda x: str_normalize(x))
            encoder.fit(list(df_all[column].values))
            if column in df_pre.columns:
                df_pre[column] = df_pre[column].apply(lambda x: str_normalize(x))
                df_pre[column] = encoder.transform(list(df_pre[column].values))
            if column in df_test.columns:
                df_test[column] = df_test[column].apply(lambda x: str_normalize(x))
                df_test[column] = encoder.transform(list(df_test[column].values))
    return df_pre, df_test

# Converte os atributos categóricos em quantitativos discretos
# Usa get_dummies como conversor
# Expande categorias como colunas
def cat_to_discrete_by_dummies(df_pre,df_test):
    df_pre = df_pre.copy()
    df_test = df_test.copy()
    df_dummies = pd.DataFrame(index=df_pre.index)
    df_dummies_test = pd.DataFrame(index=df_test.index)
    # varre dataframe de treino
    for column in df_pre.columns:
        if str(df_pre[column].dtype) not in ['float64', 'int64']:
            dummies = pd.get_dummies(df_pre[column], columns=[column], prefix=column)
            for c in dummies.columns:
                if c not in df_dummies_test.columns:
                    df_dummies_test[c] = 0
                if c not in df_dummies.columns:
                    df_dummies[c] = 0
                df_dummies[c] = dummies[c]
            # remove coluna original
            df_pre.drop([column],axis=1,inplace=True)
    # varre dataframe de teste
    for column in df_test.columns:
        if str(df_test[column].dtype) not in ['float64', 'int64']:
            dummies = pd.get_dummies(df_test[column], columns=[column], prefix=column)
            for c in dummies.columns:
                if c not in df_dummies.columns:
                    df_dummies[c] = 0
                if c not in df_dummies_test.columns:
                    df_dummies_test[c] = 0
                df_dummies_test[c] = dummies[c]
            # remove coluna original
            df_test.drop([column],axis=1,inplace=True)
    # concatena novas colunas de categorias expandidas
    df_pre = pd.concat([df_pre,df_dummies],axis=1)
    df_test = pd.concat([df_test,df_dummies_test],axis=1)
    return df_pre, df_test

In [48]:
# Normaliza os dados discretos
def data_normalize(scaler,df_pre,df_test):
    df_pre = df_pre.copy()
    df_test = df_test.copy()
    # dataframe treino+teste 
    df_a = pd.concat([df_pre,df_test],axis=0)
    df_a.drop(['id','price'],axis=1,inplace=True) #não converte id e price
    # faz fit
    scaler.fit(df_a.values)
    # faz transform no dataframe de treino
    df_X = df_pre.drop(['id','price'],axis=1) #não converte id e price
    df_id = df_pre['id']
    df_price = df_pre['price']
    scaled_X = scaler.transform(df_X.values)
    df_pre = pd.DataFrame(scaled_X,columns=df_X.columns,index=df_X.index)
    df_pre['price'] = df_price
    df_pre['id'] = df_id
    # faz transform no dataframe de teste
    df_X = df_test.drop(['id'],axis=1) #não converte id
    df_id = df_test['id']
    scaled_X = scaler.transform(df_X.values)
    df_test = pd.DataFrame(scaled_X,columns=df_X.columns,index=df_X.index)
    df_test['id'] = df_id
    return df_pre,df_test    

# Normaliza os dados discretos
# Usa MinMaxScaler
def normalize_by_minmax(df_pre,df_test):
    scaler = MinMaxScaler()
    return data_normalize(scaler,df_pre,df_test)

# Normaliza os dados discretos
# Usa RobustScaler
def normalize_by_robust(df_pre,df_test):
    scaler = RobustScaler()
    return data_normalize(scaler,df_pre,df_test)

# Normaliza os dados discretos
# Usa StandardScaler
def normalize_by_standard(df_pre,df_test):
    scaler = StandardScaler()
    return data_normalize(scaler,df_pre,df_test)


In [49]:
# Seleciona atributos com melhor correlação
# 1.0 é a melhor correlação
def reduce_dimension(est,df_pre,df_test,threshold=0.01):
    df_pre = df_pre.copy()
    df_test = df_test.copy()
    X = df_pre.drop(['id','price'],axis=1) # elimina id e price de X
    y = df_pre['price'] # y é o price
    est.fit(X,y)
    # dataframe com os atributos classificados por importância
    df_importances = pd.DataFrame({'col':X.columns,
                  'importance':est.feature_importances_}).sort_values('importance', ascending=False)
    # varre dataframe de atributos
    for i in df_importances.index:
        # elimina dos dataframes os atributos mais irrelevantes
        if (df_importances.importance[i] < threshold):
            if df_importances.col[i] in df_pre.columns: # dataframe de treino
                df_pre.drop(df_importances.col[i],axis=1,inplace=True)
            if df_importances.col[i] in df_test.columns: # dataframe de teste
                df_test.drop(df_importances.col[i],axis=1,inplace=True)
    
    return df_pre,df_test,df_importances

In [50]:
# Conserta campo zipcode
def zipcode_process(df_pre):
    df_pre = df_pre.copy()
    for i in df_pre.index:
        s = str(df_pre['zipcode'][i]).strip()
        zipcode = re.search('[\d]{4}[-\s]{1}[\d]{4}',s)
        if zipcode != None:
            r = str(zipcode.group())
            df_pre['zipcode'][i] = r[0:4]+r[5]+'-'+r[6:9]
        else:
            zipcode = re.search('[\d]{5}[-\s]{1}[\d]{3}',s)
            if zipcode != None:
                r = str(zipcode.group())
                df_pre['zipcode'][i] = r
            else:
                zipcode = re.search('[\d]{8}',s)
                if zipcode != None:
                    r = str(zipcode.group())
                    df_pre['zipcode'][i] = r[0:5]+'-'+r[5:8]
                else:
                    zipcode = re.search('[\d]{5}',s)
                    if zipcode != None:
                        r = str(zipcode.group())
                        df_pre['zipcode'][i] = r+'-000'
        s = str(df_pre['zipcode'][i]).strip()
        if len(s) != 9 or s == '00000-000':    
            df_pre['zipcode'][i] = 'ND'
    return df_pre

# Procura bairro pelo cep na lista de cep/bairro    
def search_bairro_from_cep(cep,df_zip):
    try:
        return df_zip.loc[df_zip['zipcode'] == cep].loc[df_zip['neighbourhood'] != 'ND']['neighbourhood'].values[0]
    except:
        return None    

# Procura cep pelo bairro na lista de cep/bairro    
def search_cep_from_bairro(bairro,df_zip):
    try:
        return df_zip.loc[df_zip['neighbourhood'] == bairro].loc[df_zip['zipcode'] != 'ND']['zipcode'].values[0]
    except:
          return None

# Retorna cep,bairro por lat/lng
def address_by_geo(lat,lng):
    global geolocator
    geolocator = geopy.Nominatim(user_agent='desafio-ia-mp_2020')
    sleep(0.1)
    try:
        location = geolocator.reverse((lat,lng))
        cep = location.raw['address']['postcode']
        if len(cep) == 8:
            cep = cep[0:5]+'-'+cep[5:8]
        elif len(cep) == 5:
            cep = cep+'-000'
        bairro = location.raw['address']['suburb'].lower().strip()
        p = re.search('\([^\)]+\)',bairro)
        if p != None:
            bairro = str(p.group())[1:-1].strip()
    except:
        cep = None
        bairro = None
    return cep, bairro

# Busca ceps e bairros faltantes por lat/lng
def fill_nd(df_zip):
    geolocator = geopy.Nominatim(user_agent='desafio-ia-mp_2020')
    k = 0
    for i in df_zip.index:
        k = k + 1
        zipcode = df_zip['zipcode'][i]
        neighbourhood = df_zip['neighbourhood'][i]
        if zipcode == 'ND' or neighbourhood == 'ND':
            bairro = search_bairro_from_cep(zipcode,df_zip)
            cep = search_cep_from_bairro(neighbourhood,df_zip)
            if (zipcode == 'ND' and cep == None) or (neighbourhood == 'ND' and bairro == None):
                lat = df_zip['latitude'][i]
                lng = df_zip['longitude'][i]
                cep, bairro = address_by_geo(lat,lng)
            if zipcode == 'ND' and cep != None and len(cep) == 9:
                print(cep,bairro,'- Get zipcode:',k,'of',len(df_zip.index))
                df_zip['zipcode'][i] = cep
            if neighbourhood == 'ND' and bairro != None and len(bairro) != 0:
                print(cep,bairro,'- Get neighbourhood:',k,'of',len(df_zip.index))
                df_zip['neighbourhood'][i] = bairro
    return df_zip

# Faz uma lista de bairros/cep
def create_zipcode_list(df_pre,df_test,df_zip):
    df_zip = pd.concat([df_pre,df_test],axis=0,ignore_index=True)
    df_zip = pd.DataFrame(df_zip,
                          columns=['neighbourhood','zipcode',
                                   'latitude','longitude'])
    # busca ceps e bairros faltantes
    df_zip = fill_nd(df_zip)
    # elimina repeticoes de zipcode
    df_zip = df_zip.drop_duplicates(subset='zipcode', keep='first')
    return df_zip

# Transforma bairros 'ND' a partir do cep
def zip2neighbourhood(df_pre,df_zip):
    df_pre = df_pre.copy()
    for i in df_pre.index:
        if df_pre['neighbourhood'][i] == 'ND' and df_pre['zipcode'][i] != 'ND':
            n = df_zip.loc[df_zip['zipcode'].str.contains(df_pre['zipcode'][i][0:5])]['neighbourhood'].values
            if len(n) != 0:
                df_pre['neighbourhood'][i] = n[0]
    return df_pre

# Transforma cep 'ND' a partir do bairro
def neighbourhood2zip(df_pre,df_zip):
    df_pre = df_pre.copy()
    for i in df_pre.index:
        if df_pre['zipcode'][i] == 'ND' and df_pre['neighbourhood'][i] != 'ND':
            n = df_zip.loc[df_zip['neighbourhood'] == df_pre['neighbourhood'][i]]['zipcode'].values
            if len(n) != 0:
                df_pre['zipcode'][i] = n[0]
    return df_pre

# Transforma cep 'ND' e bairro 'ND' a partir de lat/lng
def neighbourhood_zip_from_geo(df_pre):
    df_pre = df_pre.copy()
    for i in df_pre.index:
        if df_pre['zipcode'][i] == 'ND' and df_pre['neighbourhood'][i] == 'ND':
            lat = df_pre['latitude'][i]
            lng = df_pre['longitude'][i]
            cep, bairro = address_by_geo(lat,lng)
            if cep != None and len(cep) == 9:
                df_pre['zipcode'][i] = cep
            if bairro != None and len(bairro) != 0:
                df_pre['neighbourhood'][i] = bairro
    return df_pre


# Processa colunas 'zipcode' e 'neighbourhood'
# Preenche dados faltantes, se possivel
def process_zipcode_neighbourhood(df_pre,df_test,df_zip):
    df_pre = df_pre.copy()
    df_test = df_test.copy()
    df_zip = df_zip.copy()
    # conserta cep
    df_pre = zipcode_process(df_pre)
    df_test = zipcode_process(df_test)
    # faz uma lista de bairros/cep
    df_zip = create_zipcode_list(df_pre,df_test,df_zip)
    # transforma bairros 'ND' a partir do cep
    df_pre = zip2neighbourhood(df_pre,df_zip)
    df_test = zip2neighbourhood(df_test,df_zip)
    # transforma cep 'ND' a partir do bairro
    df_pre = neighbourhood2zip(df_pre,df_zip)
    df_test = neighbourhood2zip(df_test,df_zip)
    # Transforma cep 'ND' e bairro 'ND' a partir de lat/lng
    df_pre = neighbourhood_zip_from_geo(df_pre)
    df_test = neighbourhood_zip_from_geo(df_test)
    return df_pre, df_test, df_zip

In [51]:
# Calcula distância para o centro do RJ
def distance_to_center(df_pre, lat_field='latitude', lon_field='longitude'):
    # Centro do RJ -22.9005659,-43.1868733
    return distance.distance((-22.9005659,-43.1868733),(df_pre[lat_field],df_pre[lon_field])).km

# Substitui colunas 'latitude' e 'longitude' por 'distance_center'
# 'distance_center': Distância em km do centro do Rio de Janeiro
def process_latitude_longitude(df_pre,df_test):
    df_pre = df_pre.copy()
    df_test = df_test.copy()
    dist = df_pre.apply(distance_to_center, axis=1)
    df_pre['distance_center'] = dist
    dist = df_test.apply(distance_to_center, axis=1)
    df_test['distance_center'] = dist
    # elimina colunas 'latitude' e 'longitude'
    df_pre.drop(['latitude','longitude'],axis=1,inplace=True)            
    df_test.drop(['latitude','longitude'],axis=1,inplace=True)
    return df_pre, df_test

In [52]:
# Treina um estimador, otimiza hiperparâmetros,
# avalia performance (CV) e retorna métricas de desempenho
def build(X,y,est,grid):
    est_name = est.__class__.__name__
    print('************************')
    print('Testando o estimador',est_name,'...')
    print('************************')
    # Otimiza modelos
    # Uso RMSE como métrica
    regr = GridSearchCV(est,grid,scoring='neg_root_mean_squared_error',n_jobs=-1,cv=5,verbose=100) # 5 folds
    regr.fit(X,y)
    # Obtém as métricas de desempenho - o quanto nosso estimador acertou?
    return regr.best_estimator_, est_name, regr.best_score_, regr.best_params_

# Inicialização

In [53]:
# Semente aleatória a ser usada ao longo desse notebook
random_state=2020

# Nome do arquivo fornecido pelo desafio com os dados rotulados para treino
nome_arquivo_com_rotulos_para_treino = '../input/desafioiamp2020/' + 'treino.csv'

# Nome do arquivo fornecido pelo desafio com os dados não rotulados, que deverão ser analisados pelo modelo construído aqui
nome_arquivo_sem_rotulos = '../input/desafioiamp2020/' + 'teste.csv'

# Nome do arquivo que será criado com os rótulos gerados pelo modelo
# Esse é o arquivo se será submetido à página do desafio
nome_arquivo_rotulado_regressor = '../working/' + 'submissao-equipe_{}.csv'

# Carregando os dados

In [54]:
# Carrega os dados da base rotulada
df = pd.read_csv(nome_arquivo_com_rotulos_para_treino, index_col=None, engine='python', sep =';', encoding="utf-8")
print('Total de registros carregados:',len(df))
# Exibe uma amostra dos dados
df.head()

Total de registros carregados: 18661


Unnamed: 0,id,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,latitude,longitude,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month
0,29727,brux,within an hour,100.0,t,f,ipanema,22420-020,-22.98333,-43.20161,...,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,7,3.0
1,13336,rosane,ND,0.0,f,f,ND,22760,-22.94213,-43.34465,...,0.0,0.0,0.0,0.0,0.0,0.0,f,flexible,1,0.0
2,22192,marco,within a few hours,100.0,f,f,ipanema,22081,-22.98651,-43.19221,...,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,3,0.08
3,25264,fatima,within an hour,100.0,t,t,copacabana,22031-112,-22.96407,-43.18687,...,0.0,0.0,0.0,0.0,0.0,0.0,f,strict_14_with_grace_period,5,0.0
4,23936,giovanni,within a few hours,87.0,f,f,leme,22010-070,-22.96326,-43.17065,...,0.0,0.0,0.0,0.0,0.0,0.0,f,strict_14_with_grace_period,37,0.26


In [55]:
# Carrega os dados da base não rotulada
df_test = pd.read_csv(nome_arquivo_sem_rotulos, index_col=None, engine='python', sep =';', encoding="utf-8")
print('Total de registros carregados:',len(df_test))
# Exibe uma amostra dos dados
df_test.head()

Total de registros carregados: 3293


Unnamed: 0,id,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,latitude,longitude,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month
0,4,patricia,within a few hours,100.0,t,t,ipanema,22081-020,-22.98816,-43.19359,...,10.0,9.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,1,2.26
1,6,seba,within a day,100.0,f,t,copacabana,22031-112,-22.96681,-43.18657,...,0.0,0.0,0.0,0.0,0.0,0.0,f,moderate,1,0.01
2,21,andrea,within an hour,85.0,f,f,copacabana,22070-011,-22.98152,-43.19018,...,10.0,10.0,10.0,10.0,9.0,9.0,t,strict_14_with_grace_period,6,0.37
3,22,diogo,within an hour,100.0,f,f,copacabana,22071-100,-22.98119,-43.19373,...,9.0,9.0,10.0,10.0,8.0,8.0,t,strict_14_with_grace_period,1,1.18
4,27,josé,within a day,100.0,f,f,copacabana,22031-070,-22.96925,-43.18283,...,10.0,8.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,4,0.24


# Preparando os dados para a predição

### Trata valores faltantes/incorretos

In [56]:
# Processa colunas 'zipcode' e 'neighbourhood'
# Corrige ceps mal formatados
# Busca via api, cep e bairro pela lat/lng, e preenche 'ND'
df_zip = pd.DataFrame()
df,df_test,df_zip = process_zipcode_neighbourhood(df,df_test,df_zip)
# Exibe uma amostra dos dados
df.head()

22760-401 jacarepaguá - Get neighbourhood: 2 of 21954
22735-030 tanque - Get neighbourhood: 10 of 21954
22750-009 jacarepaguá - Get neighbourhood: 18 of 21954
22780-085 jacarepaguá - Get neighbourhood: 22 of 21954
20530-270 tijuca - Get zipcode: 23 of 21954
22031-112 copacabana - Get zipcode: 51 of 21954
22031-112 copacabana - Get zipcode: 52 of 21954
22780-000 barra da tijuca - Get neighbourhood: 55 of 21954
22770-104 jacarepaguá - Get neighbourhood: 57 of 21954
20230-100 lapa - Get zipcode: 64 of 21954
22720-410 taquara - Get neighbourhood: 71 of 21954
22733-001 praça seca - Get neighbourhood: 75 of 21954
22420-020 ipanema - Get zipcode: 78 of 21954
23270-220 campo grande - Get neighbourhood: 90 of 21954
20031-000 centro - Get zipcode: 92 of 21954
22780-000 barra da tijuca - Get neighbourhood: 96 of 21954
20715-310 engenho novo - Get zipcode: 102 of 21954
22641-000 barra da tijuca - Get neighbourhood: 106 of 21954
22790-430 recreio dos bandeirantes - Get zipcode: 107 of 21954
22210-0

21920-225 tauá - Get neighbourhood: 1485 of 21954
22031-112 copacabana - Get zipcode: 1490 of 21954
23045-000 barra da tijuca - Get zipcode: 1495 of 21954
23045-000 barra da tijuca - Get neighbourhood: 1495 of 21954
22793-000 barra da tijuca - Get zipcode: 1497 of 21954
22420-020 ipanema - Get zipcode: 1498 of 21954
23045-000 barra da tijuca - Get neighbourhood: 1503 of 21954
22031-112 copacabana - Get zipcode: 1507 of 21954
20230-030 santa teresa - Get zipcode: 1513 of 21954
22750-006 jacarepaguá - Get neighbourhood: 1534 of 21954
22783-560 vargem pequena - Get neighbourhood: 1538 of 21954
22780-085 camorim - Get neighbourhood: 1542 of 21954
22745-080 jacarepaguá - Get neighbourhood: 1559 of 21954
22710-315 curicica - Get neighbourhood: 1577 of 21954
22770-104 pechincha - Get neighbourhood: 1580 of 21954
22211-200 catete - Get zipcode: 1584 of 21954
22241-160 cosme velho - Get zipcode: 1597 of 21954
22031-112 copacabana - Get zipcode: 1601 of 21954
22031-112 copacabana - Get zipcode: 

22750-009 taquara - Get neighbourhood: 2908 of 21954
20031-000 centro - Get zipcode: 2931 of 21954
22031-112 copacabana - Get zipcode: 2942 of 21954
21825-060 bangu - Get neighbourhood: 2943 of 21954
22785-000 recreio dos bandeirantes - Get neighbourhood: 2957 of 21954
22211-200 catete - Get zipcode: 2971 of 21954
22790-000 barra da tijuca - Get neighbourhood: 2980 of 21954
22430-220 leblon - Get zipcode: 2995 of 21954
22793-000 barra da tijuca - Get zipcode: 3003 of 21954
22763-000 jacarepaguá - Get neighbourhood: 3008 of 21954
22420-020 ipanema - Get zipcode: 3011 of 21954
22641-000 barra da tijuca - Get neighbourhood: 3042 of 21954
21940-000 jardim guanabara - Get neighbourhood: 3047 of 21954
22780-670 curicica - Get neighbourhood: 3057 of 21954
22785-085 vargem grande - Get neighbourhood: 3063 of 21954
21920-225 moneró - Get neighbourhood: 3083 of 21954
22451-264 rocinha - Get zipcode: 3091 of 21954
22793-000 barra da tijuca - Get zipcode: 3095 of 21954
22031-112 copacabana - Get z

22740-570 taquara - Get neighbourhood: 4404 of 21954
22785-190 copacabana - Get zipcode: 4432 of 21954
22785-190 copacabana - Get neighbourhood: 4432 of 21954
22785-190 copacabana - Get neighbourhood: 4458 of 21954
22031-112 copacabana - Get zipcode: 4466 of 21954
22031-112 copacabana - Get zipcode: 4471 of 21954
22793-000 barra da tijuca - Get zipcode: 4490 of 21954
22750-054 botafogo - Get neighbourhood: 4491 of 21954
22765-451 jacarepaguá - Get neighbourhood: 4497 of 21954
22765-007 gardênia azul - Get neighbourhood: 4512 of 21954
22793-000 barra da tijuca - Get zipcode: 4535 of 21954
21940-410 jardim guanabara - Get neighbourhood: 4549 of 21954
22745-005 jacarepaguá - Get neighbourhood: 4551 of 21954
20530-270 tijuca - Get zipcode: 4564 of 21954
23520-660 santa cruz - Get neighbourhood: 4566 of 21954
22031-112 copacabana - Get zipcode: 4595 of 21954
22770-000 barra da tijuca - Get neighbourhood: 4607 of 21954
22753-737 jacarepaguá - Get neighbourhood: 4611 of 21954
22775-112 curici

22720-010 taquara - Get neighbourhood: 6153 of 21954
22750-006 copacabana - Get zipcode: 6191 of 21954
22750-006 copacabana - Get neighbourhood: 6191 of 21954
22750-006 copacabana - Get neighbourhood: 6198 of 21954
22031-112 copacabana - Get zipcode: 6204 of 21954
23032-080 guaratiba - Get neighbourhood: 6205 of 21954
22785-150 botafogo - Get zipcode: 6207 of 21954
22785-150 botafogo - Get neighbourhood: 6207 of 21954
22785-150 botafogo - Get neighbourhood: 6214 of 21954
22290-000 botafogo - Get zipcode: 6227 of 21954
22031-112 copacabana - Get zipcode: 6243 of 21954
22031-112 copacabana - Get zipcode: 6245 of 21954
22793-000 barra da tijuca - Get zipcode: 6263 of 21954
22780-081 barra da tijuca - Get neighbourhood: 6286 of 21954
22031-112 copacabana - Get zipcode: 6289 of 21954
23020-240 barra de guaratiba - Get zipcode: 6300 of 21954
22420-020 ipanema - Get zipcode: 6306 of 21954
22290-000 botafogo - Get zipcode: 6325 of 21954
21840-700 senador camará - Get neighbourhood: 6334 of 219

21320-020 praça seca - Get neighbourhood: 7515 of 21954
20211-010 estacio - Get zipcode: 7535 of 21954
22745-200 jacarepaguá - Get neighbourhood: 7544 of 21954
23042-500 campo grande - Get neighbourhood: 7546 of 21954
22061-020 copacabana - Get neighbourhood: 7548 of 21954
21940-410 jardim guanabara - Get neighbourhood: 7560 of 21954
22290-030 botafogo - Get neighbourhood: 7571 of 21954
22245-040 laranjeiras - Get zipcode: 7588 of 21954
22770-000 barra da tijuca - Get neighbourhood: 7606 of 21954
22770-233 pechincha - Get neighbourhood: 7607 of 21954
20250-450 rio comprido - Get zipcode: 7617 of 21954
22740-010 pechincha - Get neighbourhood: 7629 of 21954
21921-000 jardim carioca - Get neighbourhood: 7634 of 21954
23587-130 cosmos - Get neighbourhood: 7637 of 21954
22780-000 barra da tijuca - Get neighbourhood: 7641 of 21954
22733-003 tanque - Get neighbourhood: 7646 of 21954
22775-005 barra da tijuca - Get neighbourhood: 7653 of 21954
22610-095 vidigal - Get zipcode: 7657 of 21954
224

22755-155 jacarepaguá - Get neighbourhood: 8964 of 21954
22031-112 copacabana - Get zipcode: 8966 of 21954
22753-045 jacarepaguá - Get neighbourhood: 8971 of 21954
21870-000 realengo - Get neighbourhood: 8972 of 21954
21931-576 portuguesa - Get neighbourhood: 8978 of 21954
20031-000 centro - Get zipcode: 8983 of 21954
20530-270 tijuca - Get zipcode: 8990 of 21954
22031-112 copacabana - Get zipcode: 8991 of 21954
20560-070 grajaú - Get zipcode: 8992 of 21954
22793-000 barra da tijuca - Get zipcode: 9002 of 21954
23076-460 campo grande - Get neighbourhood: 9004 of 21954
22770-235 pechincha - Get neighbourhood: 9007 of 21954
22793-000 barra da tijuca - Get zipcode: 9028 of 21954
20785-000 maria da graça - Get zipcode: 9034 of 21954
22420-020 ipanema - Get zipcode: 9064 of 21954
22031-112 copacabana - Get zipcode: 9075 of 21954
22031-112 copacabana - Get zipcode: 9077 of 21954
20230-030 santa teresa - Get zipcode: 9088 of 21954
22031-112 copacabana - Get zipcode: 9096 of 21954
22750-009 ja

22723-002 taquara - Get neighbourhood: 10367 of 21954
22725-740 vargem grande - Get neighbourhood: 10378 of 21954
22420-020 ipanema - Get zipcode: 10385 of 21954
22031-112 copacabana - Get zipcode: 10386 of 21954
22031-112 copacabana - Get zipcode: 10394 of 21954
22420-020 ipanema - Get zipcode: 10401 of 21954
22451-050 gávea - Get zipcode: 10414 of 21954
21866-300 bangu - Get neighbourhood: 10418 of 21954
22031-112 copacabana - Get zipcode: 10423 of 21954
22031-112 copacabana - Get zipcode: 10459 of 21954
20031-000 centro - Get zipcode: 10467 of 21954
22610-290 são conrado - Get zipcode: 10474 of 21954
22725-740 pedra de guaratiba - Get neighbourhood: 10482 of 21954
22221-000 glória - Get zipcode: 10496 of 21954
22730-120 taquara - Get neighbourhood: 10498 of 21954
22420-020 ipanema - Get zipcode: 10499 of 21954
22793-000 barra da tijuca - Get zipcode: 10517 of 21954
22725-740 itanhangá - Get neighbourhood: 10555 of 21954
23087-230 campo grande - Get neighbourhood: 10578 of 21954
2203

21930-100 ribeira - Get neighbourhood: 11869 of 21954
22725-740 camorim - Get neighbourhood: 11870 of 21954
22031-112 copacabana - Get zipcode: 11871 of 21954
22031-112 copacabana - Get zipcode: 11873 of 21954
22725-740 jacarepaguá - Get neighbourhood: 11900 of 21954
22031-112 copacabana - Get zipcode: 11901 of 21954
22725-740 barra da tijuca - Get neighbourhood: 11928 of 21954
22290-000 botafogo - Get zipcode: 11931 of 21954
22610-095 vidigal - Get zipcode: 11978 of 21954
22725-740 jardim guanabara - Get neighbourhood: 11993 of 21954
22725-740 barra da tijuca - Get neighbourhood: 11994 of 21954
22725-740 barra da tijuca - Get neighbourhood: 12008 of 21954
22725-740 barra da tijuca - Get neighbourhood: 12023 of 21954
21931-582 jardim carioca - Get neighbourhood: 12042 of 21954
23570-290 santa cruz - Get neighbourhood: 12057 of 21954
22725-740 barra da tijuca - Get neighbourhood: 12078 of 21954
22031-112 copacabana - Get zipcode: 12088 of 21954
22031-112 copacabana - Get zipcode: 12093 

22765-431 anil - Get neighbourhood: 13449 of 21954
22290-000 botafogo - Get zipcode: 13454 of 21954
23065-200 paciência - Get neighbourhood: 13460 of 21954
22725-740 anil - Get neighbourhood: 13469 of 21954
22610-095 vidigal - Get zipcode: 13470 of 21954
22770-104 jacarepaguá - Get neighbourhood: 13481 of 21954
20510-060 andaraí - Get zipcode: 13483 of 21954
21940-410 jardim guanabara - Get neighbourhood: 13490 of 21954
22793-000 barra da tijuca - Get zipcode: 13494 of 21954
22210-050 flamengo - Get zipcode: 13495 of 21954
22031-112 copacabana - Get zipcode: 13506 of 21954
20530-270 tijuca - Get zipcode: 13510 of 21954
22793-000 barra da tijuca - Get zipcode: 13522 of 21954
20031-000 centro - Get zipcode: 13544 of 21954
22725-740 jacarepaguá - Get neighbourhood: 13547 of 21954
20530-270 tijuca - Get zipcode: 13560 of 21954
22793-000 barra da tijuca - Get zipcode: 13572 of 21954
22725-740 barra da tijuca - Get neighbourhood: 13577 of 21954
22725-740 barra da tijuca - Get neighbourhood: 

22031-112 copacabana - Get zipcode: 14760 of 21954
23032-050 guaratiba - Get neighbourhood: 14781 of 21954
22725-740 gardênia azul - Get neighbourhood: 14784 of 21954
22031-112 copacabana - Get zipcode: 14806 of 21954
22725-740 leblon - Get zipcode: 14818 of 21954
22725-740 leblon - Get neighbourhood: 14818 of 21954
22725-740 paquetá - Get neighbourhood: 14838 of 21954
23017-130 campo grande - Get neighbourhood: 14840 of 21954
21735-200 realengo - Get neighbourhood: 14843 of 21954
22430-220 leblon - Get zipcode: 14850 of 21954
22725-740 praça seca - Get neighbourhood: 14856 of 21954
22725-740 barra da tijuca - Get neighbourhood: 14861 of 21954
22725-740 taquara - Get neighbourhood: 14863 of 21954
23032-050 guaratiba - Get neighbourhood: 14877 of 21954
22793-000 barra da tijuca - Get zipcode: 14881 of 21954
22641-726 itanhangá - Get neighbourhood: 14894 of 21954
22725-740 jardim carioca - Get neighbourhood: 14901 of 21954
22290-000 botafogo - Get zipcode: 14904 of 21954
22725-740 cosmos

22750-006 anil - Get neighbourhood: 16221 of 21954
22610-290 são conrado - Get zipcode: 16223 of 21954
22725-740 barra da tijuca - Get neighbourhood: 16241 of 21954
22031-112 copacabana - Get zipcode: 16250 of 21954
22785-085 vargem grande - Get neighbourhood: 16252 of 21954
22775-112 jacarepaguá - Get neighbourhood: 16264 of 21954
21870-140 bangu - Get neighbourhood: 16269 of 21954
22793-000 barra da tijuca - Get zipcode: 16270 of 21954
22725-740 jacarepaguá - Get neighbourhood: 16276 of 21954
22765-451 anil - Get neighbourhood: 16285 of 21954
20531-570 alto da boa vista - Get neighbourhood: 16299 of 21954
21660-000 guadalupe - Get zipcode: 16302 of 21954
22010-070 leme - Get zipcode: 16303 of 21954
20031-000 centro - Get zipcode: 16316 of 21954
22725-740 jardim guanabara - Get neighbourhood: 16319 of 21954
22610-095 vidigal - Get zipcode: 16325 of 21954
22610-095 vidigal - Get zipcode: 16347 of 21954
22725-740 taquara - Get neighbourhood: 16355 of 21954
22211-200 catete - Get zipcode

22765-451 gardênia azul - Get neighbourhood: 17596 of 21954
22750-006 anil - Get neighbourhood: 17608 of 21954
20941-000 são cristóvão - Get zipcode: 17629 of 21954
21010-290 cordovil - Get zipcode: 17646 of 21954
22031-112 copacabana - Get zipcode: 17647 of 21954
22725-740 jardim guanabara - Get neighbourhood: 17652 of 21954
22793-000 barra da tijuca - Get zipcode: 17660 of 21954
22031-112 copacabana - Get zipcode: 17676 of 21954
22725-740 pechincha - Get neighbourhood: 17688 of 21954
22725-740 barra da tijuca - Get neighbourhood: 17710 of 21954
20530-270 tijuca - Get zipcode: 17720 of 21954
22725-740 santa teresa - Get zipcode: 17730 of 21954
22725-740 santa teresa - Get neighbourhood: 17730 of 21954
22725-740 santa teresa - Get zipcode: 17736 of 21954
22725-740 santa teresa - Get neighbourhood: 17736 of 21954
20230-030 santa teresa - Get zipcode: 17748 of 21954
22290-000 botafogo - Get zipcode: 17758 of 21954
22790-430 recreio dos bandeirantes - Get zipcode: 17788 of 21954
22210-050

22723-625 taquara - Get neighbourhood: 19410 of 21954
22725-740 barra da tijuca - Get neighbourhood: 19422 of 21954
22725-740 catete - Get zipcode: 19428 of 21954
22725-740 catete - Get neighbourhood: 19428 of 21954
22211-200 catete - Get zipcode: 19439 of 21954
22725-740 recreio dos bandeirantes - Get neighbourhood: 19456 of 21954
22031-112 copacabana - Get zipcode: 19460 of 21954
22725-740 jacarepaguá - Get neighbourhood: 19487 of 21954
22725-740 barra da tijuca - Get neighbourhood: 19493 of 21954
22725-740 guaratiba - Get neighbourhood: 19501 of 21954
22725-740 recreio dos bandeirantes - Get neighbourhood: 19503 of 21954
22031-112 copacabana - Get zipcode: 19505 of 21954
22725-740 copacabana - Get zipcode: 19522 of 21954
22725-740 copacabana - Get neighbourhood: 19522 of 21954
22031-112 copacabana - Get zipcode: 19524 of 21954
22725-740 curicica - Get neighbourhood: 19530 of 21954
22793-000 barra da tijuca - Get zipcode: 19531 of 21954
22211-200 catete - Get zipcode: 19535 of 21954


22773-170 cidade de deus - Get neighbourhood: 20528 of 21954
22725-740 cidade de deus - Get neighbourhood: 20529 of 21954
22725-740 barra da tijuca - Get neighbourhood: 20534 of 21954
22725-740 jardim guanabara - Get neighbourhood: 20541 of 21954
22610-290 são conrado - Get zipcode: 20551 of 21954
22290-000 botafogo - Get zipcode: 20555 of 21954
22430-220 leblon - Get zipcode: 20560 of 21954
22725-740 barra da tijuca - Get neighbourhood: 20561 of 21954
22725-740 ipanema - Get neighbourhood: 20585 of 21954
22221-000 glória - Get zipcode: 20610 of 21954
22725-740 santa cruz - Get neighbourhood: 20639 of 21954
22290-000 botafogo - Get zipcode: 20641 of 21954
22793-000 barra da tijuca - Get zipcode: 20643 of 21954
22031-112 copacabana - Get zipcode: 20663 of 21954
22770-235 pechincha - Get neighbourhood: 20691 of 21954
22725-740 vargem grande - Get neighbourhood: 20726 of 21954
22241-160 cosme velho - Get zipcode: 20742 of 21954
22245-040 laranjeiras - Get zipcode: 20743 of 21954
23020-240

Unnamed: 0,id,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,latitude,longitude,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month
0,29727,brux,within an hour,100.0,t,f,ipanema,22420-020,-22.98333,-43.20161,...,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,7,3.0
1,13336,rosane,ND,0.0,f,f,jacarepaguá,22760-000,-22.94213,-43.34465,...,0.0,0.0,0.0,0.0,0.0,0.0,f,flexible,1,0.0
2,22192,marco,within a few hours,100.0,f,f,ipanema,22081-000,-22.98651,-43.19221,...,10.0,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,3,0.08
3,25264,fatima,within an hour,100.0,t,t,copacabana,22031-112,-22.96407,-43.18687,...,0.0,0.0,0.0,0.0,0.0,0.0,f,strict_14_with_grace_period,5,0.0
4,23936,giovanni,within a few hours,87.0,f,f,leme,22010-070,-22.96326,-43.17065,...,0.0,0.0,0.0,0.0,0.0,0.0,f,strict_14_with_grace_period,37,0.26


In [57]:
# Exibe uma amostra dos dados
df_test.head()

Unnamed: 0,id,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,latitude,longitude,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month
0,4,patricia,within a few hours,100.0,t,t,ipanema,22081-020,-22.98816,-43.19359,...,10.0,9.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,1,2.26
1,6,seba,within a day,100.0,f,t,copacabana,22031-112,-22.96681,-43.18657,...,0.0,0.0,0.0,0.0,0.0,0.0,f,moderate,1,0.01
2,21,andrea,within an hour,85.0,f,f,copacabana,22070-011,-22.98152,-43.19018,...,10.0,10.0,10.0,10.0,9.0,9.0,t,strict_14_with_grace_period,6,0.37
3,22,diogo,within an hour,100.0,f,f,copacabana,22071-100,-22.98119,-43.19373,...,9.0,9.0,10.0,10.0,8.0,8.0,t,strict_14_with_grace_period,1,1.18
4,27,josé,within a day,100.0,f,f,copacabana,22031-070,-22.96925,-43.18283,...,10.0,8.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,4,0.24


### Expande colunas agregadas e faz outras transformações

In [58]:
# Processa colunas 'latitude' e 'longitude'
# Transforma colunas lat/lng em uma só coluna: "distância para o centro do RJ, em km"
df,df_test = process_latitude_longitude(df,df_test)
# Exibe uma amostra dos dados
df.head()

Unnamed: 0,id,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month,distance_center
0,29727,brux,within an hour,100.0,t,f,ipanema,22420-020,loft,entire home/apt,...,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,7,3.0,9.289355
1,13336,rosane,ND,0.0,f,f,jacarepaguá,22760-000,apartment,private room,...,0.0,0.0,0.0,0.0,0.0,f,flexible,1,0.0,16.826821
2,22192,marco,within a few hours,100.0,f,f,ipanema,22081-000,loft,entire home/apt,...,10.0,10.0,10.0,10.0,10.0,f,strict_14_with_grace_period,3,0.08,9.533453
3,25264,fatima,within an hour,100.0,t,t,copacabana,22031-112,house,private room,...,0.0,0.0,0.0,0.0,0.0,f,strict_14_with_grace_period,5,0.0,7.032639
4,23936,giovanni,within a few hours,87.0,f,f,leme,22010-070,apartment,entire home/apt,...,0.0,0.0,0.0,0.0,0.0,f,strict_14_with_grace_period,37,0.26,7.139576


In [59]:
# Exibe uma amostra dos dados
df_test.head()

Unnamed: 0,id,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month,distance_center
0,4,patricia,within a few hours,100.0,t,t,ipanema,22081-020,apartment,entire home/apt,...,9.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,1,2.26,9.724885
1,6,seba,within a day,100.0,f,t,copacabana,22031-112,apartment,private room,...,0.0,0.0,0.0,0.0,0.0,f,moderate,1,0.01,7.336142
2,21,andrea,within an hour,85.0,f,f,copacabana,22070-011,apartment,entire home/apt,...,10.0,10.0,10.0,9.0,9.0,t,strict_14_with_grace_period,6,0.37,8.971528
3,22,diogo,within an hour,100.0,f,f,copacabana,22071-100,apartment,entire home/apt,...,9.0,10.0,10.0,8.0,8.0,t,strict_14_with_grace_period,1,1.18,8.956224
4,27,josé,within a day,100.0,f,f,copacabana,22031-070,apartment,entire home/apt,...,8.0,10.0,10.0,10.0,9.0,f,strict_14_with_grace_period,4,0.24,7.617588


In [60]:
# Expande coluna "amenities"
# Transforma cada amenity em uma coluna categórica
df, df_test = expand_amenities(df, df_test)
# Exibe uma amostra dos dados
df.head()

Unnamed: 0,id,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,...,projector_and_screen,pillow_top_mattress,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view
0,29727,brux,within an hour,100.0,t,f,ipanema,22420-020,loft,entire home/apt,...,0,0,0,0,0,0,0,0,0,0
1,13336,rosane,ND,0.0,f,f,jacarepaguá,22760-000,apartment,private room,...,0,0,0,0,0,0,0,0,0,0
2,22192,marco,within a few hours,100.0,f,f,ipanema,22081-000,loft,entire home/apt,...,0,0,0,0,0,0,0,0,0,0
3,25264,fatima,within an hour,100.0,t,t,copacabana,22031-112,house,private room,...,0,0,0,0,0,0,0,0,0,0
4,23936,giovanni,within a few hours,87.0,f,f,leme,22010-070,apartment,entire home/apt,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Exibe uma amostra dos dados
df_test.head()

Unnamed: 0,id,host_name,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,...,projector_and_screen,pillow_top_mattress,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view
0,4,patricia,within a few hours,100.0,t,t,ipanema,22081-020,apartment,entire home/apt,...,0,0,0,0,0,0,0,0,0,0
1,6,seba,within a day,100.0,f,t,copacabana,22031-112,apartment,private room,...,0,0,0,0,0,0,0,0,0,0
2,21,andrea,within an hour,85.0,f,f,copacabana,22070-011,apartment,entire home/apt,...,0,0,0,0,0,0,0,0,0,0
3,22,diogo,within an hour,100.0,f,f,copacabana,22071-100,apartment,entire home/apt,...,0,0,0,0,0,0,0,0,0,0
4,27,josé,within a day,100.0,f,f,copacabana,22031-070,apartment,entire home/apt,...,0,0,0,0,0,0,0,0,0,0


### Remove atributos irrelevantes

In [62]:
# remove atributo "host_name"
df = drop_attr(df,['host_name'])
# Exibe uma amostra dos dados
df.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,...,projector_and_screen,pillow_top_mattress,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view
0,29727,within an hour,100.0,t,f,ipanema,22420-020,loft,entire home/apt,2,...,0,0,0,0,0,0,0,0,0,0
1,13336,ND,0.0,f,f,jacarepaguá,22760-000,apartment,private room,2,...,0,0,0,0,0,0,0,0,0,0
2,22192,within a few hours,100.0,f,f,ipanema,22081-000,loft,entire home/apt,3,...,0,0,0,0,0,0,0,0,0,0
3,25264,within an hour,100.0,t,t,copacabana,22031-112,house,private room,1,...,0,0,0,0,0,0,0,0,0,0
4,23936,within a few hours,87.0,f,f,leme,22010-070,apartment,entire home/apt,4,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# remove atributo "host_name"
df_test = drop_attr(df_test,['host_name'])
# Exibe uma amostra dos dados
df_test.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,...,projector_and_screen,pillow_top_mattress,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view
0,4,within a few hours,100.0,t,t,ipanema,22081-020,apartment,entire home/apt,3,...,0,0,0,0,0,0,0,0,0,0
1,6,within a day,100.0,f,t,copacabana,22031-112,apartment,private room,2,...,0,0,0,0,0,0,0,0,0,0
2,21,within an hour,85.0,f,f,copacabana,22070-011,apartment,entire home/apt,4,...,0,0,0,0,0,0,0,0,0,0
3,22,within an hour,100.0,f,f,copacabana,22071-100,apartment,entire home/apt,12,...,0,0,0,0,0,0,0,0,0,0
4,27,within a day,100.0,f,f,copacabana,22031-070,apartment,entire home/apt,5,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# remove linhas com 'ND' (outliers)
df = drop_nd_rows(df)
print('linhas restantes:',len(df.index))
df.head()

linhas restantes: 13551


Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,...,projector_and_screen,pillow_top_mattress,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view
0,29727,within an hour,100.0,t,f,ipanema,22420-020,loft,entire home/apt,2,...,0,0,0,0,0,0,0,0,0,0
2,22192,within a few hours,100.0,f,f,ipanema,22081-000,loft,entire home/apt,3,...,0,0,0,0,0,0,0,0,0,0
3,25264,within an hour,100.0,t,t,copacabana,22031-112,house,private room,1,...,0,0,0,0,0,0,0,0,0,0
4,23936,within a few hours,87.0,f,f,leme,22010-070,apartment,entire home/apt,4,...,0,0,0,0,0,0,0,0,0,0
5,25472,within a day,75.0,f,f,catete,22211-200,apartment,entire home/apt,4,...,0,0,0,0,0,0,0,0,0,0


### Converte atributos categóricos em quantitativos discretos

In [65]:
# converte os atributos categóricos em quantitativos discretos
df, df_test = cat_to_discrete_by_labelencode(df, df_test)
#df, df_test = cat_to_discrete_by_hash(df, df_test)
#df, df_test = cat_to_discrete_by_dummies(df, df_test)
# Exibe uma amostra dos dados
df.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,...,projector_and_screen,pillow_top_mattress,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view
0,29727,4,100.0,2,1,48,1292,25,0,2,...,0,0,0,0,0,0,0,0,0,0
2,22192,3,100.0,1,1,48,991,25,0,3,...,0,0,0,0,0,0,0,0,0,0
3,25264,4,100.0,2,2,22,894,21,2,1,...,0,0,0,0,0,0,0,0,0,0
4,23936,3,87.0,1,1,63,832,1,0,4,...,0,0,0,0,0,0,0,0,0,0
5,25472,2,75.0,1,1,17,1024,1,0,4,...,0,0,0,0,0,0,0,0,0,0


In [66]:
# Exibe uma amostra dos dados
df_test.head()

Unnamed: 0,id,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,...,projector_and_screen,pillow_top_mattress,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view
0,4,3,100.0,2,2,48,995,1,0,3,...,0,0,0,0,0,0,0,0,0,0
1,6,2,100.0,1,2,22,894,1,2,2,...,0,0,0,0,0,0,0,0,0,0
2,21,4,85.0,1,1,22,964,1,0,4,...,0,0,0,0,0,0,0,0,0,0
3,22,4,100.0,1,1,22,976,1,0,12,...,0,0,0,0,0,0,0,0,0,0
4,27,2,100.0,1,1,22,887,1,0,5,...,0,0,0,0,0,0,0,0,0,0


### Normaliza atributos

In [67]:
# escala/normaliza os atributos
df, df_test = normalize_by_minmax(df, df_test)
#df, df_test = normalize_by_robust(df, df_test)
#df, df_test = normalize_by_standard(df, df_test)
# Exibe uma amostra dos dados
df.head()

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,bathrooms,...,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view,price,id
0,1.0,1.0,1.0,0.5,0.390244,0.590764,0.757576,0.0,0.090909,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.0,29727
2,0.75,1.0,0.5,0.5,0.390244,0.453132,0.757576,0.0,0.181818,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,149.0,22192
3,1.0,1.0,1.0,1.0,0.178862,0.408779,0.636364,0.666667,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,25264
4,0.75,0.87,0.5,0.5,0.512195,0.38043,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,448.0,23936
5,0.5,0.75,0.5,0.5,0.138211,0.468221,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,378.0,25472


In [68]:
# Exibe uma amostra dos dados
df_test.head()

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,bathrooms,...,pillow_top_mattress,outdoor_seating,terrace,mudroom,sun_loungers,high_resolution_computer_monitor,amazon_echo,murphy_bed,beach_view,id
0,0.75,1.0,1.0,1.0,0.390244,0.454961,0.030303,0.0,0.181818,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1,0.5,1.0,0.5,1.0,0.178862,0.408779,0.030303,0.666667,0.090909,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
2,1.0,0.85,0.5,0.5,0.178862,0.440786,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21
3,1.0,1.0,0.5,0.5,0.178862,0.446273,0.030303,0.0,1.0,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
4,0.5,1.0,0.5,0.5,0.178862,0.405578,0.030303,0.0,0.363636,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27


# Escolhendo, treinando e testando um modelo preditivo

In [69]:
# Algoritmos de predicao
estimators = [
  {'est': XGBRegressor(), 
   'grid':{ 
      'random_state': [random_state],
      'nthread':[4],
      'objective':['reg:squarederror'],
      'learning_rate': [.03],
      #'learning_rate': [.03, 0.05, .07],
      'max_depth': [8],
      #'max_depth': [5, 6, 7],
      'min_child_weight': [4],
      'subsample': [.7],
      'colsample_bytree': [.7],
      'n_estimators': [800],
   },
   'est_name':'', 'rmse':0.0, 'params':{}},
  {'est': LGBMRegressor(), 
   'grid':{ 
      'random_state': [random_state],
      'learning_rate': [.03],
      'num_leaves': [64],
      'objective': ['binary'],
      'n_estimators': [800],
   },
   'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': GradientBoostingRegressor(), 
  # 'grid':{ 
  #    'random_state': [random_state],
  #    'loss': ['ls'],
  #    #'loss': ['ls', 'lad', 'huber', 'quantile'],
  #    'n_estimators': [120],
  #    #'n_estimators': range(20,81,10),
  #    'max_depth': [11], 
  #    #'max_depth':range(5,16,2), 
  #    'min_samples_split': [200], 
  #    #'min_samples_split':range(200,1001,200),
  #    'min_samples_leaf': [90],
  #    #'min_samples_leaf':range(30,71,10),
  # },
  # 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': HistGradientBoostingRegressor(), 
  # 'grid':{ 
  #    'random_state': [random_state],
  #    'max_depth': [15], 
  #    #'max_depth':range(5,16,2), 
  #    'min_samples_leaf': [60],
  #    #'min_samples_leaf':range(30,71,10),
  # },
  # 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': MLPRegressor(), 
  # 'grid':{ 
  #    'random_state': [random_state],
  #    'hidden_layer_sizes': [(50,100,50)],
  #    #'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,1)],
  #    'activation': ['relu'],
  #    #'activation': ['relu','tanh','logistic'],
  #    'alpha': [0.0001],
  #    #'alpha': [0.0001, 0.05],
  #    'learning_rate': ['constant'],
  #    #'learning_rate': ['constant','adaptive'],
  #    'solver': ['adam'],
  #    'max_iter': [100],
  # },
  # 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': LinearRegression(), 'grid':{}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': RandomForestRegressor(), 'grid':{'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': BaggingRegressor(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': AdaBoostRegressor(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': DecisionTreeRegressor(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': ARDRegression(), 'grid':{}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': BayesianRidge(), 'grid':{}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': CCA(), 'grid':{}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': ElasticNet(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': ExtraTreeRegressor(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': ExtraTreesRegressor(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': GammaRegressor(), 'grid':{}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': HuberRegressor(), 'grid':{}, 'est_name':'', 'rmse':0.0},
  #{'est': KNeighborsRegressor(), 'grid':{}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': KernelRidge(), 'grid':{}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': Lasso(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': Lars(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': LassoLars(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': LassoLarsIC(), 'grid':{}, 'est_name':'', 'rmse':0.0, 'params':{}},
  #{'est': LinearSVR(), 'grid':{ 'random_state': [random_state],}, 'est_name':'', 'rmse':0.0, 'params':{}},
]

In [70]:
X_train = df.drop(['id','price'],axis=1)
y_train = df['price'].tolist()
# Treina/testa estimador
for estimator in estimators:
    estimator['est'], estimator['est_name'], estimator['rmse'], estimator['params'] = build(X_train,y_train,estimator['est'],estimator['grid'])


************************
Testando o estimador XGBRegressor ...
************************
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(197,), dtype=object).
Pickling array (shape=(13551,), dtype=int64).
Memmapping (shape=(197, 13551), dtype=float64) to new file /dev/shm/joblib_memmapping_folder_37_2178951484/37-140054798565072-90677d2833b447d8ae4edae8054268a5.pkl
Pickling array (shape=(197,), dtype=object).
Pickling array (shape=(10840,), dtype=int64).
Pickling array (shape=(2711,), dtype=int64).
Pickling array (shape=(197,), dtype=object).
Pickling array (shape=(13551,), dtype=int64).
Memmapping (shape=(197, 13551), dtype=float64) to old file /dev/shm/joblib_memmapping_folder_37_2178951484/37-140054798565072-90677d2833b447d8ae4edae8054268a5.pkl
Pickling array (shape=(197,), dtype=object).
Pickling array (shape=(10841,), dtype=int64).
Pickling array (shape=(2710,), dtype=int64

In [71]:
# Seleção do melhor estimador
def get_rmse(estimator):
    return estimator.get('rmse')

estimators.sort(key=get_rmse, reverse=True)

for estimator in estimators:
    print(estimator['est_name'],'- RMSE:',estimator['rmse'],'- Params:',estimator['params'])

# escolhe melhor estimador
regr = estimators[0]['est']
print('\nSelecionado: ',estimators[0]['est_name'])

XGBRegressor - RMSE: -72.30764520994543 - Params: {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 8, 'min_child_weight': 4, 'n_estimators': 800, 'nthread': 4, 'objective': 'reg:squarederror', 'random_state': 2020, 'subsample': 0.7}
LGBMRegressor - RMSE: -230.43653742390015 - Params: {'learning_rate': 0.03, 'n_estimators': 800, 'num_leaves': 64, 'objective': 'binary', 'random_state': 2020}

Selecionado:  XGBRegressor


### Reduz dimensionalidade

In [72]:
# reduz a dimensionalidade (remove atributos de baixa correlação)
df, df_test, df_importances = reduce_dimension(regr, df, df_test, 0.001)
# Exibe uma amostra dos dados
df.head()

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,bathrooms,...,bath_towel,hot_water_kettle,smart_tv,netflix,full_kitchen,bedroom_comforts,ski_in_ski_out,kitchenette,price,id
0,1.0,1.0,1.0,0.5,0.390244,0.590764,0.757576,0.0,0.090909,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.0,29727
2,0.75,1.0,0.5,0.5,0.390244,0.453132,0.757576,0.0,0.181818,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,149.0,22192
3,1.0,1.0,1.0,1.0,0.178862,0.408779,0.636364,0.666667,0.0,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,25264
4,0.75,0.87,0.5,0.5,0.512195,0.38043,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,448.0,23936
5,0.5,0.75,0.5,0.5,0.138211,0.468221,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,378.0,25472


In [73]:
# Exibe uma amostra dos dados
df_test.head()

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,bathrooms,...,body_soap,bath_towel,hot_water_kettle,smart_tv,netflix,full_kitchen,bedroom_comforts,ski_in_ski_out,kitchenette,id
0,0.75,1.0,1.0,1.0,0.390244,0.454961,0.030303,0.0,0.181818,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1,0.5,1.0,0.5,1.0,0.178862,0.408779,0.030303,0.666667,0.090909,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
2,1.0,0.85,0.5,0.5,0.178862,0.440786,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21
3,1.0,1.0,0.5,0.5,0.178862,0.446273,0.030303,0.0,1.0,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22
4,0.5,1.0,0.5,0.5,0.178862,0.405578,0.030303,0.0,0.363636,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27


# Treinando, testando e avaliando o modelo
RMSE: Raiz do erro médio quadrático - É uma métrica que visa medir a diferença, ou erro, entre os valores previstos e os realizados. Quanto menor o erro, melhor é o modelo.

In [74]:
# Treina o modelo com a massa de treino
# Avalia a performance do modelo treinado
X_train = df.drop(['id','price'],axis=1)
y_train = df['price'].tolist()

rmse_scorer = make_scorer(mean_squared_error)

scores = cross_validate(regr, X_train, y_train, cv=5, n_jobs=-1, scoring=rmse_scorer, verbose=100)
print('RMSE:',round(sqrt(scores['test_score'].mean()),6))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(155,), dtype=object).
Pickling array (shape=(13551,), dtype=int64).
Memmapping (shape=(155, 13551), dtype=float64) to new file /dev/shm/joblib_memmapping_folder_37_2178951484/37-140054798565072-63d148d27e8c43c8b93ebb15551f8e71.pkl
Pickling array (shape=(155,), dtype=object).
Pickling array (shape=(10840,), dtype=int64).
Pickling array (shape=(2711,), dtype=int64).
Pickling array (shape=(155,), dtype=object).
Pickling array (shape=(13551,), dtype=int64).
Memmapping (shape=(155, 13551), dtype=float64) to old file /dev/shm/joblib_memmapping_folder_37_2178951484/37-140054798565072-63d148d27e8c43c8b93ebb15551f8e71.pkl
Pickling array (shape=(155,), dtype=object).
Pickling array (shape=(10841,), dtype=int64).
Pickling array (shape=(2710,), dtype=int64).
Pickling array (shape=(155,), dtype=object).
Pickling array (shape=(13551,), dtype=int64).
Memmapping (shape=(155, 13551), dtype=float64) to old

# Analisando os registros não rotulados para o desafio

In [75]:
# Treina o estimador com toda a base fornecida
X_train = df.drop(['id','price'],axis=1)
y_train = df['price'].tolist()
regr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=8,
             min_child_weight=4, missing=nan, monotone_constraints='()',
             n_estimators=800, n_jobs=4, nthread=4, num_parallel_tree=1,
             random_state=2020, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.7, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [76]:
# Prepara os dados para regressão
if 'price' in df_test.columns:
    df_test.drop(['price'],axis=1,inplace=True)

X_test = df_test.drop(['id'],axis=1)
# Exibe uma amostra dos dados
X_test.head()

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,bathrooms,...,bathtub_with_bath_chair,body_soap,bath_towel,hot_water_kettle,smart_tv,netflix,full_kitchen,bedroom_comforts,ski_in_ski_out,kitchenette
0,0.75,1.0,1.0,1.0,0.390244,0.454961,0.030303,0.0,0.181818,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.5,1.0,0.5,1.0,0.178862,0.408779,0.030303,0.666667,0.090909,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.85,0.5,0.5,0.178862,0.440786,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.5,0.5,0.178862,0.446273,0.030303,0.0,1.0,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.5,1.0,0.5,0.5,0.178862,0.405578,0.030303,0.0,0.363636,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Executa a predição dos registros não rotulados
y_pred = regr.predict(X_test)
df_test['price'] = y_pred
df_test['price'] = df_test['price'].round(decimals=2)

# Exibe uma amostra dos resultados
df_test.head(10)

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_identity_verified,neighbourhood,zipcode,property_type,room_type,accommodates,bathrooms,...,bath_towel,hot_water_kettle,smart_tv,netflix,full_kitchen,bedroom_comforts,ski_in_ski_out,kitchenette,id,price
0,0.75,1.0,1.0,1.0,0.390244,0.454961,0.030303,0.0,0.181818,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,211.880005
1,0.5,1.0,0.5,1.0,0.178862,0.408779,0.030303,0.666667,0.090909,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,185.059998
2,1.0,0.85,0.5,0.5,0.178862,0.440786,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,158.229996
3,1.0,1.0,0.5,0.5,0.178862,0.446273,0.030303,0.0,1.0,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22,357.320007
4,0.5,1.0,0.5,0.5,0.178862,0.405578,0.030303,0.0,0.363636,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27,275.48999
5,1.0,1.0,0.5,1.0,0.03252,0.919524,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43,297.869995
6,1.0,1.0,0.5,1.0,0.504065,0.604481,0.030303,0.0,0.272727,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46,243.419998
7,0.75,1.0,0.5,0.5,0.178862,0.410151,0.393939,0.0,0.454545,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49,328.309998
8,0.75,0.9,0.5,0.5,0.829268,0.067215,0.636364,0.666667,0.090909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55,60.400002
9,1.0,1.0,0.5,1.0,0.357724,0.536808,0.636364,0.666667,0.090909,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74,169.020004


# Submetendo os resultados à página do desafio
O arquivo de resultados deve ser submetido na plataforma de avalição do desafio. Assim que avaliado, a pontuação será exibida no leaderboard.

In [78]:
# Salva os registros
filename = nome_arquivo_rotulado_regressor.format((datetime.now() - timedelta(hours=3)).strftime('%Y-%m-%d_%H-%M-%S'))
print(filename)
df_test.to_csv(filename, index=False, sep=",", encoding="utf-8", columns=['id','price'])

../working/submissao-equipe_2020-10-27_13-49-08.csv
