In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_percentage_error
import seaborn as sns
import matplotlib as plt
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
#from yellowbrick.contrib.wrapper import wrap
#from yellowbrick.regressor import ResidualsPlot

In [34]:
df = pd.read_csv("intermediate_data.csv")

In [35]:
numerical_features = ["area", "rooms", "bathrooms", "garages"]
categorical_features = ["bairro"]
binary_features = [
    'Academia', 'Aceita animais', 'Acesso para deficientes', 'Aquecimento',
    'Ar-condicionado', 'Armário embutido', 'Armário embutido no quarto',
    'Armário na cozinha', 'Armário no banheiro', 'Bicicletário',
    'Box blindex', 'Cabeamento estruturado', 'Churrasqueira', 'Cinema',
    'Circuito de segurança', 'Closet', 'Condomínio fechado',
    'Conexão à internet', 'Copa', 'Coworking', 'Cozinha',
    'Cozinha americana', 'Cozinha grande', 'Câmera de segurança',
    'Depósito', 'Elevador', 'Escritório', 'Espaço gourmet', 'Espaço teen',
    'Espaço verde / Parque', "Espelhos d'água", 'Garage band', 'Garagem',
    'Gerador elétrico', 'Gramado', 'Guarita', 'Gás Encanado', 'Interfone',
    'Janela grande', 'Jardim', 'Lareira', 'Lavabo', 'Lavanderia',
    'Mais de um andar', 'Mobiliado', 'Móvel planejado',
    'Perto de vias de acesso', 'Piscina', 'Piscina coberta',
    'Piscina infantil', 'Piscina para adulto', 'Piso frio',
    'Pista de cooper', 'Playground', 'Portaria 24h', 'Portão eletrônico',
    'Próximo a escola', 'Próximo a hospitais', 'Próximo a shopping',
    'Próximo a transporte público', 'Quadra de squash', 'Quadra de tênis',
    'Quadra poliesportiva', 'Quintal', 'Recepção', 'Sala de jantar',
    'Sala de massagem', 'Salão de festas', 'Salão de jogos', 'Sauna',
    'Segurança 24h', 'Serviços pay per use', 'Serviços públicos essenciais',
    'Sistema de alarme', 'Spa', 'TV a cabo', 'Varanda',
    'Varanda fechada com vidro', 'Varanda gourmet', 'Vigia',
    'Vista para a montanha', 'Vista para o mar', 'Área de lazer',
    'Área de serviço']
target = "price"

In [36]:
df.isnull().sum()[:10]

id              0
url             0
header          0
address         0
area          615
rooms          17
bathrooms      13
garages        33
price         628
condo        2846
dtype: int64

In [37]:
df = df.dropna(subset=['area','rooms','bathrooms','garages','price'])

# Data pipeline

In [43]:
scaler =  MinMaxScaler()
data_pipeline = ColumnTransformer([('numerical', scaler, numerical_features)], remainder ='passthrough')

In [44]:
X = df[numerical_features+categorical_features+binary_features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=12345)

In [45]:
data_pipeline.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['area', 'rooms', 'bathrooms', 'garages'])])

In [46]:
X_train_transformed = data_pipeline.transform(X_train)
X_test_transformed = data_pipeline.transform(X_test)

In [47]:
X_train_transformed = pd.DataFrame(X_train_transformed, columns=numerical_features+categorical_features+binary_features)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=numerical_features+categorical_features+binary_features)

# feature selection

In [48]:
selector = SelectKBest(score_func=mutual_info_regression, k="all")
selector.fit(X_train_transformed[numerical_features+binary_features], y_train)

SelectKBest(k='all',
            score_func=<function mutual_info_regression at 0x00000262176E5670>)

In [49]:
pd.DataFrame(zip(numerical_features + binary_features, selector.scores_), columns=["feature", "score"]).sort_values("score", ascending=False).head(10)

Unnamed: 0,feature,score
0,area,0.971709
2,bathrooms,0.469381
3,garages,0.375392
1,rooms,0.270295
82,Varanda gourmet,0.074091
51,Piscina,0.053435
4,Academia,0.042148
80,Varanda,0.040939
16,Churrasqueira,0.032624
71,Salão de festas,0.028308


# fitting model

### linear regression

In [50]:
scaler = MinMaxScaler()
ohe = OneHotEncoder()
data_pipeline = ColumnTransformer([("numerical", scaler, numerical_features), 
                                   ("categorical", ohe, categorical_features)], 
                                  remainder="passthrough")

In [51]:
X = df[numerical_features + categorical_features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=12345)

In [52]:
data_pipeline.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['area', 'rooms', 'bathrooms', 'garages']),
                                ('categorical', OneHotEncoder(), ['bairro'])])

In [55]:
X_train_transformed = data_pipeline.transform(X_train)
X_test_transformed = data_pipeline.transform(X_test)

In [56]:
model = LinearRegression()

In [57]:
model.fit(X_train_transformed, y_train)

LinearRegression()

In [68]:
y_pred = model.predict(X_test_transformed)
y_pred_train = model.predict(X_train_transformed)
print([f'{y_pred},n/ {y_pred_train}'])

['[ 201124.89892687  690691.94908666  259868.51347523 ...  202994.70410066\n  583491.34436015 1144484.84841188],n/ [1479257.78191631 1505946.52199466  473358.10779176 ...  410994.26327856\n 1340838.7538735  2418176.99850013]']


In [60]:
mape_lr = mean_absolute_percentage_error(y_test, y_pred)
print(mape_lr)

0.2924482745440746


In [71]:
r2_lr_train = r2_score(y_train, y_pred_train)
print(r2_lr_train)

0.8225902232840017


In [70]:
visualizer = ResidualsPlot(model)

visualizer.fit(X_train_transformed, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test_transformed, y_test)  # Evaluate the model on the test data
visualizer.show()    

NameError: name 'ResidualsPlot' is not defined

# Random forest

In [72]:
model = RandomForestRegressor(max_depth=15)
model.fit(X_train_transformed, y_train)

RandomForestRegressor(max_depth=15)

In [74]:
y_pred = model.predict(X_test_transformed)
y_pred_train = model.predict(X_train_transformed)
print(y_pred,y_pred_train)

[ 359481.7566686   810144.43646632  322818.91191697 ...  302852.50149301
  493238.90703071 1199535.86833333] [1248444.38947099 1531098.2163306   572195.63222026 ...  504545.68744329
 1387866.74703244 2269938.41442059]


In [76]:
mape_rf = mean_absolute_percentage_error(y_test, y_pred)
print(mape_rf)

0.16382071534001633


In [79]:
r2_rf = mean_absolute_percentage_error(y_train,y_pred_train)
print(r2_rf )

0.1304705798090945


In [80]:
visualizer = ResidualsPlot(model)

visualizer.fit(X_train_transformed, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test_transformed, y_test)  # Evaluate the model on the test data
visualizer.show() 

NameError: name 'ResidualsPlot' is not defined

In [81]:
model = XGBRegressor(max_depth=7)
model.fit(X_train_transformed, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [88]:
y_pred = model.predict(X_test_transformed)
y_pred_train = model.predict(X_train_transformed)
print(y_pred,y_pred_train)

[ 352234.4   763421.25  302376.06 ...  161857.62  532844.2  1181990.4 ] [1235208.6 1388302.1  569529.5 ...  508488.1 1299896.2 2282305.5]


In [89]:
mape_xgboost = mean_absolute_percentage_error(y_test, y_pred)
print(mape_xgboost)

0.16052885551231624


In [90]:
r2_xgboost = r2_score(y_test, y_pred)
print(r2_xgboost)

0.8918332816135608


In [91]:
mape_xgboost_train = mean_absolute_percentage_error(y_train, y_pred_train)
print(mape_xgboost_train)

0.12955850015940604


In [92]:
r2_xgboost_train = r2_score(y_train, y_pred_train)
print(r2_xgboost_train)

0.9701074345337902


In [87]:
visualizer = ResidualsPlot(model)

visualizer.fit(X_train_transformed, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test_transformed, y_test)  # Evaluate the model on the test data
visualizer.show()         

NameError: name 'ResidualsPlot' is not defined

# Catboost

In [93]:
scaler = MinMaxScaler()
data_pipeline = ColumnTransformer([("numerical", scaler, numerical_features)], 
                                  remainder="passthrough")

In [94]:
X = df[numerical_features + categorical_features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=12345)

In [95]:
data_pipeline.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['area', 'rooms', 'bathrooms', 'garages'])])

In [96]:
X_train_transformed = data_pipeline.transform(X_train)
X_test_transformed = data_pipeline.transform(X_test)

In [97]:
X_train_transformed = pd.DataFrame(X_train_transformed, columns=numerical_features+categorical_features)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=numerical_features+categorical_features)

In [98]:
model = CatBoostRegressor(cat_features=["bairro"])
model.fit(X_train_transformed, y_train, eval_set=(X_test_transformed, y_test), verbose=False)

<catboost.core.CatBoostRegressor at 0x26217eea340>

In [100]:
y_pred = model.predict(X_test_transformed)
y_pred_train = model.predict(X_train_transformed)
print(y_pred,y_pred_train)


[ 336908.88165439  745303.20460689  309443.47786616 ...  236175.43409446
  534899.34507176 1190655.23259412] [1301636.09633449 1357709.14001602  564242.45062552 ...  454888.68250474
 1356773.13540389 2337052.21161221]


In [101]:
mape_catboost = mean_absolute_percentage_error(y_test, y_pred)
print(y_test, y_pred)

6356      406205.0
6583      766217.0
3602      320000.0
8859      470000.0
13515     199000.0
           ...    
10995     524000.0
8926      725000.0
14333     175000.0
3783      850000.0
12114    1364570.0
Name: price, Length: 2866, dtype: float64 [ 336908.88165439  745303.20460689  309443.47786616 ...  236175.43409446
  534899.34507176 1190655.23259412]


In [102]:
r2_catboost = r2_score(y_test, y_pred)
print(r2_catboost)

0.8818643861453445


In [103]:
mape_catboost_train = mean_absolute_percentage_error(y_train, y_pred_train)
print(mape_catboost_train)

0.1710344853779387


In [104]:
r2_catboost_train = r2_score(y_train, y_pred_train)
print(r2_catboost_train)


0.9318915188703558


In [105]:
wrapped_model = wrap(model)
visualizer = ResidualsPlot(wrapped_model)

visualizer.fit(X_train_transformed, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test_transformed, y_test)  # Evaluate the model on the test data
visualizer.show()         

NameError: name 'wrap' is not defined