# In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Load the pipe-delimited listings sample.
df = pd.read_csv("./data/amostra_sp.txt", sep="|")

# Feature columns, grouped by how they are preprocessed before modelling.
cat_cols = [
    "shp_municipio",
    "shp_bairro_distrito",
    "shp_microarea",
    "tipo_imovel",
]
num_cols = [
    "dormitorios",
    "suites",
    "banheiros",
    "vagas",
    "salas",
]
target = "preco_por_m2"
cols_to_keep = [*cat_cols, *num_cols, target]

# Keep only the São Paulo rows. The explicit copy makes the column assignment
# below write to an independent frame instead of a filtered slice, which is
# the pattern that triggers pandas' SettingWithCopyWarning.
df = df.loc[df["shp_municipio"] == "São Paulo"].copy()

# Target: `preco_imovel_mediana` divided by `area_util`. A zero denominator
# yields +inf or -inf; both are mapped to NaN so that the dropna below
# discards those rows instead of feeding infinities to the model.
df[target] = (df["preco_imovel_mediana"] / df["area_util"]).replace(
    [np.inf, -np.inf], np.nan
)

# Drop every row missing a feature or the target.
df = df.dropna(subset=cols_to_keep)

# One modelling frame per transaction type, restricted to the kept columns.
df_venda = df.loc[df["tipo_transacao"] == "VENDA", cols_to_keep]
df_aluguel = df.loc[df["tipo_transacao"] == "ALUGUEL", cols_to_keep]
del df  # the full frame is no longer needed once the two subsets exist


def prepare_data_to_models(df, perc_test=0.3, random_state=None):
    """Split a listings frame into train/test sets and preprocess the features.

    The feature columns are taken from the module-level ``num_cols`` and
    ``cat_cols`` lists and the label from the module-level ``target`` name.
    Numerical columns are standardized and categorical columns are one-hot
    encoded. The preprocessing is fitted on the training split only and then
    applied to the test split.

    Args:
        df: DataFrame containing every column in ``num_cols``, ``cat_cols``
            and the ``target`` column.
        perc_test: Value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.3.
        random_state: Seed forwarded to ``train_test_split``; pass an int to
            make the split reproducible. Defaults to ``None``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, data_pipeline)`` where
        ``X_train`` and ``X_test`` are the transformed feature matrices,
        ``y_train`` and ``y_test`` are the matching target values, and
        ``data_pipeline`` is the fitted ``ColumnTransformer``.
    """
    X = df[[*num_cols, *cat_cols]]
    y = df[target]

    # `perc_test` used to be assigned further down in this function, after
    # this call, so every invocation raised UnboundLocalError. It is now a
    # parameter with the same 0.3 value as its default.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=perc_test, random_state=random_state
    )

    num_pipeline = Pipeline([("std_scaler", StandardScaler())])

    data_pipeline = ColumnTransformer(
        [
            ("numerical", num_pipeline, num_cols),
            # Categories that only appear in the test split are ignored by
            # the encoder instead of raising at transform time.
            ("categorical", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ]
    )

    # Fit on the training data only so that no test-set statistics leak into
    # the scaler or the encoder.
    X_train = data_pipeline.fit_transform(X_train)
    X_test = data_pipeline.transform(X_test)
    return (X_train, X_test, y_train, y_test, data_pipeline)


# Train/test split and preprocessing for the sale ("VENDA") listings.
X_train_venda, X_test_venda, y_train_venda, y_test_venda, data_pipeline_venda = (
    prepare_data_to_models(df_venda)
)

# Same preparation for the rental ("ALUGUEL") listings.
X_train_aluguel, X_test_aluguel, y_train_aluguel, y_test_aluguel, data_pipeline_aluguel = (
    prepare_data_to_models(df_aluguel)
)
# NOTE: the sale-model experiment below is disabled; it mirrors the rental
# model that follows.
# xgb_venda = XGBRegressor(
#     n_estimators = round(len(y_train_venda)/10)
# )
# xgb_venda.fit(X_train_venda, y_train_venda)
# y_pred_venda = xgb_venda.predict(X_test_venda)
# MAPE(y_test_venda, y_pred_venda)

# Rental model: the number of trees is one tenth of the training-set size
# (rounded). The regressor is built and fitted in a single expression.
xgb_aluguel = XGBRegressor(
    n_estimators=round(len(y_train_aluguel) / 10),
).fit(X_train_aluguel, y_train_aluguel)

# Predict on the held-out rental rows and compute the mean absolute
# percentage error against the true values.
y_pred_aluguel = xgb_aluguel.predict(X_test_aluguel)
MAPE(y_true=y_test_aluguel, y_pred=y_pred_aluguel)