<p style = "font-size:40px; font-family:Space Mono ; font-weight : normal; background-color: #06093f; color :white   ; text-align: center; border-radius: 5px 5px; padding: 5px"> Exploratory Data Analysis: Housing Prices in Goiania-Goias</p>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "darkgrid" theme to every figure drawn after this point.
sns.set_theme(style="darkgrid")

In [None]:
# Read the listings CSV into `df` and print its column/dtype/non-null summary.
DATA_PATH = '/kaggle/input/imoveis-goiniago/2021-08-05-all.csv'
df = pd.read_csv(DATA_PATH)
df.info()

# **1. Preprocessing Data**

In [None]:
def convert_money(txt):
    """Parse a Brazilian-formatted currency string into a float.

    Leading/trailing 'R', '$' and space characters are stripped, every '.'
    (thousands separator) is removed and the decimal ',' becomes '.', e.g.
    'R$ 1.234,56' -> 1234.56.

    Parameters
    ----------
    txt : str
        Currency text such as 'R$ 350' or 'R$ 1.234,56'.

    Returns
    -------
    float
        The numeric value.

    Raises
    ------
    ValueError
        If the cleaned text is not a valid float literal.
    """
    cleaned = txt.strip('R$ ')
    cleaned = cleaned.replace('.', '')
    cleaned = cleaned.replace(',', '.')
    return float(cleaned)

# Listings whose price is the text 'Sob consulta' ("on request") have no numeric
# price, so they are removed. `.copy()` detaches the filtered frame so that the
# column assignments below write to an independent frame instead of a slice of
# the original (which triggers SettingWithCopyWarning).
df = df[df['PRICE'] != 'Sob consulta'].copy()

# Currency columns: text -> float.
df['PRICE'] = df['PRICE'].apply(convert_money)
# A missing condominium fee is recorded as 0.0 ...
df['CONDOMÍNIO'] = df['CONDOMÍNIO'].apply(
    lambda fee: convert_money(fee) if pd.notnull(fee) else 0.0)
# ... whereas a missing IPTU stays NaN so it can be imputed later.
df['IPTU'] = df['IPTU'].apply(
    lambda tax: convert_money(tax) if pd.notnull(tax) else np.nan)

# Count columns: anything non-numeric becomes <NA>; the nullable 'Int64' dtype
# keeps the remaining values as integers.
for column in ['BEDROOMS', 'PARKING-SPACES', 'BATHROOMS']:
    df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')

# Rows without an address or an area cannot be used.
df = df.dropna(subset=['ADDRESS', 'AREAS'])

# AREAS holds either one value or a ' - '-separated range, each suffixed with
# ' m²'; strip the unit, parse the integers and keep their mean.
df['AREAS'] = (
    df['AREAS']
    .str.replace(' m²', '', regex=False)
    .str.split(' - ')
    .apply(lambda parts: np.mean([int(part) for part in parts]))
)

## Missing Values

In [None]:
# Number of missing entries per column, largest first (shown as the cell output).
print('Resume Missing Values')
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

* ### Filling missing values: *fillna*

In [None]:
# Remove listings with IPTU >= 75000 (presumably data-entry outliers — confirm
# the threshold against the data). Rows with a missing IPTU are kept.
df = df.drop(index=df.index[df['IPTU'] >= 75000])

# Fill missing IPTU values by interpolating between neighbouring rows, in both
# directions so leading/trailing gaps are filled too. The result is assigned
# back to the column: an `inplace=True` call on `df.IPTU` operates on an
# intermediate Series, which warns on pandas 2.x and does not update `df`
# under Copy-on-Write.
df['IPTU'] = df['IPTU'].interpolate(limit_direction='both')

# IPTU values below 10 are replaced with the median IPTU of the same TIPO.
# The mask is computed once; each TIPO's rows are only rewritten in that
# TIPO's own iteration, so the medians see the pre-replacement values.
low_iptu = df['IPTU'] < 10
for tipo in df.loc[low_iptu, 'TIPO'].unique():
    same_type = df['TIPO'] == tipo
    df.loc[low_iptu & same_type, 'IPTU'] = df.loc[same_type, 'IPTU'].median()

# Default bedroom count used when BEDROOMS is missing, per property type.
DEFAULT_BEDROOMS = {
    'fazendas-sitios-chacaras': 1,
    'apartamentos': 3,
    'casas': 3,
    'quitinetes': 1,
    'terrenos-lotes-condominios': 1,
}
for tipo, bedrooms in DEFAULT_BEDROOMS.items():
    missing = df['BEDROOMS'].isna() & (df['TIPO'] == tipo)
    df.loc[missing, 'BEDROOMS'] = bedrooms

* ### Filling missing values: *sklearn Imputer*

In [None]:
from sklearn.impute import KNNImputer

# Impute BATHROOMS and PARKING-SPACES one column at a time.
# Each column is cast explicitly to float64 so that missing entries become
# np.nan, which is the missing-value marker KNNImputer looks for by default;
# this replaces the former `replace({np.nan: np.nan})` trick, whose effect on
# the nullable 'Int64' dtype is not obvious and depends on the pandas version.
# NOTE(review): each column is imputed in isolation, so there are no other
# features to measure neighbour distance on; per the scikit-learn docs the
# imputer then falls back to the column mean. Confirm whether a multi-column
# imputation was intended.
imputer = KNNImputer(n_neighbors=2)
for c in ['BATHROOMS', 'PARKING-SPACES']:
    column_values = df[c].astype('float64').to_numpy().reshape(-1, 1)
    imputed = imputer.fit_transform(column_values)
    # Counts must be whole numbers, so round the imputed values.
    df[c] = np.round(imputed, 0).ravel()

In [None]:
# Re-check the per-column missing counts after the fills above, largest first.
print('Resume Missing Values')
remaining_missing = df.isna().sum()
remaining_missing.sort_values(ascending=False)

# **2. Exploratory Data Analysis (EDA)**

In [None]:
# Bar chart with the number of listings for each TIPO value.
type_counts = df['TIPO'].value_counts()

fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(10, 3), dpi=102)
type_counts.plot.bar(ax=ax1, color='#BA181B')
ax1.set_title('TIPO: counts', fontsize=10)
ax1.tick_params(labelsize=8)
# Tilt the category labels so they do not overlap.
plt.setp(
    ax1.get_xticklabels(),
    rotation=45,
    ha="right",
    fontsize=8,
    rotation_mode="anchor",
);

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5), dpi=60)

# Left panel: correlation heatmap. Only numeric columns are correlated;
# calling `df.corr()` on a frame that still holds string columns raises on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(), annot=True, cmap='Reds', ax=ax1)
ax1.set_title('Correlations', fontsize=16)

# Right panel: histogram (with KDE) of log1p(PRICE).
sns.histplot(np.log1p(df.PRICE), kde=True, color='#660708', ax=ax2)
ax2.set_title('Distributions log(PRICE)', fontsize=16);

In [None]:
RED = '#BA181B'
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 8), dpi=300)

# Mean PRICE / IPTU / CONDOMÍNIO per TIPO, computed once. The three columns
# are selected *before* aggregating: `groupby().mean()` over every column
# (including text ones such as the address) raises on pandas >= 2.0 and did
# needless work three times over.
cost_columns = ['PRICE', 'IPTU', 'CONDOMÍNIO']
mean_by_type = df.groupby('TIPO')[cost_columns].mean()

# One horizontal bar chart per column, all formatted identically.
for ax, column in zip((ax1, ax2, ax3), cost_columns):
    mean_by_type[column].sort_values().plot(kind='barh', ax=ax, color=RED)
    ax.xaxis.set_major_formatter('${x:1,.0f}')
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    ax.set_ylabel(None)
    ax.set_title(f'{column} (mean) $', fontsize=18)
    ax.tick_params(labelsize=16)
    # Heavy vertical line at x = 0 as the baseline of the bars.
    ax.axvline(linewidth=3.5, color="black")

# Blank text anchors just outside the axes area, kept from the original layout.
fig.text(-0.02, 1, ' ')
fig.text(1.10, -0.05, ' ')
fig.tight_layout()

# **3. Feature Engineering**

In [None]:
# One-hot encode the property type.
one_hot = pd.get_dummies(df['TIPO'])

# Feature matrix: drop the raw categorical/text columns AND the target.
# PRICE must not stay in X — y below is log1p(PRICE), so leaving it in would
# leak the answer to every model and make all validation scores meaningless.
X = df.drop(columns=['TIPO', 'DATE', 'ADDRESS', 'PRICE'])
X = X.join(one_hot)

# Target: log1p-transformed price.
y = np.log1p(df.PRICE)

# **4. Model validation**

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import QuantileTransformer

# LassoCV
# max_iter is an iteration count and must be an int: the former float literal
# 1e4 is rejected by scikit-learn's parameter validation (>= 1.2).
lasso_model = LassoCV(n_alphas=150, max_iter=10000, random_state=1)
# SVR
rbf_model = SVR(kernel='rbf', C=21, epsilon=0.0099, gamma=0.00017, tol=0.000121)

# Hist
# NOTE(review): 'least_absolute_deviation' was renamed 'absolute_error' in
# scikit-learn 1.0 and the old name was later removed; it is kept here because
# the notebook's imports target the older API — update both together when
# upgrading scikit-learn.
hist_model = HistGradientBoostingRegressor(min_samples_leaf=40, max_depth=5, 
                                           max_iter=1000, learning_rate=0.15,
                                           loss='least_absolute_deviation', 
                                           random_state=1)
# lightgbm
lgbm_model = LGBMRegressor(objective='regression', n_estimators=2000, 
                           num_leaves=10, learning_rate=0.005,
                           max_bin=163, bagging_fraction=0.85, 
                           n_jobs=-1, bagging_seed=42, 
                           feature_fraction_seed=42, bagging_freq=7, 
                           feature_fraction=0.1294, 
                           min_data_in_leaf=8, random_state=1)

# xgboost
# NOTE(review): both `seed=42` and `random_state=1` are passed — confirm which
# one the installed xgboost version honours. `scale_pos_weight` looks like a
# class-imbalance setting and is presumably irrelevant for regression — verify.
xgboost_model = XGBRegressor(learning_rate=0.0139, n_estimators=2000, 
                             max_depth=4, min_child_weight=0,
                             subsample=0.7968, colsample_bytree=0.4064, 
                             nthread=-1, scale_pos_weight=2,
                             seed=42, random_state=1)

* ### Build model pipelines

In [None]:
from sklearn.base import clone

# Transformer
# Template only: every pipeline below receives its own unfitted clone.
# make_pipeline does not copy its steps, so passing this one instance to all
# five pipelines would make them share fitted state — fitting any pipeline
# directly would silently overwrite the transformer used by the others.
transformer = QuantileTransformer(output_distribution='normal')

# Models
hist = make_pipeline(clone(transformer), hist_model)
xgboost = make_pipeline(clone(transformer), xgboost_model)
lgbm = make_pipeline(clone(transformer), lgbm_model)
lasso = make_pipeline(clone(transformer), lasso_model)
svr = make_pipeline(clone(transformer), rbf_model)

# (display name, pipeline) pairs consumed by the evaluation helper.
models = [('HistGradientBoosting', hist),
          ('XGBoost', xgboost), 
          ('LightGBM', lgbm),
          ('LassoCV', lasso),
          ('SVR', svr)]

In [None]:
def storm_model(x, y, models, cv, scoring):
    """Cross-validate several models and tabulate their mean test scores.

    Parameters
    ----------
    x : array-like
        Feature matrix; converted with ``np.array``.
    y : array-like
        Target values; converted with ``np.array`` and flattened.
    models : iterable of (str, estimator)
        Display name and estimator for each candidate.
    cv : cross-validation splitter or int
        Passed straight to ``cross_validate``.
    scoring : iterable of str
        Scorer names. Each mean test score is multiplied by -1, so the
        names are expected to be ``neg_*`` scorers.

    Returns
    -------
    pandas.DataFrame
        One row per model with ``Model_Name`` plus one column per scorer.
        The three known scorers are renamed to MAE / MSE / RMSE. Rows are
        sorted by RMSE (ascending) when that column is present; otherwise
        the input order is kept. The index is reset to 0..n-1.
    """
    metric_labels = {'neg_mean_absolute_error': 'MAE',
                     'neg_mean_squared_error': 'MSE',
                     'neg_root_mean_squared_error': 'RMSE'}

    # Convert once instead of on every loop iteration.
    features = np.array(x)
    target = np.array(y).ravel()

    # Collect plain dict records and build the frame in one go; growing a
    # DataFrame cell by cell through .loc is slow and yields loose dtypes.
    records = []
    for name, model in models:
        scores = cross_validate(model, features, target, cv=cv,
                                scoring=scoring, n_jobs=-1, verbose=0)
        record = {'Model_Name': name}
        for metric in scoring:
            # Flip the sign of the negated scorer to report a positive error.
            record[metric] = -1 * scores['test_' + metric].mean()
        records.append(record)

    df_evaluation = pd.DataFrame(records, columns=['Model_Name', *scoring])
    df_evaluation = df_evaluation.rename(columns=metric_labels)

    # Only rank by RMSE if it was actually requested; previously a scoring
    # list without it raised KeyError after all the fitting was done.
    if 'RMSE' in df_evaluation.columns:
        df_evaluation = df_evaluation.sort_values(by='RMSE', ascending=True)
    return df_evaluation.reset_index(drop=True)


* ### Cross-Validation

In [None]:
%%time
from sklearn.model_selection import cross_validate, cross_val_predict, KFold

# Metrics reported for every candidate (negated scorers; the helper flips the sign).
scoring = [
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
]

# Shuffled 5-fold split with a fixed seed so every model sees the same folds.
kfolds = KFold(n_splits=5, shuffle=True, random_state=1)

# cross validate
df_score = storm_model(x=X, y=y, models=models, cv=kfolds, scoring=scoring)

In [None]:
# Display the cross-validation score table with a 'plasma' colour gradient behind the values.
df_score.style.background_gradient(cmap='plasma')

# **5. Model selection and evaluation**


In [None]:
from sklearn.model_selection import train_test_split

# Hold-out split with a fixed seed (default train/test proportions).
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=1)

# Print the shape of each part under a banner.
rule = '=' * 16
print(rule + 'TRAIN' + rule)
for label, part in (('X Train set:', X_train), ('Y Train set:', Y_train)):
    print(label, part.shape)
print(rule + 'VAL' + '=' * 18)
for label, part in (('X test set:', X_test), ('Y test set:', Y_test)):
    print(label, part.shape)
print('=' * 37)

In [None]:
%%time
from sklearn.ensemble import VotingRegressor

# VotingRegressor
# Combine the two selected pipelines into a single voting ensemble.
best_models = [
    ('HistGradientBoosting', hist),
    ('XGBoost', xgboost),
]
vr = VotingRegressor(estimators=best_models)

# Cross-validate the ensemble with the same folds and metrics as the single models.
scores = cross_validate(vr, X, y, cv=kfolds, scoring=scoring, n_jobs=-1, verbose=0)
mae_score = -1 * scores['test_neg_mean_absolute_error'].mean()
rmse_score = -1 * scores['test_neg_root_mean_squared_error'].mean()
print(f"MAE score: {mae_score}")
print(f"RMSE score: {rmse_score}")

In [None]:
from sklearn.metrics import mean_squared_error

def fit_model(model, xtrain, ytrain, xval, yval):
    """Fit `model` on the training data and report its RMSE on the validation data.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    xtrain, ytrain : array-like
        Training features and target.
    xval, yval : array-like
        Validation features and target.

    Returns
    -------
    str
        ``'RMSE score: <value>'``.
    """
    model.fit(xtrain, ytrain)
    pred = model.predict(xval)
    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated in recent scikit-learn releases and
    # removed afterwards, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(yval, pred))
    return f'RMSE score: {rmse}'

In [None]:
%%time
# Fit the voting ensemble on the training split and score it on the held-out split.
vr_fit = fit_model(model=vr, xtrain=X_train, ytrain=Y_train,
                   xval=X_test, yval=Y_test)
print('VotingRegressor Evaluation =>', vr_fit)

In [None]:
%%time
# Fit the XGBoost pipeline on the training split and score it on the held-out split.
xg_fit = fit_model(model=xgboost, xtrain=X_train, ytrain=Y_train,
                   xval=X_test, yval=Y_test)
print('XGBoost Evaluation =>', xg_fit)