In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [1]:
# Training data of the Kaggle "House Prices" competition, read from the
# competition's input directory.
train_csv_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [1]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

# Métricas
* MSE (Mean Squared Error)
* RMSE (Root Mean Squared Error)
* RMSLE (Root Mean Squared Logarithmic Error)
* MAE (Mean Absolute Error)
* MedAE (Median Absolute Error)
* MAPE (Mean Absolute Percentage Error)
* R2 (R-Squared)

In [1]:
# Column roles: the row identifier and the regression target are kept out of
# the feature set.
key_var = 'Id'
target = 'SalePrice'

# Object-dtype columns are treated as categorical; every other column (apart
# from the key and the target) is treated as numeric.
cat_vars = list(df.select_dtypes(include='object').columns)
non_numeric = set(cat_vars) | {target, key_var}
num_vars = [name for name in df.columns if name not in non_numeric]
features = [*cat_vars, *num_vars]

# Independent copies so later steps cannot modify `df` through X or y.
X = df.loc[:, features].copy()
y = df.loc[:, target].copy()

In [1]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for fitting and hold out the rest for evaluation.
# The fixed random_state makes the shuffle, and therefore the split,
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [1]:
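# !pip install feature-engine
# NOTE(review): the next cell imports from `feature_engine.missing_data_imputers`
# and `feature_engine.categorical_encoders`; these module paths appear to belong
# to the pre-1.0 feature-engine layout, so confirm the installed version
# (e.g. pin `feature-engine<1.0`) before running.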

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from feature_engine.missing_data_imputers import ArbitraryNumberImputer, CategoricalVariableImputer
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

# Preprocessing and the estimator live in one pipeline, so the imputers and
# the encoder are fitted on the training rows only and then re-applied,
# unchanged, to the hold-out rows at prediction time.
pipeline_steps = [
    # Numeric gaps are filled with the sentinel value -999.
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    # Categorical gaps are filled with the explicit label 'missing'.
    ('categoric_imputer', CategoricalVariableImputer(fill_value='missing', variables=cat_vars, return_object=True)),
    ('one_hot_encoder', OneHotCategoricalEncoder(variables=cat_vars)),
    ('model', RandomForestRegressor(n_estimators=500, max_depth=5, random_state=42)),
]
model = Pipeline(steps=pipeline_steps)

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

# Observed vs. predicted sale prices on the hold-out rows, with the
# regression line that seaborn fits through the points.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(x=y_test, y=y_pred, ax=ax)

ax.set_title('Regression Plot', fontsize=20)
ax.set_xlabel('Sale Price', fontsize=15)
# The trailing semicolon keeps the Text repr out of the cell output.
ax.set_ylabel('Predicted Price', fontsize=15);

# MSE - Mean Squared Error

In [1]:
from sklearn.metrics import mean_squared_error

# Mean of the squared differences between observed and predicted prices.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'MSE = {mse:.3f}')

# RMSE - Root Mean Squared Error

In [1]:
from sklearn.metrics import mean_squared_error

# Square root of the MSE, which brings the error back to the target's units.
mean_sq_error = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mean_sq_error)
print(f'RMSE = {rmse:.3f}')

# RMSLE - Root Mean Squared Logarithmic Error

In [1]:
from sklearn.metrics import mean_squared_log_error

# Square root of the mean squared logarithmic error.
mean_sq_log_error = mean_squared_log_error(y_true=y_test, y_pred=y_pred)
rmsle = np.sqrt(mean_sq_log_error)
print(f'RMSLE = {rmsle:.3f}')

# MAE - Mean Absolute Error

In [1]:
from sklearn.metrics import mean_absolute_error

# Mean of the absolute differences between observed and predicted prices.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'MAE = {mae:.3f}')

# MedAE - Median Absolute Error

In [1]:
from sklearn.metrics import median_absolute_error

# Median of the absolute differences between observed and predicted prices.
medae = median_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'MedAE = {medae:.3f}')

# MAPE - Mean Absolute Percentage Error

In [1]:
def mape(y_test, y_pred):
    """Return the mean absolute percentage error as a fraction (0.1 means 10%).

    Both inputs are converted to NumPy float arrays before the arithmetic, so
    observations and predictions are paired by position. Without this, two
    pandas objects would be subtracted by index label, and a prediction
    Series with a different index would silently produce NaN. Plain Python
    sequences are accepted as well.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.

    Returns
    -------
    numpy.float64
        Mean of ``|y_test - y_pred| / |y_test|``. The result is not
        multiplied by 100.

    Notes
    -----
    A zero in ``y_test`` divides by zero and yields ``inf`` or ``nan``, and a
    NaN in either input propagates to the result.
    """
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual))

# The value is printed as a fraction: mape() does not multiply by 100.
print(f'MAPE = {mape(y_test, y_pred):.3f}')

# R2 - R-Squared

In [1]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the hold-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R2 = {r2:.3f}')