In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from xgboost import XGBRegressor


In [2]:
df = pd.read_csv("data/train.csv")

X = df.drop(["SalePrice", "Id"], axis=1)
y = np.log1p(df["SalePrice"])


In [3]:
none_cols = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature"
]

for col in none_cols:
    X[col] = X[col].fillna("None")


In [4]:
zero_cols = [
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "GarageCars", "GarageArea", "MasVnrArea"
]

for col in zero_cols:
    X[col] = X[col].fillna(0)


In [None]:
zero_cols = [
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "GarageCars", "GarageArea", "MasVnrArea"
]

for col in zero_cols:
    X[col] = X[col].fillna(0)



In [6]:
X["LotFrontage"] = (
    X.groupby("Neighborhood")["LotFrontage"]
    .transform(lambda x: x.fillna(x.median()))
)

X["GarageYrBlt"] = X["GarageYrBlt"].fillna(X["YearBuilt"])


In [7]:
# Total square footage
X["TotalSF"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]

# Total bathrooms
X["TotalBathrooms"] = (
    X["FullBath"] +
    0.5 * X["HalfBath"] +
    X["BsmtFullBath"] +
    0.5 * X["BsmtHalfBath"]
)

# Age features
X["HouseAge"] = X["YrSold"] - X["YearBuilt"]
X["Remodeled"] = (X["YearRemodAdd"] != X["YearBuilt"]).astype(int)

# Binary indicators
X["HasGarage"] = (X["GarageArea"] > 0).astype(int)
X["HasBasement"] = (X["TotalBsmtSF"] > 0).astype(int)
X["HasFireplace"] = (X["Fireplaces"] > 0).astype(int)
X["HasPool"] = (X["PoolArea"] > 0).astype(int)

X["HasPorch"] = (
    (X["OpenPorchSF"] +
     X["EnclosedPorch"] +
     X["3SsnPorch"] +
     X["ScreenPorch"]) > 0
).astype(int)


In [8]:
skewed_features = [
    "LotArea", "GrLivArea", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF"
]

for col in skewed_features:
    X[col] = np.log1p(X[col])


In [9]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])


In [10]:
xgb = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)


In [11]:
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", xgb)
])


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [14]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

RMSE: 0.1313427012282431
R²: 0.9075568630959735
