In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer



In [25]:
df = pd.read_csv('data/train.csv')

X = df.drop(["SalePrice", "Id"], axis=1)
y = np.log1p(df["SalePrice"])  # keep log target

In [26]:
#Columns where NaN means “None” (categorical)

none_cols = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature"
]

for col in none_cols:
    X[col] = X[col].fillna("None")


In [27]:
#Columns where NaN means 0 (numerical)
zero_cols = [
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "GarageCars", "GarageArea", "MasVnrArea"
]

for col in zero_cols:
    X[col] = X[col].fillna(0)


In [28]:
#True missing numeric values
X["LotFrontage"] = X.groupby("Neighborhood")["LotFrontage"] \
                    .transform(lambda x: x.fillna(x.median()))

X["GarageYrBlt"] = X["GarageYrBlt"].fillna(X["YearBuilt"])


In [29]:
# Total square footage
X["TotalSF"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]

# Total bathrooms
X["TotalBathrooms"] = (
    X["FullBath"] +
    0.5 * X["HalfBath"] +
    X["BsmtFullBath"] +
    0.5 * X["BsmtHalfBath"]
)

# House age
X["HouseAge"] = X["YrSold"] - X["YearBuilt"]

# Remodeled or not
X["Remodeled"] = (X["YearRemodAdd"] != X["YearBuilt"]).astype(int)

# Binary presence features
X["HasGarage"] = (X["GarageArea"] > 0).astype(int)
X["HasBasement"] = (X["TotalBsmtSF"] > 0).astype(int)
X["HasFireplace"] = (X["Fireplaces"] > 0).astype(int)
X["HasPool"] = (X["PoolArea"] > 0).astype(int)
X["HasPorch"] = (
    (X["OpenPorchSF"] + X["EnclosedPorch"] +
     X["3SsnPorch"] + X["ScreenPorch"]) > 0
).astype(int)



In [30]:
#handle sweked num fesatures
skewed_features = [
    "LotArea", "GrLivArea", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF"
]

for col in skewed_features:
    X[col] = np.log1p(X[col])


In [31]:
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.impute import SimpleImputer

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])


In [32]:
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", GradientBoostingRegressor(
        n_estimators=1200,
        learning_rate=0.03,
        max_depth=3,
        subsample=0.8,
        random_state=42
    ))
])


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [34]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R^2 Score: {r2}")


RMSE: 0.13164525932898857
R^2 Score: 0.9071304726756304
