In [24]:
import numpy as np
import pandas as pd
# Load the raw training table from disk.
train = pd.read_csv("data/train.csv")

# Target: log1p of the sale price (predictions are therefore on the log scale).
y = np.log1p(train["SalePrice"])

# Features: everything except the target and the row identifier.
X = train.drop(columns=["SalePrice", "Id"])


In [25]:
# Categorical columns where NaN means "None" (the feature is absent),
# not "value unknown". They get the explicit category "None".
#
# "MasVnrType" is included to stay consistent with "MasVnrArea", whose NaN
# is filled with 0 (no veneer) in the numeric step. Left unfilled, those rows
# would later be imputed with the most frequent veneer type by the
# categorical pipeline.
# NOTE(review): recent pandas versions appear to parse the literal string
# "None" as missing in read_csv, which would make far more rows of this
# column NaN - confirm against the installed pandas version.
none_cols = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature", "MasVnrType"
]

# One vectorised fill over all listed columns (raises KeyError if any is absent).
X[none_cols] = X[none_cols].fillna("None")


In [26]:
# Numeric columns where a missing value is treated as 0.
zero_cols = [
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "GarageCars",
    "GarageArea",
    "MasVnrArea",
]

# Fill every listed column in one step instead of looping column by column.
X[zero_cols] = X[zero_cols].fillna(0)


In [27]:
# Genuinely missing numeric values (NaN here does not encode "absent").

# LotFrontage: fill gaps with the median LotFrontage of the same Neighborhood.
lot_frontage_by_neighborhood = X.groupby("Neighborhood")["LotFrontage"]
X["LotFrontage"] = lot_frontage_by_neighborhood.transform(
    lambda frontage: frontage.fillna(frontage.median())
)

# GarageYrBlt: where missing, fall back to the year the house was built.
garage_year = X["GarageYrBlt"]
X["GarageYrBlt"] = garage_year.where(garage_year.notna(), X["YearBuilt"])


In [28]:
# Combined floor area: basement + first floor + second floor.
X["TotalSF"] = X["TotalBsmtSF"].add(X["1stFlrSF"]).add(X["2ndFlrSF"])

# Bathroom count, with half baths weighted 0.5 (above grade and basement).
X["TotalBathrooms"] = (
    X["FullBath"]
    .add(X["HalfBath"].mul(0.5))
    .add(X["BsmtFullBath"])
    .add(X["BsmtHalfBath"].mul(0.5))
)

# Age of the house at the time of sale, in years.
X["HouseAge"] = X["YrSold"].sub(X["YearBuilt"])

# 1 if the remodel year differs from the build year, else 0.
X["Remodeled"] = X["YearRemodAdd"].ne(X["YearBuilt"]).astype(int)

# 0/1 presence flags: 1 when the source measurement is strictly positive.
presence_flags = {
    "HasGarage": "GarageArea",
    "HasBasement": "TotalBsmtSF",
    "HasFireplace": "Fireplaces",
}
for flag_name, source_col in presence_flags.items():
    X[flag_name] = X[source_col].gt(0).astype(int)


In [29]:
# Handle skewed numeric features: replace each with log1p of itself.
# The columns are overwritten with their own transform, so running this
# cell a second time would apply the log twice.
skewed_features = [
    "LotArea",
    "GrLivArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
]

log_scaled = {name: np.log1p(X[name]) for name in skewed_features}
X = X.assign(**log_scaled)


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Split the feature columns by dtype family rather than by exact dtype.
# ColumnTransformer drops every column that is not listed in one of its
# transformers, so selecting only "int64"/"float64"/"object" would silently
# discard any column stored with another dtype - for example the 0/1 flags
# created with .astype(int), which are int32 on platforms whose default
# integer is 32-bit.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

# Numeric branch: median-impute remaining gaps, then standardise.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: fill gaps with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at predict time.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])


In [31]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Candidate linear regressors, keyed by the label used when reporting scores.
models = dict(
    Linear=LinearRegression(),
    Ridge=Ridge(alpha=10),
    Lasso=Lasso(alpha=0.001, max_iter=10000),
)


In [32]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate(model, X_train, X_val, y_train, y_val):
    """Fit preprocessing + ``model`` on the training split and score it on validation.

    The notebook-level ``preprocessor`` and the ``model`` passed in are both
    cloned before fitting. Without cloning, every pipeline built here would
    share (and refit) the same ``preprocessor`` object, so fitting one
    pipeline would overwrite the fitted state used by any other pipeline
    holding that object, and the caller's ``model`` would be mutated.

    Parameters
    ----------
    model : estimator
        Unfitted regressor used as the final pipeline step; left untouched.
    X_train, y_train
        Features and target used for fitting.
    X_val, y_val
        Features and target used for scoring.

    Returns
    -------
    tuple
        ``(rmse, r2)`` on the validation split, in the units of ``y_val``.
    """
    # Local import keeps this cell self-contained.
    from sklearn.base import clone

    pipe = Pipeline([
        ("prep", clone(preprocessor)),
        ("model", clone(model))
    ])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, preds))
    r2 = r2_score(y_val, preds)

    return rmse, r2


In [33]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
# Score every candidate on the same train/validation split and report it.
for name in models:
    rmse, r2 = evaluate(models[name], X_train, X_val, y_train, y_val)
    print(f"{name} → RMSE: {rmse:.4f} | R²: {r2:.4f}")


Linear → RMSE: 0.1533 | R²: 0.8741
Ridge → RMSE: 0.1309 | R²: 0.9081
Lasso → RMSE: 0.1343 | R²: 0.9034


In [35]:
# Final model: preprocessing followed by Ridge(alpha=10), refit on all rows.
final_steps = [
    ("prep", preprocessor),
    ("model", Ridge(alpha=10)),
]
best_model = Pipeline(steps=final_steps)

best_model.fit(X, y)


In [36]:
import joblib

# Persist the fitted pipeline (preprocessing + model) to disk.
joblib.dump(value=best_model, filename="ridge_house_prices.pkl")


['ridge_house_prices.pkl']