In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import json
import joblib

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
# Seed used for the reproducible train/validation split.
RANDOM_STATE = 42

# Project folders, resolved relative to the notebook's working directory.
DATA_DIR = Path("../data")
MODELS_DIR = Path("../models")
MODELS_DIR.mkdir(exist_ok=True)  # create the artifact folder up front; no-op if present

def rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 4) -> float:
    """Root mean squared logarithmic error on the original target scale.

    Args:
        y_true: Ground-truth target values (any array-like; flattened to 1-D).
        y_pred: Predicted target values (any array-like; flattened to 1-D).
        precision: Number of decimal places kept in the returned score.

    Returns:
        The square root of ``mean_squared_log_error(y_true, y_pred)``,
        rounded to ``precision`` decimals, as a built-in ``float``.
    """
    actual, predicted = (
        np.asarray(values).reshape(-1) for values in (y_true, y_pred)
    )
    msle = mean_squared_log_error(actual, predicted)
    return round(float(np.sqrt(msle)), precision)


In [None]:
# Column groups: the model input is the categorical block followed by the numeric block.
target_col = "SalePrice"
cat_cols = ["LotShape", "HouseStyle", "LandContour", "Utilities", "RoofStyle"]
num_cols = ["LotArea", "MSSubClass", "OverallQual", "OverallCond"]
features = cat_cols + num_cols

# Load the training data and separate inputs from the target.
train_df = pd.read_csv(DATA_DIR / "train.csv")
X_full = train_df.loc[:, features].copy()
y_full = train_df.loc[:, target_col].copy()

# The model is trained on the natural log of the target.
y_full_log = np.log(y_full)

# Hold out 20% of the rows for validation.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X_full,
    y_full_log,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print(f"Train split: X_train={X_train.shape}, X_val={X_val.shape}")
print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)


Train split: X_train=(1168, 9), X_val=(292, 9)
Numeric cols: ['LotArea', 'MSSubClass', 'OverallQual', 'OverallCond']
Categorical cols: ['LotShape', 'HouseStyle', 'LandContour', 'Utilities', 'RoofStyle']


In [None]:
def _assemble_frame(numeric_block, categorical_block, columns, index):
    """Stack the numeric and one-hot arrays column-wise into a labelled DataFrame."""
    return pd.DataFrame(
        np.hstack([numeric_block, categorical_block]),
        columns=columns,
        index=index,
    )


# --- Numeric branch: mean-impute, then standardise (both fitted on the training split only).
num_imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()
Xn_train = scaler.fit_transform(num_imputer.fit_transform(X_train[num_cols]))

# --- Categorical branch: most-frequent impute, then one-hot encode.
cat_imputer = SimpleImputer(strategy="most_frequent")
_ohe_options = {"handle_unknown": "ignore", "drop": "first"}
try:
    ohe = OneHotEncoder(sparse_output=False, **_ohe_options)
except TypeError:
    # Fall back to the `sparse` keyword if this scikit-learn version rejects `sparse_output`.
    ohe = OneHotEncoder(sparse=False, **_ohe_options)

Xc_train_in = cat_imputer.fit_transform(X_train[cat_cols])
Xc_train = ohe.fit_transform(Xc_train_in)

# --- Column labels: numeric names first, then the generated one-hot names.
num_feat_names = num_cols
cat_feat_names = ohe.get_feature_names_out(cat_cols).tolist()
train_columns = num_feat_names + cat_feat_names

X_train_proc = _assemble_frame(Xn_train, Xc_train, train_columns, X_train.index)

# --- Validation split: transform only, reusing the transformers fitted above.
Xn_val = scaler.transform(num_imputer.transform(X_val[num_cols]))
Xc_val = ohe.transform(cat_imputer.transform(X_val[cat_cols]))
X_val_proc = _assemble_frame(Xn_val, Xc_val, train_columns, X_val.index)

# Snapshot of the processed training frame, kept for the baseline regression check.
actual_processed_df = X_train_proc.copy()

X_train_proc.head()


Unnamed: 0,LotArea,MSSubClass,OverallQual,OverallCond,LotShape_IR2,LotShape_IR3,LotShape_Reg,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,...,HouseStyle_SLvl,LandContour_HLS,LandContour_Low,LandContour_Lvl,Utilities_NoSeWa,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed
254,-0.212896,-0.866764,-0.820445,0.372217,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1066,-0.265245,0.07411,-0.088934,1.268609,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
638,-0.177841,-0.631546,-0.820445,1.268609,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
799,-0.324474,-0.161109,-0.820445,1.268609,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
380,-0.529035,-0.161109,-0.820445,0.372217,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Fit a default-parameter Ridge regressor on the processed features and the log target.
model = Ridge().fit(X_train_proc, y_train_log)
print(" Model trained.")


✅ Model trained.


In [None]:
# Predict in log space, then undo the log transform so the score is on the price scale.
y_val_log_pred = model.predict(X_val_proc)
y_val_actual, y_val_pred = np.exp(y_val_log), np.exp(y_val_log_pred)

val_rmsle = rmsle(y_true=y_val_actual, y_pred=y_val_pred)
print({"rmsle": val_rmsle})


{'rmsle': 0.2196}


In [None]:
# Persist the fitted model and every preprocessing step, one joblib file each.
artifacts = [
    ("model.joblib", model),
    ("num_imputer.joblib", num_imputer),
    ("cat_imputer.joblib", cat_imputer),
    ("scaler.joblib", scaler),
    ("ohe.joblib", ohe),
]
for filename, fitted in artifacts:
    joblib.dump(fitted, MODELS_DIR / filename)

# Column bookkeeping needed to rebuild the exact feature layout at inference time.
metadata = {
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    "feature_order": train_columns,
}
metadata_path = MODELS_DIR / "metadata.json"
metadata_path.write_text(json.dumps(metadata))

print("Artifacts saved in:", MODELS_DIR.resolve())


✅ Artifacts saved in: C:\Users\srika\Documents\dspro\dsp-srikanth-banoth\models


In [None]:
# Optional regression check: compare the processed training frame with a stored baseline.
baseline_path = MODELS_DIR / "processed_df.parquet"
if not baseline_path.exists():
    print(" No baseline parquet found at", baseline_path, "— skipping regression test.")
else:
    expected_processed_df = pd.read_parquet(baseline_path)
    # Row labels are dropped on both sides so only positions, values and dtypes are compared.
    actual_reset, expected_reset = (
        frame.reset_index(drop=True)
        for frame in (actual_processed_df, expected_processed_df)
    )
    pd.testing.assert_frame_equal(actual_reset, expected_reset, check_dtype=True)
    print(" Behavior unchanged — processed dataframe is identical to PW1 baseline.")


ℹ️ No baseline parquet found at ..\models\processed_df.parquet — skipping regression test.


In [None]:
test_df = pd.read_csv(DATA_DIR / "test.csv")

# Reload every artifact from disk so inference does not rely on in-memory training objects.
# NOTE: joblib files are pickle-based; only load artifacts produced by a trusted source.
model_l, num_imputer_l, cat_imputer_l, scaler_l, ohe_l = (
    joblib.load(MODELS_DIR / filename)
    for filename in (
        "model.joblib",
        "num_imputer.joblib",
        "cat_imputer.joblib",
        "scaler.joblib",
        "ohe.joblib",
    )
)

meta = json.loads((MODELS_DIR / "metadata.json").read_text())
num_cols_l, cat_cols_l, feature_order_l = (
    meta["num_cols"],
    meta["cat_cols"],
    meta["feature_order"],
)

# Apply the same impute -> scale / impute -> one-hot steps that were fitted on the training split.
Xn_test = scaler_l.transform(num_imputer_l.transform(test_df[num_cols_l]))
Xc_test = ohe_l.transform(cat_imputer_l.transform(test_df[cat_cols_l]))

# Label the columns, then align them to the saved training order (absent columns become 0.0).
encoded_names = num_cols_l + ohe_l.get_feature_names_out(cat_cols_l).tolist()
X_test_proc = pd.DataFrame(
    np.hstack([Xn_test, Xc_test]),
    columns=encoded_names,
    index=test_df.index,
).reindex(columns=feature_order_l, fill_value=0.0)

# Predict in log space and undo the log transform to obtain prices.
test_log_pred = model_l.predict(X_test_proc)
test_pred = np.exp(test_log_pred)

# Use the file's Id column when present; otherwise fall back to a 0-based running index.
fallback_ids = pd.Series(range(len(test_pred)))
pd.DataFrame({"Id": test_df.get("Id", fallback_ids), "SalePrice": test_pred}).head()


Unnamed: 0,Id,SalePrice
0,1461,131793.734661
1,1462,182748.446853
2,1463,145972.294663
3,1464,178774.423335
4,1465,243513.368139
