In [27]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import TransformedTargetRegressor

In [28]:
import sys
import importlib

# Make the parent directory importable so the project-local `preprocessing`
# package resolves. Guarded so that re-running this cell does not keep
# appending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

import preprocessing.pipeline

# Re-execute the module so edits to it are picked up without a kernel restart.
importlib.reload(preprocessing.pipeline)

# Imported after the reload so the name binds to the reloaded definition.
from preprocessing.pipeline import get_preprocessor

In [29]:
# Read the labelled training table and the unlabelled test table.
train = pd.read_csv("../data/train.csv")
X_test = pd.read_csv("../data/test.csv")

# Split the training table into the target column and everything else.
y_train = train["SalePrice"]
X_train = train.drop(columns="SalePrice")

In [30]:
# === Build preprocessing + modeling pipeline ===
# Feature preprocessing comes from the project-local helper.
preprocessor = get_preprocessor()

# The forest is fitted on log1p(target); TransformedTargetRegressor maps its
# predictions back through expm1, so predict() returns values on the original
# SalePrice scale.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
log_target_model = TransformedTargetRegressor(
    regressor=forest,
    func=np.log1p,
    inverse_func=np.expm1,
)

model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", log_target_model),
])

# === Fit the model ===
model_pipeline.fit(X_train, y_train)

# === Predict on training data ===
y_pred = model_pipeline.predict(X_train)

# === Evaluate ===
# NOTE: these scores are computed on the same rows the pipeline was fitted on,
# so they are in-sample figures, not an estimate of performance on unseen data.
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, y_pred)

print(f"Train RMSE: {rmse:.2f}")
print(f"Train R² Score: {r2:.4f}")

Train RMSE: 10748.37
Train R² Score: 0.9817


In [32]:
# === Predict on test data ===
# Runs the test features through the fitted pipeline (preprocessing + model).
y_test_pred = model_pipeline.predict(X_test)

In [33]:
# === Wrap in DataFrame (e.g., for submission) ===
# Start from the test set's 'Id' column (assumed to be present in the test
# data) and attach the predictions as a second column, 'SalePrice'.
submission = X_test[["Id"]].assign(SalePrice=y_test_pred)

# Preview the first rows.
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,124202.372111
1,1462,154596.987419
2,1463,179139.570973
3,1464,183541.290397
4,1465,195917.291446


In [None]:
# Persist the predictions; index=False keeps the row index out of the file so
# it contains only the 'Id' and 'SalePrice' columns.
submission.to_csv("../data/submission.csv", index=False)