# Notebook

In [8]:
import sys, os
sys.path.append(os.path.abspath(".."))  # this makes src/ visible
import numpy as np, pandas as pd, joblib
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from src.pipeline import add_features, build_preprocessor

# Load the training data and run it through the project's feature step
# (add_features / build_preprocessor are imported from src.pipeline).
df = pd.read_csv("../data/train.csv")
df = add_features(df)
preprocessor, feature_cols, y_col = build_preprocessor(df)

X = df[feature_cols]
# Use the target column as-is when build_preprocessor reports 'SalePriceLog';
# otherwise fall back to log1p of the raw 'SalePrice' column.
y = df[y_col] if y_col == 'SalePriceLog' else np.log1p(df['SalePrice'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors to compare on the same split.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
}

# Pipeline does not copy its steps, so every pipeline gets its own clone of
# the preprocessor. Sharing one instance would mean that fitting a later
# pipeline silently refits the transformer held by the earlier ones.
pipes = {
    name: Pipeline([("prep", clone(preprocessor)), ("model", model)])
    for name, model in models.items()
}

for name, pipe in pipes.items():
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_valid)
    # Take the square root of the MSE by hand instead of passing
    # `squared=False`, which is not accepted by every scikit-learn release.
    rmse = mean_squared_error(y_valid, pred) ** 0.5
    r2 = r2_score(y_valid, pred)
    print(name, "RMSE:", round(rmse, 4), "R2:", round(r2, 4))

LinearRegression RMSE: 0.1281 R2: 0.9121
DecisionTree RMSE: 0.2021 R2: 0.7811
