In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score


In [34]:
train_df = pd.read_csv('data/train.csv')

y_log = np.log(train_df["SalePrice"])
X = train_df.drop(['SalePrice', 'Id'], axis=1)


In [35]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)


In [36]:
num_features = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_features = X_train.select_dtypes(include=['object']).columns


In [37]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
#numerical pipeline

In [38]:
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
#categorical pipeline

In [39]:
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [40]:
ridge_model = Pipeline([
    ('preprocessor', preprocessor), 
    ('regressor', Ridge(alpha=1.0))
])

In [41]:
lasso_model = Pipeline([
    ('preprocess', preprocessor),
    ('model', Lasso(alpha=0.001))
])


In [42]:
def evaluate(model, X_train, X_val, y_train, y_val):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    r2 = r2_score(y_val, preds)
    return rmse, r2


In [43]:
lin_rmse, lin_r2 = evaluate(model, X_train, X_val, y_train, y_val)
ridge_rmse, ridge_r2 = evaluate(ridge_model, X_train, X_val, y_train, y_val)
lasso_rmse, lasso_r2 = evaluate(lasso_model, X_train, X_val, y_train, y_val)

print("Linear   → RMSE:", lin_rmse, " R²:", lin_r2)
print("Ridge    → RMSE:", ridge_rmse, " R²:", ridge_r2)
print("Lasso    → RMSE:", lasso_rmse, " R²:", lasso_r2)


Linear   → RMSE: 0.1280872069098814  R²: 0.9120839008448423
Ridge    → RMSE: 0.1317398302017774  R²: 0.9069982548716622
Lasso    → RMSE: 0.13748356075385892  R²: 0.8987118958206918


In [44]:
y_pred_log = model.predict(X)


In [46]:
y_pred = np.expm1(y_pred_log)
print(y_pred)


[205554.55463377 207644.92987736 214684.57525241 ... 264106.75459945
 140429.0627862  144295.30978065]
