In [1]:
import importlib
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
current_dir = os.getcwd()
if current_dir.endswith('notebooks'):
    project_root = os.path.abspath(os.path.join(current_dir, '..'))
else:
    project_root = current_dir

if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added {project_root} to sys.path for module import.")

# --- Import functions from your src directory ---
from src.data_loader import load_and_initial_clean
from src.preprocessor import build_preprocessor

DATA_PATH = '../data/AmesHousing.csv' 

# Load and initially clean the data using your data_loader
df = load_and_initial_clean(DATA_PATH)
if df is None:
    print("Failed to load data. Exiting notebook.")
    exit()

# Separate target variable (saleprice) and features (X)
y = df['saleprice']
X = df.drop('saleprice', axis=1)

# Apply log transformation to the target variable
y_log = np.log1p(y)
print("Target variable 'saleprice' log-transformed to 'y_log'.")

preprocessor_pipeline_builder, X_processed_for_split = build_preprocessor(X)

X_train, X_test, y_train, y_test = train_test_split(X_processed_for_split, y_log, test_size=0.2, random_state=42)

print(f"\nData successfully loaded, preprocessed, and split into training and testing sets.")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


Added c:\Users\asus\OneDrive\Desktop\projects\house_price_prediction to sys.path for module import.
DataFrame columns standardized using a robust method in data_loader.py.
Dropped 'order' column.
Dropped 'pid' column.
Data loaded and initially cleaned. Shape: (2930, 80)
Target variable 'saleprice' log-transformed to 'y_log'.
Starting feature engineering and preprocessor building...

DEBUG (preprocessor.py): Columns in X_processed after initial copy: ['ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area', 'street', 'alley', 'lot_shape', 'land_contour', 'utilities', 'lot_config', 'land_slope', 'neighborhood', 'condition_1', 'condition_2', 'bldg_type', 'house_style', 'overall_qual', 'overall_cond', 'year_built', 'year_remod_add', 'roof_style', 'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type', 'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1', 'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'to

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

In [None]:
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

# Preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # Use OneHotEncoder for nominal
    
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [5]:
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, eval_metric='rmse')
}


In [None]:
param_grids = {
    'LinearRegression': {}, 
    'Ridge': {'regressor__alpha': [0.1, 1.0, 10.0]}, 
    'Lasso': {'regressor__alpha': [0.0001, 0.001, 0.01]},
    'RandomForest': {
        'regressor__n_estimators': [100, 200],
        'regressor__max_depth': [10, 20, None] 
    },
    'XGBoost': {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.05, 0.1],
        'regressor__max_depth': [3, 5] 
    }
}

results = {}
best_trained_models = {}

# Cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)



for name, model in models.items():
    print(f"Training {name}...")
    full_pipeline = Pipeline(steps=[('preprocessor', preprocessor_pipeline_builder),
                                    ('regressor', model)])

    grid_search = GridSearchCV(
        full_pipeline,
        param_grids[name],
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train) 

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = np.sqrt(-grid_search.best_score_) # Convert to RMSE

    # Make predictions 
    y_pred = best_model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results[name] = {
        'best_params': best_params,
        'cv_rmse': best_score,
        'test_r2': r2,
        'test_mae': mae,
        'test_rmse': rmse,
    }
    best_trained_models[name] = best_model 
    print(f"  Best parameters: {best_params}")
    print(f"  Cross-Validation RMSE: {best_score:.4f}")
    print(f"  Test R²: {r2:.4f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}\n")

print("\n--- All Model Training Complete ---")
for name, res in results.items():
    print(f"\nModel: {name}")
    for key, value in res.items():
        print(f"  {key}: {value}")
    print("\n")

Training LinearRegression...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
  Best parameters: {}
  Cross-Validation RMSE: 0.1425
  Test R²: 0.9283, MAE: 0.08, RMSE: 0.12

Training Ridge...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
  Best parameters: {'regressor__alpha': 10.0}
  Cross-Validation RMSE: 0.1410
  Test R²: 0.9307, MAE: 0.08, RMSE: 0.11

Training Lasso...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
  Best parameters: {'regressor__alpha': 0.0001}
  Cross-Validation RMSE: 0.1385
  Test R²: 0.9295, MAE: 0.08, RMSE: 0.11

Training RandomForest...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
  Best parameters: {'regressor__max_depth': 20, 'regressor__n_estimators': 200}
  Cross-Validation RMSE: 0.1425
  Test R²: 0.9238, MAE: 0.08, RMSE: 0.12

Training XGBoost...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
  Best parameters: {'regressor__learning_rate': 0.05, 'regressor__max_depth': 5, 'regressor__n_esti

In [None]:


import joblib
import os

models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)
print(f"\nModels directory '{models_dir}' ensured.")

best_overall_model_name = 'XGBoost' 
final_best_model = best_trained_models[best_overall_model_name]

# Save the best model
model_save_path = os.path.join(models_dir, f'best_house_price_model_{best_overall_model_name.lower()}.pkl')
joblib.dump(final_best_model, model_save_path)
print(f"Best model ({best_overall_model_name}) saved to: {model_save_path}")




Models directory '../models' ensured.
Best model (XGBoost) saved to: ../models\best_house_price_model_xgboost.pkl
