In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Fill selected missing values and derive aggregate house features.

    The transformer operates on a pandas DataFrame. Every step is guarded by
    a column-presence check, so a frame that lacks some of the source columns
    is passed through with only the applicable steps applied.

    Parameters
    ----------
    reference_year : int, default=2010
        Year from which ``HouseAge`` and ``RemodAge`` are measured. It is a
        fixed value rather than the current date so that the engineered ages
        do not drift with the date on which ``transform`` is called. A
        ``YrSold`` column, if present, is NOT consulted.

    Attributes
    ----------
    lot_frontage_median_ : float or None
        Median of ``LotFrontage`` observed during ``fit``. ``None`` when the
        column was absent or held no non-missing values.
    """

    # Columns whose missing values are replaced with 0.
    _ZERO_FILL_COLS = (
        'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        'GarageArea', 'GarageCars',
        # Filled as well so that one NaN does not turn TotalBathrooms into NaN.
        'BsmtFullBath', 'BsmtHalfBath',
    )

    # Source columns removed from the output once the derived features exist.
    _DROP_AFTER_USE = (
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
        'YearBuilt', 'YearRemodAdd', 'PoolArea',
    )

    def __init__(self, reference_year=2010):
        # scikit-learn convention: store constructor arguments unmodified.
        self.reference_year = reference_year

    def fit(self, X, y=None):
        """Learn the ``LotFrontage`` median used for imputation.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : ignored
            Accepted for pipeline compatibility.

        Returns
        -------
        self
        """
        self.lot_frontage_median_ = None
        if 'LotFrontage' in X.columns:
            learned = X['LotFrontage'].median()
            if pd.notna(learned):
                self.lot_frontage_median_ = float(learned)
        return self

    def transform(self, X):
        """Return an engineered copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw feature frame.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with missing values filled, the derived columns
            (``TotalSF``, ``TotalBathrooms``, ``HouseAge``, ``RemodAge``,
            ``HasPool``, ``HasGarage``, ``HasFireplace``) added where their
            source columns exist, and the columns in ``_DROP_AFTER_USE``
            removed.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()

        # --- 1. Missing values -------------------------------------------
        for col in self._ZERO_FILL_COLS:
            if col in X.columns:
                X[col] = X[col].fillna(0)

        # GarageYrBlt: fall back to the construction year of the house.
        if 'GarageYrBlt' in X.columns and 'YearBuilt' in X.columns:
            X['GarageYrBlt'] = X['GarageYrBlt'].fillna(X['YearBuilt'])

        # LotFrontage: use the median learned in fit so that training and
        # prediction batches receive the same fill value. An unfitted
        # instance falls back to the median of the current batch.
        if 'LotFrontage' in X.columns:
            fill_value = getattr(self, 'lot_frontage_median_', None)
            if fill_value is None:
                fill_value = X['LotFrontage'].median()
            # A NaN fill value (no data at all) would be a no-op, and
            # fillna(None) raises, so only fill with a real number.
            if pd.notna(fill_value):
                X['LotFrontage'] = X['LotFrontage'].fillna(fill_value)

        # --- 2. Derived features -----------------------------------------
        if all(c in X.columns for c in ('TotalBsmtSF', '1stFlrSF', '2ndFlrSF')):
            X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']

        bath_cols = ('FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath')
        if all(c in X.columns for c in bath_cols):
            # Half baths count as 0.5 each.
            X['TotalBathrooms'] = (
                X['FullBath'] + 0.5 * X['HalfBath']
                + X['BsmtFullBath'] + 0.5 * X['BsmtHalfBath']
            )

        # Ages relative to the fixed reference year.
        if 'YearBuilt' in X.columns:
            X['HouseAge'] = self.reference_year - X['YearBuilt']
        if 'YearRemodAdd' in X.columns:
            X['RemodAge'] = self.reference_year - X['YearRemodAdd']

        # Binary presence flags; a missing value compares False and yields 0.
        if 'PoolArea' in X.columns:
            X['HasPool'] = (X['PoolArea'] > 0).astype(int)
        if 'GarageArea' in X.columns:
            X['HasGarage'] = (X['GarageArea'] > 0).astype(int)
        if 'Fireplaces' in X.columns:
            X['HasFireplace'] = (X['Fireplaces'] > 0).astype(int)

        # --- 3. Clean-up -------------------------------------------------
        present = [c for c in self._DROP_AFTER_USE if c in X.columns]
        return X.drop(columns=present)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
import joblib

# --- Data ---------------------------------------------------------------
# The path is relative to the notebook's working directory.
df = pd.read_csv('house.csv')
X = df.drop(columns=['Id', 'SalePrice'])
# The target is log(SalePrice), so a model fit on `y` predicts log prices.
y = np.log(df['SalePrice'])

# --- Columns handed to the model ----------------------------------------
# A deliberately short list. TotalSF and TotalBathrooms are produced by
# FeatureEngineer.transform, so that step has to run before these columns
# are selected.
num_features = [
    'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1',
    'TotalSF', 'TotalBathrooms', 'BedroomAbvGr', 'TotRmsAbvGrd', 'GarageCars',
]
cat_features = [
    'MSZoning', 'Neighborhood', 'HouseStyle', 'ExterQual', 'KitchenQual',
]

# --- Per-type preprocessing ---------------------------------------------
# Numeric columns: median imputation followed by standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then dense one-hot encoding.
# Categories not seen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its transformer; every column that is not
# listed above is dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ],
    remainder='drop',
)

In [5]:
# End-to-end pipeline, executed in this order:
#   feature_eng  -> derive the engineered columns
#   preprocessor -> impute, scale and one-hot encode
#   model        -> linear regression on the prepared matrix
pipeline_steps = [
    ('feature_eng', FeatureEngineer()),
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
model_pipeline = Pipeline(pipeline_steps)

# Train on the full dataset.
print("Training model...")
model_pipeline.fit(X, y)
print("Model trained!")

# Persist the fitted pipeline to disk.
# NOTE(review): FeatureEngineer is defined inside this notebook, so the pickle
# refers to it by module path; the loading process presumably needs the same
# class importable under that path -- confirm on the consuming side.
joblib.dump(model_pipeline, 'house_price_model.pkl')
print("Model saved as 'house_price_model.pkl'")

Training model...
Model trained!
Model saved as 'house_price_model.pkl'
