# Kaggle House Prices — Robust Blend + Stack (Ready-to-Submit)

**What you get**
- Clean 5-fold CV (RMSE on log target)
- Safe outlier removal + numeric tail clipping
- High-signal feature engineering (TotalSF, TotalBath, ages, interactions)
- log1p transform for skewed numerics
- Rare-category collapsing before OHE
- Strong baselines (Ridge/Lasso/ENet/GBR/RF) + optional XGB/CatBoost
- Stacking (passthrough=True) and NNLS learned blend on out-of-fold (OOF)
- Final fit on all train + blended test predictions + submission.csv

**How to use**
1. Put `train.csv` and `test.csv` in the working directory (Kaggle provides them by default).
2. Run all cells in order (Runtime → Run All).
3. Upload the generated `submission.csv`.


In [None]:
# 0) Imports & global config
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import skew

from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.base import clone
from numpy.linalg import lstsq

SEED = 42
KFOLD = KFold(n_splits=5, shuffle=True, random_state=SEED)
DATA_DIR = Path('.')
SUB_PATH = Path('submission.csv')

pd.set_option('display.max_columns', 200)
print('Setup OK')


Setup OK


In [None]:
from google.colab import files
uploaded = files.upload()

Saving sample_submission.csv to sample_submission.csv
Saving test.csv to test.csv
Saving train.csv to train.csv


In [None]:
# 1) Load data
train = pd.read_csv(DATA_DIR / 'train.csv')
test  = pd.read_csv(DATA_DIR / 'test.csv')
print(train.shape, test.shape)
train.head(2)


(1460, 81) (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500


In [None]:
# 2) Remove known outliers and clip numeric tails
train = train.copy()
test  = test.copy()

# Outlier rule from Ames folklore
mask_out = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
removed = int(mask_out.sum())
train = train.loc[~mask_out].reset_index(drop=True)
print(f"Outliers removed: {removed}")

# Clip numeric tails to stabilize models
NUMS = train.select_dtypes(include=np.number).columns.drop('SalePrice')
lo = train[NUMS].quantile(0.005)
hi = train[NUMS].quantile(0.995)
train[NUMS] = train[NUMS].clip(lo, hi, axis=1)
test[NUMS]  = test[NUMS].clip(lo, hi, axis=1)
print('Tail clipping done')


Outliers removed: 2
Tail clipping done


  train[NUMS] = train[NUMS].clip(lo, hi, axis=1)
  test[NUMS]  = test[NUMS].clip(lo, hi, axis=1)


In [None]:
# 3) Feature engineering
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
    df['TotalBath'] = (df['FullBath'] + 0.5*df['HalfBath'] + df.get('BsmtFullBath',0) + 0.5*df.get('BsmtHalfBath',0))
    df['TotalSF'] = df.get('TotalBsmtSF',0) + df.get('1stFlrSF',0) + df.get('2ndFlrSF',0)
    df['OverallQual_GrLivArea'] = df['OverallQual'] * df['GrLivArea']
    df['PorchSF'] = (df.get('OpenPorchSF',0)+df.get('EnclosedPorch',0)+ df.get('3SsnPorch',0)+df.get('ScreenPorch',0))
    return df

train = add_features(train)
test  = add_features(test)
print('Feature engineering done')
train[['SalePrice','TotalSF','TotalBath','HouseAge','RemodAge','OverallQual_GrLivArea','PorchSF']].head(3)


Feature engineering done


Unnamed: 0,SalePrice,TotalSF,TotalBath,HouseAge,RemodAge,OverallQual_GrLivArea,PorchSF
0,208500,2566.0,3.5,5,5,11970.0,61.0
1,181500,2524.0,2.5,31,31,7572.0,0.0
2,223500,2706.0,3.5,7,6,12502.0,42.0


In [None]:
# 4) Log1p transform for skewed numerics (safe: only for non-negative cols)
def log1p_skewed(df: pd.DataFrame, exclude=()) -> pd.DataFrame:
    df = df.copy()
    num = df.select_dtypes(include=np.number).columns.difference(exclude)
    from scipy.stats import skew
    sk = df[num].apply(lambda x: skew(x.dropna()))
    skewed = sk[sk > 0.75].index
    for c in skewed:
        if (df[c] >= 0).all():
            df[c] = np.log1p(df[c])
    return df

train = log1p_skewed(train, exclude=('SalePrice',))
test  = log1p_skewed(test, exclude=())
print('Skew transform done')


Skew transform done


  sk = df[num].apply(lambda x: skew(x.dropna()))


In [None]:
# 5) Collapse rare categories to reduce OHE noise
def collapse_rare_cats(df: pd.DataFrame, min_count=20) -> pd.DataFrame:
    df = df.copy()
    for c in df.select_dtypes('object').columns:
        vc = df[c].value_counts(dropna=False)
        rare = vc[vc < min_count].index
        df[c] = df[c].where(~df[c].isin(rare), 'RARE')
    return df

train = collapse_rare_cats(train, min_count=20)
test  = collapse_rare_cats(test,  min_count=20)
print('Rare categories collapsed')


Rare categories collapsed


In [None]:
# 6) Preprocessor (version-proof: OneHotEncoder(sparse=False))
target_col = 'SalePrice'
y_log = np.log1p(train[target_col])
X = train.drop(columns=[target_col])
X_test = test.copy()

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([('imp', SimpleImputer(strategy='median'))]), num_cols),
        ('cat', Pipeline([
            ('imp', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_cols),
    ],
    remainder='drop',
    verbose_feature_names_out=False
)
print(f'num: {len(num_cols)}, cat: {len(cat_cols)}')

num: 43, cat: 43


In [None]:
# 7) Define models (linear + trees); optional XGB/CatBoost if available
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

ridge = Pipeline([
    ('prep', preprocessor),
    ('mdl', RidgeCV(alphas=np.logspace(-3, 3, 61), cv=5))
])

lasso = Pipeline([
    ('prep', preprocessor),
    ('mdl', LassoCV(alphas=np.logspace(-4, 1, 60), cv=5, max_iter=20000, n_jobs=-1))
])

enet = Pipeline([
    ('prep', preprocessor),
    ('mdl', ElasticNetCV(l1_ratio=[.05,.1,.3,.5,.7,.9,.95,1.0],
                         alphas=np.logspace(-4, 1, 40), cv=5, max_iter=20000, n_jobs=-1))
])

gbr = Pipeline([
    ('prep', preprocessor),
    ('mdl', GradientBoostingRegressor(
        n_estimators=1200, learning_rate=0.03, max_depth=3, subsample=0.9,
        min_samples_leaf=3, random_state=42))
])

rf = Pipeline([
    ('prep', preprocessor),
    ('mdl', RandomForestRegressor(
        n_estimators=1200, max_depth=None, min_samples_split=4, min_samples_leaf=1,
        max_features='sqrt', n_jobs=-1, random_state=42))
])

models = {
    'Ridge': ridge,
    'Lasso': lasso,
    'ElasticNet': enet,
    'GBR': gbr,
    'RF': rf,
}

try:
    from xgboost import XGBRegressor
    xgb = Pipeline([
        ('prep', preprocessor),
        ('mdl', XGBRegressor(
            n_estimators=2500, learning_rate=0.03, max_depth=3,
            subsample=0.8, colsample_bytree=0.6, reg_lambda=1.0,
            objective='reg:squarederror', random_state=42, n_jobs=-1))
    ])
    models['XGB'] = xgb
    print('XGB ready')
except Exception as e:
    print('XGB not available:', e)

try:
    from catboost import CatBoostRegressor
    cat = Pipeline([
        ('prep', preprocessor),
        ('mdl', CatBoostRegressor(depth=6, learning_rate=0.03, n_estimators=3000,
                                  loss_function='RMSE', random_seed=42, verbose=False))
    ])
    models['CAT'] = cat
    print('CatBoost ready')
except Exception as e:
    print('CatBoost not available:', e)

list(models.keys())


XGB ready
CatBoost not available: No module named 'catboost'


['Ridge', 'Lasso', 'ElasticNet', 'GBR', 'RF', 'XGB']

In [None]:
# 8) CV helper (RMSE on log target)
from sklearn.model_selection import cross_val_score

def rmse_log_cv(model, X, y, cv=KFOLD):
    mse = -cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1, error_score='raise')
    return np.sqrt(mse)

for name, mdl in models.items():
    scores = rmse_log_cv(mdl, X, y_log, cv=KFOLD)
    print(f"{name:10s}: {scores.mean():.5f} ± {scores.std():.5f}")


Ridge     : 0.11608 ± 0.00929
Lasso     : 0.11413 ± 0.00917
ElasticNet: 0.11409 ± 0.00913
GBR       : 0.11886 ± 0.00724
RF        : 0.13331 ± 0.00888
XGB       : 0.11675 ± 0.00776


In [None]:
# 9) Stacking model (optional to include in blend)
from sklearn.ensemble import StackingRegressor
base_estimators = []
for key in ['Ridge','Lasso','ElasticNet','GBR','RF']:
    base_estimators.append((key.lower(), models[key]))
for key in ['XGB','CAT']:
    if key in models:
        base_estimators.append((key.lower(), models[key]))

# Preprocess the data before stacking
X_processed = preprocessor.fit_transform(X)

stack = StackingRegressor(
    estimators=base_estimators,
    final_estimator=RidgeCV(alphas=np.logspace(-3,3,61), cv=5),
    passthrough=True,
    n_jobs=-1
)
models['STACK'] = stack
print('Stacking model added')

Stacking model added


In [None]:
# 10) Build OOF matrix for blend weights (excluding STACK to avoid nested OOF cost)
def oof_matrix(models_dict, Xdf, y, cv=KFOLD, include_keys=None):
    from sklearn.base import clone
    if include_keys is None:
        include_keys = list(models_dict.keys())
    preds = {}
    for name in include_keys:
        if name == 'STACK':
            # Skip to keep runtime reasonable
            continue
        pipe = models_dict[name]
        oof = np.zeros(len(Xdf))
        for tr, te in cv.split(Xdf, y):
            m = clone(pipe)
            m.fit(Xdf.iloc[tr], y.iloc[tr])
            oof[te] = m.predict(Xdf.iloc[te])
        preds[name] = oof
        print(f'OOF done: {name}')
    import pandas as pd
    return pd.DataFrame(preds)

oof = oof_matrix(models, X, y_log, cv=KFOLD, include_keys=list(models.keys()))
oof.head()


OOF done: Ridge
OOF done: Lasso


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


OOF done: ElasticNet
OOF done: GBR
OOF done: RF
OOF done: XGB


Unnamed: 0,Ridge,Lasso,ElasticNet,GBR,RF,XGB
0,12.222596,12.222665,12.222766,12.177136,12.22172,12.188487
1,12.108252,12.140907,12.141506,12.041149,12.023269,12.065918
2,12.257722,12.270528,12.26943,12.244874,12.266237,12.264208
3,12.061442,12.076062,12.074288,12.160219,12.050947,12.09074
4,12.56338,12.572509,12.572719,12.609546,12.557616,12.609119


In [None]:
# 11) Learn NNLS (approx via non-negative clipped least squares)
from numpy.linalg import lstsq
w, *_ = lstsq(oof.values, y_log.values, rcond=None)
w = np.clip(w, 0, None)
w = w / (w.sum() + 1e-12)
import pandas as pd
blend_weights = pd.Series(w, index=oof.columns).sort_values(ascending=False)
print('Blend weights (sum=1):')
blend_weights


Blend weights (sum=1):


Unnamed: 0,0
ElasticNet,0.76878
XGB,0.198394
RF,0.032826
Ridge,0.0
Lasso,0.0
GBR,0.0


In [None]:
# 12) Fit models on all training data and predict test
fitted = {}
for name, mdl in models.items():
    print('Fitting', name)
    mdl.fit(X, y_log)
    fitted[name] = mdl
print('All models fitted')

import pandas as pd
test_preds_log = {}
for name, mdl in fitted.items():
    test_preds_log[name] = mdl.predict(X_test)
test_preds_log = pd.DataFrame(test_preds_log)
test_preds_log.head()


Fitting Ridge
Fitting Lasso
Fitting ElasticNet
Fitting GBR
Fitting RF
Fitting XGB
Fitting STACK


ValueError: 
All the 305 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
305 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_ridge.py", line 1239, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'RL'


In [None]:
# 13) Blend test predictions using learned weights and save submission
common_models = [m for m in blend_weights.index if m in test_preds_log.columns]
W = blend_weights.loc[common_models]
P = test_preds_log[common_models]
blend_log = np.dot(P.values, W.values)
submission = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': np.expm1(blend_log)
})
submission.to_csv('submission.csv', index=False)
print('Saved submission.csv')
submission.head()


NameError: name 'test_preds_log' is not defined

In [None]:
files.download('submission.csv')

FileNotFoundError: Cannot find file: submission.csv

### Notes
- All models are trained on `log1p(SalePrice)`. Predictions are turned back with `expm1`.
- If XGBoost/CatBoost aren’t installed, the pipeline still runs with sklearn models.
- To speed up: reduce estimators or folds; to push score, tune XGB/CAT params and keep 5–10 folds.
