In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import RidgeCV # New model for the ensemble
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import skew

# --- 1. Data Loading ---
def load_data():
    """Loads the training and testing datasets."""
    try:
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        print("Data loaded successfully.")
        return train_df, test_df
    except FileNotFoundError:
        print("Error: train.csv or test.csv not found.")
        return None, None

# --- 2. Preprocessing and Feature Engineering ---
def preprocess(train_df, test_df):
    """Handles missing values, feature engineering, and normalization."""
    print("Starting improved preprocessing...")
    test_ids = test_df['Id']
    train_df = train_df.drop('Id', axis=1)
    test_df = test_df.drop('Id', axis=1)

    train_df = train_df.drop(train_df[(train_df['GrLivArea']>4000) & (train_df['SalePrice']<300000)].index)
    print(f"Removed outliers. New train shape: {train_df.shape}")

    train_df['SalePrice'] = np.log1p(train_df['SalePrice'])
    y = train_df['SalePrice']

    all_data = pd.concat((train_df.drop('SalePrice', axis=1), test_df))
    print(f"Combined data shape: {all_data.shape}")

    # Fill missing values...
    for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
                'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'):
        all_data[col] = all_data[col].fillna('None')
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
                'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
        all_data[col] = all_data[col].fillna(0)
    for col in ('MSZoning', 'Utilities', 'Functional', 'Exterior1st', 'Exterior2nd',
                'KitchenQual', 'SaleType', 'Electrical'):
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median()))

    # Feature Engineering...
    print("Creating new features...")
    all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
    all_data['Total_Bathrooms'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] +
                                   all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])
    all_data['Total_Porch_SF'] = (all_data['OpenPorchSF'] + all_data['3SsnPorch'] +
                                  all_data['EnclosedPorch'] + all_data['ScreenPorch'] +
                                  all_data['WoodDeckSF'])
    all_data['YearBuilt_Age'] = all_data['YrSold'] - all_data['YearBuilt']
    all_data['YearRemod_Age'] = all_data['YrSold'] - all_data['YearRemodAdd']
    all_data['OverallQual_sq'] = all_data['OverallQual']**2
    all_data['TotalSF_sq'] = all_data['TotalSF']**2
    all_data['GrLivArea_sq'] = all_data['GrLivArea']**2
    all_data['OverallQual_x_TotalSF'] = all_data['OverallQual'] * all_data['TotalSF']
    all_data['GrLivArea_x_OverallQual'] = all_data['GrLivArea'] * all_data['OverallQual']

    # Skewness Transform...
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    
    # One-Hot Encoding...
    all_data = pd.get_dummies(all_data)

    X = all_data[:len(y)]
    X_test_competition = all_data[len(y):]

    # Normalize...
    print("Normalizing data using StandardScaler...")
    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)
    X_test_competition_scaled = scaler.transform(X_test_competition)
    X = pd.DataFrame(X_scaled, index=X.index, columns=X.columns)
    X_test_competition = pd.DataFrame(X_test_competition_scaled, index=X_test_competition.index, columns=X_test_competition.columns)
    
    print(f"Preprocessing complete.")
    return X, y, X_test_competition, test_ids

# --- 3. Main Stacking and Submission Function ---
def stacking_and_predict(X, y, X_test_competition, test_ids):
    """Performs stacking with cross-validation and generates a submission."""
    print("\n--- Starting Stacking Ensemble ---")
    
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # --- Step 1: Generate Out-of-Fold (OOF) predictions for base models ---
    
    # Placeholders for OOF and test predictions
    oof_preds_lgb = np.zeros(X.shape[0])
    oof_preds_xgb = np.zeros(X.shape[0])
    oof_preds_ridge = np.zeros(X.shape[0])
    
    test_preds_lgb = np.zeros(X_test_competition.shape[0])
    test_preds_xgb = np.zeros(X_test_competition.shape[0])
    test_preds_ridge = np.zeros(X_test_competition.shape[0])

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        print(f"--- Fold {fold+1}/10 ---")
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # --- a) LightGBM ---
        lgb_model = lgb.LGBMRegressor(
            objective='regression', metric='rmse', n_estimators=3000,
            learning_rate=0.01, lambda_l1=0.01, lambda_l2=0.1,
            num_leaves=40, feature_fraction=0.8, bagging_fraction=0.8,
            bagging_freq=1, verbose=-1, n_jobs=-1, seed=42
        )
        lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                      eval_metric='rmse', callbacks=[lgb.early_stopping(100, verbose=False)])
        oof_preds_lgb[val_index] = lgb_model.predict(X_val)
        test_preds_lgb += lgb_model.predict(X_test_competition) / kf.n_splits

        # --- b) XGBoost ---
        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror', eval_metric='rmse', n_estimators=3000,
            learning_rate=0.01, max_depth=4, subsample=0.8,
            colsample_bytree=0.7, reg_alpha=0.01, reg_lambda=0.1,
            random_state=42, n_jobs=-1
        )
        try:
            xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
        except TypeError:
            xgb_model.fit(X_train, y_train)
        oof_preds_xgb[val_index] = xgb_model.predict(X_val)
        test_preds_xgb += xgb_model.predict(X_test_competition) / kf.n_splits
        
        # --- c) Ridge Regression ---
        ridge_model = RidgeCV(alphas=[1, 5, 10, 20, 50, 100])
        ridge_model.fit(X_train, y_train)
        oof_preds_ridge[val_index] = ridge_model.predict(X_val)
        test_preds_ridge += ridge_model.predict(X_test_competition) / kf.n_splits

    # --- Step 2: Create meta-features and train the meta-model ---
    
    # Create the training data for the meta-model
    X_meta_train = pd.DataFrame({
        'lgb': oof_preds_lgb,
        'xgb': oof_preds_xgb,
        'ridge': oof_preds_ridge
    })
    
    # Create the test data for the meta-model
    X_meta_test = pd.DataFrame({
        'lgb': test_preds_lgb,
        'xgb': test_preds_xgb,
        'ridge': test_preds_ridge
    })

    print("\n--- Training Meta-Model ---")
    meta_model = RidgeCV()
    meta_model.fit(X_meta_train, y)
    
    # Evaluate the stacked model's performance on the OOF predictions
    oof_stacked_preds = meta_model.predict(X_meta_train)
    stacked_rmse = np.sqrt(mean_squared_error(y, oof_stacked_preds))
    print(f"Overall Stacked CV RMSE: {stacked_rmse:.5f}")

    # --- Step 3: Make final predictions ---
    
    final_predictions_log = meta_model.predict(X_meta_test)
    final_predictions = np.expm1(final_predictions_log)

    # Create submission file
    submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_predictions})
    submission.to_csv('submission.csv', index=False)
    
    print("\n--- Submission Complete ---")
    print("Submission file 'submission.csv' created successfully.")
    print(submission.head())


# --- Main Execution ---
if __name__ == '__main__':
    train_df, test_df = load_data()
    if train_df is not None and test_df is not None:
        X, y, X_test_competition, test_ids = preprocess(train_df, test_df)
        
        # This single function now handles all training, ensembling, and prediction
        stacking_and_predict(X, y, X_test_competition, test_ids)

Data loaded successfully.
Starting improved preprocessing...
Removed outliers. New train shape: (1458, 80)
Combined data shape: (2917, 79)
Creating new features...
Normalizing data using StandardScaler...
Preprocessing complete.

--- Starting Stacking Ensemble ---
--- Fold 1/10 ---
--- Fold 2/10 ---
--- Fold 3/10 ---
--- Fold 4/10 ---
--- Fold 5/10 ---
--- Fold 6/10 ---
--- Fold 7/10 ---
--- Fold 8/10 ---
--- Fold 9/10 ---
--- Fold 10/10 ---

--- Training Meta-Model ---
Overall Stacked CV RMSE: 0.10848

--- Submission Complete ---
Submission file 'submission.csv' created successfully.
     Id      SalePrice
0  1461  122959.377239
1  1462  160642.627804
2  1463  183427.451442
3  1464  195920.706863
4  1465  188458.295944
