In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import skew
import optuna

# --- (Data Loading and Preprocessing functions are unchanged) ---
def load_data():
    """Loads the training and testing datasets."""
    try:
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        print("Data loaded successfully.")
        return train_df, test_df
    except FileNotFoundError:
        return None, None

def preprocess(train_df, test_df):
    """Handles missing values, feature engineering, and normalization."""
    print("Starting improved preprocessing...")
    test_ids = test_df['Id']
    train_df = train_df.drop('Id', axis=1)
    test_df = test_df.drop('Id', axis=1)

    train_df = train_df.drop(train_df[(train_df['GrLivArea']>4000) & (train_df['SalePrice']<300000)].index)
    train_df['SalePrice'] = np.log1p(train_df['SalePrice'])
    y = train_df['SalePrice']
    all_data = pd.concat((train_df.drop('SalePrice', axis=1), test_df))

    # Fill missing values
    for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
                'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'):
        all_data[col] = all_data[col].fillna('None')
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
                'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
        all_data[col] = all_data[col].fillna(0)
    for col in ('MSZoning', 'Utilities', 'Functional', 'Exterior1st', 'Exterior2nd',
                'KitchenQual', 'SaleType', 'Electrical'):
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median()))

    # Feature Engineering
    all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
    all_data['Total_Bathrooms'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] +
                                   all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])
    all_data['YearBuilt_Age'] = all_data['YrSold'] - all_data['YearBuilt']
    all_data['YearRemod_Age'] = all_data['YrSold'] - all_data['YearRemodAdd']
    all_data['OverallQual_sq'] = all_data['OverallQual']**2
    all_data['TotalSF_sq'] = all_data['TotalSF']**2
    all_data['OverallQual_x_TotalSF'] = all_data['OverallQual'] * all_data['TotalSF']

    # Skewness Transform
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    
    all_data = pd.get_dummies(all_data)

    X = all_data[:len(y)]
    X_test_competition = all_data[len(y):]

    # Normalize
    scaler = StandardScaler()
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
    X_test_competition = pd.DataFrame(scaler.transform(X_test_competition), index=X_test_competition.index, columns=X_test_competition.columns)
    
    return X, y, X_test_competition, test_ids

# --- 3. Hyperparameter Optimization (MODIFIED for more regularization) ---
def optimize_lgbm(trial, X, y):
    params = {
        'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.03),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.01, 20.0, log=True), # Increased lower bound
        'lambda_l2': trial.suggest_float('lambda_l2', 0.01, 20.0, log=True), # Increased lower bound
        'num_leaves': trial.suggest_int('num_leaves', 20, 50), # Reduced upper bound
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.9),
        'bagging_freq': 1,
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50), # Increased lower bound
        'verbose': -1, 'n_jobs': -1, 'seed': 42
    }
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', callbacks=[lgb.early_stopping(50, verbose=False)])
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

# --- 4. Main Stacking and Submission Function (MODIFIED for more folds) ---
def stacking_and_predict(X, y, X_test_competition, test_ids, lgb_params, xgb_params):
    print("\n--- Starting Final Stacking Ensemble with More Folds ---")
    kf = KFold(n_splits=15, shuffle=True, random_state=42) # Increased folds to 15

    # Placeholders for predictions
    oof_preds_lgb = np.zeros(X.shape[0])
    oof_preds_xgb = np.zeros(X.shape[0])
    test_preds_lgb = np.zeros(X_test_competition.shape[0])
    test_preds_xgb = np.zeros(X_test_competition.shape[0])

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        print(f"--- Fold {fold+1}/15 ---")
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        lgb_model = lgb.LGBMRegressor(**lgb_params)
        lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
        oof_preds_lgb[val_index] = lgb_model.predict(X_val)
        test_preds_lgb += lgb_model.predict(X_test_competition) / kf.n_splits

        # Use fixed, robust XGBoost params as it has environment issues
        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror', eval_metric='rmse', n_estimators=2000,
            learning_rate=0.01, max_depth=4, subsample=0.8,
            colsample_bytree=0.7, reg_alpha=0.01, reg_lambda=0.1,
            random_state=42, n_jobs=-1
        )
        try:
            xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
        except TypeError:
            xgb_model.fit(X_train, y_train) # Fallback for your environment

        oof_preds_xgb[val_index] = xgb_model.predict(X_val)
        test_preds_xgb += xgb_model.predict(X_test_competition) / kf.n_splits

    # Create meta-features
    X_meta_train = pd.DataFrame({'lgb': oof_preds_lgb, 'xgb': oof_preds_xgb})
    X_meta_test = pd.DataFrame({'lgb': test_preds_lgb, 'xgb': test_preds_xgb})

    print("\n--- Training Meta-Model ---")
    meta_model = ElasticNetCV(cv=5, random_state=42)
    meta_model.fit(X_meta_train, y)
    
    oof_stacked_preds = meta_model.predict(X_meta_train)
    stacked_rmse = np.sqrt(mean_squared_error(y, oof_stacked_preds))
    print(f"Overall Stacked CV RMSE: {stacked_rmse:.5f}")

    # Make final predictions
    final_predictions_log = meta_model.predict(X_meta_test)
    final_predictions = np.expm1(final_predictions_log)

    # Create submission file
    submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_predictions})
    submission.to_csv('submission.csv', index=False)
    
    print("\n--- Submission Complete ---")
    print(submission.head())

# --- Main Execution ---
if __name__ == '__main__':
    train_df, test_df = load_data()
    if train_df is not None and test_df is not None:
        X, y, X_test_competition, test_ids = preprocess(train_df, test_df)
        
        # STAGE 1: Hyperparameter Optimization for LightGBM
        print("\n--- Stage 1: Optimizing Hyperparameters for LightGBM ---")
        lgbm_study = optuna.create_study(direction='minimize')
        lgbm_study.optimize(lambda trial: optimize_lgbm(trial, X, y), n_trials=50)
        best_lgb_params = lgbm_study.best_params
        best_lgb_params['n_estimators'] = 3000 # Increase estimators for final run
        best_lgb_params['verbose'] = -1
        best_lgb_params['n_jobs'] = -1
        best_lgb_params['seed'] = 42
        print(f"Best LightGBM Params Found: {best_lgb_params}")

        # STAGE 2: Run Stacking
        # We use a fixed, robust XGBoost model due to environment issues and tune LGBM
        stacking_and_predict(X, y, X_test_competition, test_ids, best_lgb_params, None)

[I 2025-09-21 17:56:00,689] A new study created in memory with name: no-name-2b1445e6-4805-47bc-9935-12b388821e20


Data loaded successfully.
Starting improved preprocessing...

--- Stage 1: Optimizing Hyperparameters for LightGBM ---


[I 2025-09-21 17:56:01,970] Trial 0 finished with value: 0.1280662933317549 and parameters: {'learning_rate': 0.027921887246167586, 'lambda_l1': 0.03928927197758212, 'lambda_l2': 1.982061118526043, 'num_leaves': 31, 'feature_fraction': 0.7255741665678893, 'bagging_fraction': 0.736385356903716, 'min_child_samples': 29}. Best is trial 0 with value: 0.1280662933317549.
[I 2025-09-21 17:56:03,721] Trial 1 finished with value: 0.13092726403757762 and parameters: {'learning_rate': 0.013204281154950922, 'lambda_l1': 0.904641021223619, 'lambda_l2': 14.729954629102162, 'num_leaves': 33, 'feature_fraction': 0.8755002799993742, 'bagging_fraction': 0.6037624154910486, 'min_child_samples': 34}. Best is trial 0 with value: 0.1280662933317549.
[I 2025-09-21 17:56:05,847] Trial 2 finished with value: 0.13047906773794737 and parameters: {'learning_rate': 0.012885768120485919, 'lambda_l1': 3.2283815250373595, 'lambda_l2': 2.6899901376965025, 'num_leaves': 46, 'feature_fraction': 0.8568519661406544, 'bag

Best LightGBM Params Found: {'learning_rate': 0.017743378249939056, 'lambda_l1': 0.014210086356194328, 'lambda_l2': 0.04551277245867796, 'num_leaves': 47, 'feature_fraction': 0.5442289379185887, 'bagging_fraction': 0.743825902609032, 'min_child_samples': 17, 'n_estimators': 3000, 'verbose': -1, 'n_jobs': -1, 'seed': 42}

--- Starting Final Stacking Ensemble with More Folds ---
--- Fold 1/15 ---
--- Fold 2/15 ---
--- Fold 3/15 ---
--- Fold 4/15 ---
--- Fold 5/15 ---
--- Fold 6/15 ---
--- Fold 7/15 ---
--- Fold 8/15 ---
--- Fold 9/15 ---
--- Fold 10/15 ---
--- Fold 11/15 ---
--- Fold 12/15 ---
--- Fold 13/15 ---
--- Fold 14/15 ---
--- Fold 15/15 ---

--- Training Meta-Model ---
Overall Stacked CV RMSE: 0.11462

--- Submission Complete ---
     Id      SalePrice
0  1461  124099.107503
1  1462  162556.675566
2  1463  181032.675689
3  1464  192295.431640
4  1465  182103.821723
