In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import RidgeCV, ElasticNetCV
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import skew
import optuna

# --- 1. Data Loading ---
def load_data():
    """Loads the training and testing datasets."""
    try:
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        print("Data loaded successfully.")
        return train_df, test_df
    except FileNotFoundError:
        return None, None

# --- 2. Preprocessing and Feature Engineering ---
def preprocess(train_df, test_df):
    """Handles missing values, feature engineering, and normalization."""
    print("Starting improved preprocessing...")
    test_ids = test_df['Id']
    train_df = train_df.drop('Id', axis=1)
    test_df = test_df.drop('Id', axis=1)

    train_df = train_df.drop(train_df[(train_df['GrLivArea']>4000) & (train_df['SalePrice']<300000)].index)
    train_df['SalePrice'] = np.log1p(train_df['SalePrice'])
    y = train_df['SalePrice']
    all_data = pd.concat((train_df.drop('SalePrice', axis=1), test_df))

    # Fill missing values
    for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
                'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'):
        all_data[col] = all_data[col].fillna('None')
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
                'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
        all_data[col] = all_data[col].fillna(0)
    for col in ('MSZoning', 'Utilities', 'Functional', 'Exterior1st', 'Exterior2nd',
                'KitchenQual', 'SaleType', 'Electrical'):
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median()))

    # Feature Engineering
    all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
    all_data['Total_Bathrooms'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] +
                                   all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])
    all_data['Total_Porch_SF'] = (all_data['OpenPorchSF'] + all_data['3SsnPorch'] +
                                  all_data['EnclosedPorch'] + all_data['ScreenPorch'] +
                                  all_data['WoodDeckSF'])
    all_data['YearBuilt_Age'] = all_data['YrSold'] - all_data['YearBuilt']
    all_data['YearRemod_Age'] = all_data['YrSold'] - all_data['YearRemodAdd']
    all_data['OverallQual_sq'] = all_data['OverallQual']**2
    all_data['TotalSF_sq'] = all_data['TotalSF']**2
    all_data['GrLivArea_sq'] = all_data['GrLivArea']**2
    all_data['OverallQual_x_TotalSF'] = all_data['OverallQual'] * all_data['TotalSF']
    all_data['GrLivArea_x_OverallQual'] = all_data['GrLivArea'] * all_data['OverallQual']

    # Skewness Transform
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    
    all_data = pd.get_dummies(all_data)

    X = all_data[:len(y)]
    X_test_competition = all_data[len(y):]

    # Normalize
    scaler = StandardScaler()
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
    X_test_competition = pd.DataFrame(scaler.transform(X_test_competition), index=X_test_competition.index, columns=X_test_competition.columns)
    
    return X, y, X_test_competition, test_ids

# --- 3. Hyperparameter Optimization with Optuna ---
def optimize_lgbm(trial, X, y):
    params = {
        'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'verbose': -1, 'n_jobs': -1, 'seed': 42
    }
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', callbacks=[lgb.early_stopping(50, verbose=False)])
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

def optimize_xgb(trial, X, y):
    params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000), # Let Optuna choose n_estimators
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'n_jobs': -1, 'seed': 42
    }
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model = xgb.XGBRegressor(**params)
    # MODIFICATION: Removed early_stopping_rounds due to persistent environment error
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

# --- 4. Main Stacking and Submission Function ---
def stacking_and_predict(X, y, X_test_competition, test_ids, lgb_params, xgb_params):
    print("\n--- Starting Final Stacking Ensemble with Optimized Parameters---")
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    oof_preds_lgb = np.zeros(X.shape[0])
    oof_preds_xgb = np.zeros(X.shape[0])
    oof_preds_cat = np.zeros(X.shape[0])
    test_preds_lgb = np.zeros(X_test_competition.shape[0])
    test_preds_xgb = np.zeros(X_test_competition.shape[0])
    test_preds_cat = np.zeros(X_test_competition.shape[0])

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        print(f"--- Fold {fold+1}/10 ---")
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        lgb_model = lgb.LGBMRegressor(**lgb_params)
        lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
        oof_preds_lgb[val_index] = lgb_model.predict(X_val)
        test_preds_lgb += lgb_model.predict(X_test_competition) / kf.n_splits

        xgb_model = xgb.XGBRegressor(**xgb_params)
        # MODIFICATION: Removed early_stopping_rounds
        xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
        oof_preds_xgb[val_index] = xgb_model.predict(X_val)
        test_preds_xgb += xgb_model.predict(X_test_competition) / kf.n_splits
        
        cat_model = CatBoostRegressor(iterations=2000, verbose=0, early_stopping_rounds=50, loss_function='RMSE', random_seed=42)
        cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))
        oof_preds_cat[val_index] = cat_model.predict(X_val)
        test_preds_cat += cat_model.predict(X_test_competition) / kf.n_splits

    X_meta_train = pd.DataFrame({'lgb': oof_preds_lgb, 'xgb': oof_preds_xgb, 'cat': oof_preds_cat})
    X_meta_test = pd.DataFrame({'lgb': test_preds_lgb, 'xgb': test_preds_xgb, 'cat': test_preds_cat})

    print("\n--- Training Meta-Model ---")
    meta_model = ElasticNetCV(cv=5, random_state=42)
    meta_model.fit(X_meta_train, y)
    
    oof_stacked_preds = meta_model.predict(X_meta_train)
    stacked_rmse = np.sqrt(mean_squared_error(y, oof_stacked_preds))
    print(f"Overall Stacked CV RMSE: {stacked_rmse:.5f}")

    final_predictions_log = meta_model.predict(X_meta_test)
    final_predictions = np.expm1(final_predictions_log)

    submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_predictions})
    submission.to_csv('submission.csv', index=False)
    
    print("\n--- Submission Complete ---")
    print("Submission file 'submission.csv' created successfully.")
    print(submission.head())

# --- Main Execution ---
if __name__ == '__main__':
    train_df, test_df = load_data()
    if train_df is not None and test_df is not None:
        X, y, X_test_competition, test_ids = preprocess(train_df, test_df)
        
        print("\n--- Stage 1: Optimizing Hyperparameters with Optuna ---")
        
        print("\nOptimizing LightGBM...")
        lgbm_study = optuna.create_study(direction='minimize')
        lgbm_study.optimize(lambda trial: optimize_lgbm(trial, X, y), n_trials=25) # Reduced trials for speed
        best_lgb_params = lgbm_study.best_params
        best_lgb_params['n_estimators'] = 2000
        best_lgb_params['verbose'] = -1
        best_lgb_params['n_jobs'] = -1
        best_lgb_params['seed'] = 42
        print(f"Best LightGBM Params Found: {best_lgb_params}")

        print("\nOptimizing XGBoost...")
        xgb_study = optuna.create_study(direction='minimize')
        # MODIFICATION: Reduced n_trials for speed, since early stopping is disabled
        xgb_study.optimize(lambda trial: optimize_xgb(trial, X, y), n_trials=15) 
        best_xgb_params = xgb_study.best_params
        best_xgb_params['n_jobs'] = -1
        best_xgb_params['seed'] = 42
        print(f"Best XGBoost Params Found: {best_xgb_params}")
        
        stacking_and_predict(X, y, X_test_competition, test_ids, best_lgb_params, best_xgb_params)

[I 2025-09-21 17:47:09,532] A new study created in memory with name: no-name-4cffa88c-6208-485e-8ea4-9af2dc811297


Data loaded successfully.
Starting improved preprocessing...

--- Stage 1: Optimizing Hyperparameters with Optuna ---

Optimizing LightGBM...


[I 2025-09-21 17:47:12,703] Trial 0 finished with value: 0.13014499497422152 and parameters: {'learning_rate': 0.009522805662926726, 'lambda_l1': 1.8659325751746594e-08, 'lambda_l2': 0.06737190377357355, 'num_leaves': 100, 'feature_fraction': 0.9587287102354394, 'bagging_fraction': 0.6128349067020851, 'bagging_freq': 6, 'min_child_samples': 25}. Best is trial 0 with value: 0.13014499497422152.
[I 2025-09-21 17:47:15,587] Trial 1 finished with value: 0.13019188623233222 and parameters: {'learning_rate': 0.04110241235912489, 'lambda_l1': 2.6827373081626433, 'lambda_l2': 2.435178881979817, 'num_leaves': 78, 'feature_fraction': 0.8629297889358947, 'bagging_fraction': 0.9318669982951457, 'bagging_freq': 3, 'min_child_samples': 22}. Best is trial 0 with value: 0.13014499497422152.
[I 2025-09-21 17:47:17,288] Trial 2 finished with value: 0.13224016505058742 and parameters: {'learning_rate': 0.030435983684446943, 'lambda_l1': 3.2160886691702995e-07, 'lambda_l2': 7.209411087602624e-07, 'num_lea

Best LightGBM Params Found: {'learning_rate': 0.01363970940339849, 'lambda_l1': 0.022459818133879955, 'lambda_l2': 0.04381030772057137, 'num_leaves': 88, 'feature_fraction': 0.8744013634331522, 'bagging_fraction': 0.7883799414516603, 'bagging_freq': 7, 'min_child_samples': 5, 'n_estimators': 2000, 'verbose': -1, 'n_jobs': -1, 'seed': 42}

Optimizing XGBoost...


[I 2025-09-21 17:48:46,854] Trial 0 finished with value: 0.13107895432995897 and parameters: {'n_estimators': 1061, 'learning_rate': 0.02724976352802264, 'lambda': 7.911429114896773e-07, 'alpha': 0.02387269325977189, 'max_depth': 8, 'subsample': 0.6333662593480134, 'colsample_bytree': 0.7851860309277903, 'min_child_weight': 8}. Best is trial 0 with value: 0.13107895432995897.
[I 2025-09-21 17:48:48,026] Trial 1 finished with value: 0.1204870605818815 and parameters: {'n_estimators': 892, 'learning_rate': 0.019580361247763918, 'lambda': 0.0009358113631177011, 'alpha': 7.717397233417971e-07, 'max_depth': 4, 'subsample': 0.6825062865734908, 'colsample_bytree': 0.6394373067466941, 'min_child_weight': 5}. Best is trial 1 with value: 0.1204870605818815.
[I 2025-09-21 17:48:49,701] Trial 2 finished with value: 0.12744174754129373 and parameters: {'n_estimators': 1226, 'learning_rate': 0.02983068472314354, 'lambda': 4.699907237914144, 'alpha': 1.45010861804612e-05, 'max_depth': 4, 'subsample':

Best XGBoost Params Found: {'n_estimators': 1033, 'learning_rate': 0.04233718891556497, 'lambda': 9.253921935324589e-07, 'alpha': 0.027041542303584815, 'max_depth': 3, 'subsample': 0.9029149951483717, 'colsample_bytree': 0.9569124626237102, 'min_child_weight': 2, 'n_jobs': -1, 'seed': 42}

--- Starting Final Stacking Ensemble with Optimized Parameters---
--- Fold 1/10 ---
--- Fold 2/10 ---
--- Fold 3/10 ---
--- Fold 4/10 ---
--- Fold 5/10 ---
--- Fold 6/10 ---
--- Fold 7/10 ---
--- Fold 8/10 ---
--- Fold 9/10 ---
--- Fold 10/10 ---

--- Training Meta-Model ---
Overall Stacked CV RMSE: 0.11259

--- Submission Complete ---
Submission file 'submission.csv' created successfully.
     Id      SalePrice
0  1461  123049.885827
1  1462  161340.383659
2  1463  179879.051712
3  1464  193523.175909
4  1465  183813.606115
