In [9]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import skew

# Tell the user which third-party gradient-boosting libraries this script
# actually imports and trains with (it does no plotting).
print("Note: This script uses LightGBM and XGBoost.")


# --- 1. Data Loading ---
def load_data():
    """Read ``train.csv`` and ``test.csv`` from the current working directory.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames, or ``(None, None)``
        when either file is missing (a message is printed in that case).
    """
    frames = []
    # Read in the same order as before: train first, then test.
    for csv_name in ('train.csv', 'test.csv'):
        try:
            frames.append(pd.read_csv(csv_name))
        except FileNotFoundError:
            print("Error: train.csv or test.csv not found.")
            print("Please ensure the data files are in the same directory as the script.")
            return None, None

    # Only reached when both files were parsed.
    print("Data loaded successfully.")
    training_frame, testing_frame = frames
    return training_frame, testing_frame

# --- 2. Preprocessing and Feature Engineering ---
def preprocess(train_df, test_df):
    """Impute, engineer, encode and scale the train/test feature tables.

    Steps, in order: drop the ``Id`` column, remove training rows with
    ``GrLivArea > 4000`` and ``SalePrice < 300000``, log1p-transform the
    target, stack train and test features, fill missing values, add
    engineered features, log1p-transform skewed numeric features, one-hot
    encode, and standardize with a scaler fitted on the training rows only.

    Args:
        train_df: Training DataFrame; must contain ``Id``, ``SalePrice`` and
            every feature column referenced below.
        test_df: Test DataFrame with the same feature columns and ``Id``.

    Returns:
        ``(X, y, X_test_competition, test_ids)`` where ``X`` and
        ``X_test_competition`` are scaled DataFrames, ``y`` is
        ``log1p(SalePrice)`` for the retained training rows, and ``test_ids``
        is the ``Id`` column of ``test_df``.
    """
    print("Starting improved preprocessing...")
    test_ids = test_df['Id']
    train_df = train_df.drop('Id', axis=1)
    test_df = test_df.drop('Id', axis=1)

    # Very large living area sold at a low price: treated as outliers.
    outlier_mask = (train_df['GrLivArea'] > 4000) & (train_df['SalePrice'] < 300000)
    train_df = train_df.drop(train_df[outlier_mask].index)
    print(f"Removed outliers. New train shape: {train_df.shape}")

    # The target is modelled on the log1p scale (inverted with expm1 later).
    y = np.log1p(train_df['SalePrice'])
    n_train = len(y)

    # Train rows come first, so positional slicing can split them back out.
    all_data = pd.concat((train_df.drop('SalePrice', axis=1), test_df))
    print(f"Combined data shape: {all_data.shape}")

    # Fill missing values...
    # Categorical columns where a missing value is filled with the label 'None'.
    for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
                'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'):
        all_data[col] = all_data[col].fillna('None')
    # Numeric columns where a missing value is filled with zero.
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
                'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
        all_data[col] = all_data[col].fillna(0)
    # Remaining categoricals are filled with their most frequent value.
    for col in ('MSZoning', 'Utilities', 'Functional', 'Exterior1st', 'Exterior2nd',
                'KitchenQual', 'SaleType', 'Electrical'):
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
    # Lot frontage is imputed with the median of the same neighborhood.
    all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median()))

    # Feature Engineering...
    print("Creating new features...")
    all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
    all_data['Total_Bathrooms'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] +
                                   all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])
    all_data['Total_Porch_SF'] = (all_data['OpenPorchSF'] + all_data['3SsnPorch'] +
                                  all_data['EnclosedPorch'] + all_data['ScreenPorch'] +
                                  all_data['WoodDeckSF'])
    all_data['YearBuilt_Age'] = all_data['YrSold'] - all_data['YearBuilt']
    all_data['YearRemod_Age'] = all_data['YrSold'] - all_data['YearRemodAdd']
    # 0/1 presence flags; a vectorized comparison replaces the per-row lambda
    # (NaN compares False, so it still maps to 0).
    for flag_name, source_col in (('haspool', 'PoolArea'),
                                  ('has2ndfloor', '2ndFlrSF'),
                                  ('hasgarage', 'GarageArea'),
                                  ('hasbsmt', 'TotalBsmtSF'),
                                  ('hasfireplace', 'Fireplaces')):
        all_data[flag_name] = (all_data[source_col] > 0).astype('int64')

    # Skewness Transform...
    # Select by numeric kind rather than "not object": string, category or
    # bool columns are also non-object and would break the skew computation.
    numeric_feats = all_data.select_dtypes(include=[np.number]).columns
    skewness = all_data[numeric_feats].apply(lambda col: skew(col.dropna()))
    skewed_feats = skewness[skewness > 0.75].index
    # log1p is only finite for values > -1. Engineered ages can be negative
    # when YrSold precedes YearBuilt/YearRemodAdd, so such columns are left
    # untransformed instead of turning into -inf/NaN.
    loggable_feats = [col for col in skewed_feats if all_data[col].min() > -1]
    if loggable_feats:
        all_data[loggable_feats] = np.log1p(all_data[loggable_feats])
    print(f"Applied log transformation to {len(loggable_feats)} skewed features.")

    # One-Hot Encoding...
    all_data = pd.get_dummies(all_data)

    # Split the stacked frame back into train and test rows by position.
    X = all_data.iloc[:n_train]
    X_test_competition = all_data.iloc[n_train:]

    # Normalize...
    print("Normalizing data using StandardScaler...")
    # Fit on the training rows only, then apply the same scaling to both sets.
    scaler = StandardScaler()
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
    X_test_competition = pd.DataFrame(
        scaler.transform(X_test_competition),
        index=X_test_competition.index,
        columns=X_test_competition.columns,
    )

    print("Preprocessing complete.")
    return X, y, X_test_competition, test_ids

# --- 3. Hyperparameter Tuning ---
def find_best_lambda(X, y, lambdas=None):
    """Pick the L2 regularization strength with the lowest blended validation RMSE.

    The data is split 80/20 (``random_state=42``). For every candidate value a
    LightGBM and an XGBoost regressor are trained with early stopping on the
    validation part, their predictions are blended 0.7/0.3, and the RMSE of the
    blend on the validation part is recorded.

    Note that the same validation split drives both early stopping and the
    selection, so the reported RMSE is an optimistic estimate.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        lambdas: Optional iterable of candidate L2 values. Defaults to the
            built-in grid ``[0.2, 0.5, 0.7, 0.9, 1, 1.2, 1.5, 2, 3, 5]``.

    Returns:
        The candidate value with the smallest blended validation RMSE.

    Raises:
        ValueError: If ``lambdas`` is given but empty.
    """
    if lambdas is None:
        lambdas = [0.2, 0.5, 0.7, 0.9, 1, 1.2, 1.5, 2, 3, 5]
    else:
        lambdas = list(lambdas)
        if not lambdas:
            raise ValueError("lambdas must contain at least one candidate value.")

    # Create an 80-20 train-validation split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    print("\n--- Stage 1: Finding Best Lambda using 80-20 Split ---")

    results = {}

    for l2_lambda in lambdas:
        # Train LightGBM Model
        lgb_model = lgb.LGBMRegressor(
            objective='regression', metric='rmse', n_estimators=2000,
            learning_rate=0.015, feature_fraction=0.8, bagging_fraction=0.8,
            bagging_freq=1, lambda_l1=0.1, lambda_l2=l2_lambda,
            num_leaves=50, min_data_in_leaf=20, max_depth=6,
            verbose=-1, n_jobs=-1, seed=42
        )
        lgb_model.fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='rmse',
                      callbacks=[lgb.early_stopping(50, verbose=False)])

        # Train XGBoost Model
        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror', eval_metric='rmse', n_estimators=2000,
            learning_rate=0.015, max_depth=5, subsample=0.8,
            colsample_bytree=0.8, reg_lambda=l2_lambda, reg_alpha=0.1,
            min_child_weight=3, random_state=42, n_jobs=-1
        )
        try:
            # Older XGBoost releases take early stopping as a fit() keyword.
            xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                          early_stopping_rounds=50, verbose=False)
        except TypeError:
            # Newer releases reject that keyword; early stopping is configured
            # on the estimator instead. Keep it enabled rather than silently
            # training all 2000 rounds without a stopping criterion.
            xgb_model.set_params(early_stopping_rounds=50)
            xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        # Evaluate on the validation set
        preds_lgb = lgb_model.predict(X_val)
        preds_xgb = xgb_model.predict(X_val)
        blended_preds = 0.7 * preds_lgb + 0.3 * preds_xgb

        results[l2_lambda] = np.sqrt(mean_squared_error(y_val, blended_preds))
        print(f"Blended Val RMSE for Lambda {l2_lambda}: {results[l2_lambda]:.5f}")

    # Find the best lambda
    best_lambda = min(results, key=results.get)
    print("\n--- Hyperparameter Tuning Complete ---")
    print(f"Best L2 Lambda found: {best_lambda} (Val RMSE: {results[best_lambda]:.5f})")

    return best_lambda

# --- 4. Final Training and Submission ---
def create_submission(X, y, X_test_competition, test_ids, best_lambda):
    """Train fold models with the chosen lambda and write ``submission.csv``.

    A shuffled 10-fold split (``random_state=42``) is run over ``X``/``y``. In
    each fold a LightGBM and an XGBoost regressor are trained on the training
    part with early stopping on the held-out part, and their predictions for
    ``X_test_competition`` are accumulated as a per-library average across
    folds. The two averages are blended 0.7/0.3, mapped back with ``expm1``,
    and written together with ``test_ids`` to ``submission.csv``.

    Args:
        X: Training feature DataFrame (indexed positionally via ``iloc``).
        y: Training target Series on the log1p scale.
        X_test_competition: Feature matrix to predict for.
        test_ids: Identifiers written to the ``Id`` column.
        best_lambda: L2 regularization value used for both libraries.

    Returns:
        None. The submission is written to disk and its head is printed.
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    n_folds = kf.n_splits

    print("\n--- Stage 2: Retraining on 100% of Data and Creating Submission ---")
    print(f"Using best L2 Lambda = {best_lambda} and {n_folds}-Fold Cross-Validation.")

    lgb_predictions = np.zeros(X_test_competition.shape[0])
    xgb_predictions = np.zeros(X_test_competition.shape[0])

    for fold_number, (train_index, val_index) in enumerate(kf.split(X, y), start=1):
        print(f"--- Fold {fold_number}/{n_folds} ---")
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # --- a) Final LightGBM Model ---
        lgb_model = lgb.LGBMRegressor(
            objective='regression', metric='rmse', n_estimators=2000,
            learning_rate=0.015, feature_fraction=0.8, bagging_fraction=0.8,
            bagging_freq=1, lambda_l1=0.1, lambda_l2=best_lambda,  # Use best lambda
            num_leaves=50, min_data_in_leaf=20, max_depth=6,
            verbose=-1, n_jobs=-1, seed=42
        )
        lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                      eval_metric='rmse', callbacks=[lgb.early_stopping(50, verbose=False)])
        # Each fold contributes an equal share of the averaged prediction.
        lgb_predictions += lgb_model.predict(X_test_competition) / n_folds

        # --- b) Final XGBoost Model ---
        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror', eval_metric='rmse', n_estimators=2000,
            learning_rate=0.015, max_depth=5, subsample=0.8,
            colsample_bytree=0.8, reg_lambda=best_lambda, reg_alpha=0.1,  # Use best lambda
            min_child_weight=3, random_state=42, n_jobs=-1
        )
        try:
            # Older XGBoost releases take early stopping as a fit() keyword.
            xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                          early_stopping_rounds=50, verbose=False)
        except TypeError:
            # Newer releases reject that keyword; early stopping is configured
            # on the estimator instead. Keep it enabled rather than silently
            # training all 2000 rounds without a stopping criterion.
            xgb_model.set_params(early_stopping_rounds=50)
            xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        xgb_predictions += xgb_model.predict(X_test_competition) / n_folds

    # Blend final predictions
    blended_predictions = 0.7 * lgb_predictions + 0.3 * xgb_predictions
    # Undo the log1p applied to the target during preprocessing.
    final_predictions = np.expm1(blended_predictions)

    # Create submission dataframe
    submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_predictions})
    submission.to_csv('submission.csv', index=False)

    print("\n--- Submission Complete ---")
    print("Submission file 'submission.csv' created successfully.")
    print(submission.head())

# --- Main Execution ---
if __name__ == '__main__':
    train_df, test_df = load_data()

    # load_data() signals a missing file by returning None for both frames;
    # in that case there is nothing to run.
    if not (train_df is None or test_df is None):
        X, y, X_test_competition, test_ids = preprocess(train_df, test_df)

        # Stage 1: choose the L2 strength on a single 80-20 hold-out split.
        best_l2_lambda = find_best_lambda(X, y)

        # Stage 2: train the fold models with that value and write the
        # submission file.
        create_submission(X, y, X_test_competition, test_ids, best_l2_lambda)

Note: This script uses XGBoost, Matplotlib, and Seaborn.
Data loaded successfully.
Starting improved preprocessing...
Removed outliers. New train shape: (1458, 80)
Combined data shape: (2917, 79)
Creating new features...
Applied log transformation to 23 skewed features.
Normalizing data using StandardScaler...
Preprocessing complete.

--- Stage 1: Finding Best Lambda using 80-20 Split ---
Blended Val RMSE for Lambda 0.2: 0.12585
Blended Val RMSE for Lambda 0.5: 0.12630
Blended Val RMSE for Lambda 0.7: 0.12597
Blended Val RMSE for Lambda 0.9: 0.12567
Blended Val RMSE for Lambda 1: 0.12586
Blended Val RMSE for Lambda 1.2: 0.12583
Blended Val RMSE for Lambda 1.5: 0.12581
Blended Val RMSE for Lambda 2: 0.12634
Blended Val RMSE for Lambda 3: 0.12670
Blended Val RMSE for Lambda 5: 0.12679

--- Hyperparameter Tuning Complete ---
Best L2 Lambda found: 0.9 (Val RMSE: 0.12567)

--- Stage 2: Retraining on 100% of Data and Creating Submission ---
Using best L2 Lambda = 0.9 and 10-Fold Cross-Valida