In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file found anywhere beneath the
# Kaggle input directory, then echo each path on its own line.
input_files = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Import and configuration

# import numpy as np
# import pandas as pd
import xgboost as xgb
import joblib
from pathlib import Path
from xgboost import XGBRegressor
from IPython.display import display
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

from house_pricing_preprocessing import (
    load_data,
    create_features,
    score_dataset
)

# 1. Load data
# load_data() is imported from the project-local house_pricing_preprocessing
# module and yields the train and test frames unpacked below.
print("Loading data...")
df_train, df_test = load_data()
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")

# Names of the object/category-typed columns in the raw training frame.
# NOTE(review): not referenced again in the visible cells — kept in case a
# later cell reads it; remove if confirmed unused.
df_train_cat_cols = df_train.select_dtypes(include=['object', 'category']).columns

# 2. Create features
# The target is taken straight from the raw training frame, so it must stay
# row-aligned with the engineered feature matrix.
X_train, X_test = create_features(df_train, df_test)
y_train = df_train.loc[:, "SalePrice"]
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

# Sanity checks on invariants that every downstream fit/transform relies on.
# Failing here is far easier to diagnose than a shape or feature-name error
# raised from inside a model.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)}"
)
assert list(X_train.columns) == list(X_test.columns), (
    "X_train and X_test must expose the same feature columns in the same order"
)

# 3. Score with cross-validation (before full training)
# A copy is passed so score_dataset cannot modify X_train used for training below.
print("\nEvaluating with cross-validation...")
score = score_dataset(X_train.copy(), y_train)
print(f"Cross-validation RMSLE: {score:.5f}")

# Label encode for XGBoost: replace every categorical column with its integer
# category codes (pandas assigns -1 to missing values in a categorical).
X_train_encoded = X_train.copy()
for col in X_train_encoded.select_dtypes(["category"]):
    X_train_encoded[col] = X_train_encoded[col].cat.codes

# Fill the remaining NaN values with a sentinel for the models that don't
# handle them (e.g. Ridge rejects NaN input).
X_train_encoded = X_train_encoded.fillna(-1)

# Every model is fit on log(SalePrice); compute the transformed target once
# instead of once per model, and fail loudly if the log would be undefined.
assert (y_train > 0).all(), "SalePrice must be strictly positive to take its log"
y_train_log = np.log(y_train)

# Shared seed so the two boosted models are reproducible across re-runs.
RANDOM_STATE = 42

# 4. Train multiple models and blend predictions
print("\nTraining ensemble...")
# Model 4.1: XGBoost
xgb_model = XGBRegressor(
    n_estimators=1500, learning_rate=0.03, max_depth=4,
    min_child_weight=3, subsample=0.7, colsample_bytree=0.7,
    reg_alpha=0.1, reg_lambda=1.0, random_state=RANDOM_STATE, n_jobs=-1
)
xgb_model.fit(X_train_encoded, y_train_log)

# Model 4.2: Gradient Boosting
gb_model = GradientBoostingRegressor(
    n_estimators=1000, learning_rate=0.03, max_depth=4,
    min_samples_leaf=15, random_state=RANDOM_STATE
)
gb_model.fit(X_train_encoded, y_train_log)

# Model 4.3: Ridge (needs scaling)
# The scaler is fit on the training matrix only; the same fitted scaler must
# be reused (transform, not fit_transform) on the test matrix.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)

ridge_model = Ridge(alpha=10)
ridge_model.fit(X_train_scaled, y_train_log)

print("Model trained!")

# 5. Blend predictions (weighted average)
print("\nMaking predictions...")

# Label encode the test features. Each categorical column is first cast to the
# *training* column's categorical dtype so that a given integer code refers to
# the same level in both frames; levels unseen in training become -1. When the
# two category lists already match, this cast changes nothing.
X_test_encoded = X_test.copy()
for col in X_test_encoded.select_dtypes(["category"]):
    train_dtype = X_train[col].dtype
    if isinstance(train_dtype, pd.CategoricalDtype):
        X_test_encoded[col] = X_test_encoded[col].astype(train_dtype)
    X_test_encoded[col] = X_test_encoded[col].cat.codes

# Fill the remaining NaN values with the same sentinel used for training.
X_test_encoded = X_test_encoded.fillna(-1)

# Reuse the scaler fitted on the training matrix (transform only).
X_test_scaled = scaler.transform(X_test_encoded)

# Each model was trained on log(SalePrice), so these are log-price predictions.
pred_xgb = xgb_model.predict(X_test_encoded)
pred_gb = gb_model.predict(X_test_encoded)
pred_ridge = ridge_model.predict(X_test_scaled)

# Weighted blend, averaged in log space and mapped back with exp().
BLEND_WEIGHTS = {"xgb": 0.5, "gb": 0.3, "ridge": 0.2}
assert abs(sum(BLEND_WEIGHTS.values()) - 1.0) < 1e-9, "blend weights must sum to 1"
final_predictions = np.exp(
    BLEND_WEIGHTS["xgb"] * pred_xgb
    + BLEND_WEIGHTS["gb"] * pred_gb
    + BLEND_WEIGHTS["ridge"] * pred_ridge
)
assert np.isfinite(final_predictions).all(), "blended predictions contain NaN or inf"

# 6. Create submission file
# The test frame's index supplies the Id column of the submission.
submission = pd.DataFrame({
    "Id": X_test.index,
    "SalePrice": final_predictions
})

submission.to_csv("submission.csv", index=False)
print(f"\nSubmission saved! Shape: {submission.shape}")
print(submission.head())

# 7. Save all models together
# print("\nSaving models...")

# # Option 1: Save as a dictionary (RECOMMENDED)
# ensemble = {
#     "xgb_model": xgb_model,
#     "gb_model": gb_model,
#     "ridge_model": ridge_model,
#     "scaler": scaler,
#     "weights": {"xgb": 0.5, "gb": 0.3, "ridge": 0.2},
#     "feature_names": X_train_encoded.columns.tolist(),
# }
# joblib.dump(ensemble, "house_pricing_prediction/ensemble_model.joblib")
# print("Ensemble saved to ensemble_model.joblib")

# # Option 2: Save models separately
# joblib.dump(xgb_model, "house_pricing_prediction/xgb_model.joblib")
# joblib.dump(gb_model, "house_pricing_prediction/gb_model.joblib")
# joblib.dump(ridge_model, "house_pricing_prediction/ridge_model.joblib")
# joblib.dump(scaler, "house_pricing_prediction/scaler.joblib")
# print("Individual models saved!")

Loading data...
Train shape: (1460, 80)
Test shape: (1459, 80)
X_train shape: (1460, 129)
X_test shape: (1459, 129)

Evaluating with cross-validation...
Cross-validation RMSLE: 0.13455

Training ensemble...
Model trained!

Making predictions...

Submission saved! Shape: (1459, 2)
     Id      SalePrice
0  1461  119897.295867
1  1462  163718.351397
2  1463  182901.372399
3  1464  200457.324880
4  1465  191656.467923
