## 1. Imports

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

## 2. Data Loading

In [13]:
train_df = pd.read_csv('/content/data/train.csv')
test_df = pd.read_csv('/content/data/test.csv')
train_df.shape, test_df.shape

((1460, 81), (1459, 80))

## 3. Data Preprocessing

In [14]:
# Combine train and test data for unified preprocessing
train_df['source'] = 'train'
test_df['source'] = 'test'
combined = pd.concat([train_df, test_df], ignore_index=True)
combined.drop(['Id'], axis=1, inplace=True)

In [15]:
# Handle missing values (example with common strategies)
num_cols = combined.select_dtypes(include=['int64', 'float64']).columns
cat_cols = combined.select_dtypes(include=['object']).columns

# Fill numeric with median, categorical with mode
for col in num_cols:
    combined[col].fillna(combined[col].median(), inplace=True)
for col in cat_cols:
    combined[col].fillna(combined[col].mode()[0], inplace=True)

## 4. Feature Engineering

In [16]:
# Example feature engineering: total area
combined['TotalSF'] = combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF']

## 5. Encoding and Scaling

In [23]:
# The feature lists will be redefined later, right before the preprocessor is
# created. %% Redefine numeric and categorical after engineering

In [24]:
# Redefine numeric and categorical after engineering
num_features = combined.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = combined.select_dtypes(include=['object']).columns.tolist()
num_features.remove('SalePrice') if 'SalePrice' in num_features else None
cat_features.remove('source') if 'source' in cat_features else None

# Column transformer setup
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

## 6. Splitting Data

In [19]:
# Re-split the data
train_processed = combined[combined['source'] == 'train'].drop(['source'], axis=1)
test_processed = combined[combined['source'] == 'test'].drop(['source', 'SalePrice'], axis=1)

X = train_processed.drop('SalePrice', axis=1)
y = train_processed['SalePrice']

## 7. Model Training

In [25]:
# Pipeline and model definition
models = {
    'Ridge': Ridge(alpha=10),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42)
}
scores = {}
for name, model in models.items():
    pipe = make_pipeline(preprocessor, model)
    score = cross_val_score(pipe, X, y, scoring='neg_root_mean_squared_error', cv=5).mean()
    scores[name] = -score
scores

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3423
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 182
[LightGBM] [Info] Start training from score 180717.091610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3454
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 178
[LightGBM] [Info] Start training from score 180407.575342
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is

{'Ridge': np.float64(31075.377025688922),
 'RandomForest': np.float64(29196.087134112684),
 'XGBoost': np.float64(28208.292799855513),
 'LightGBM': np.float64(28082.196440563053)}

## 8. Stacking and Final Model

In [21]:
# Use best models for stacking
stack = StackingRegressor(
    estimators=[
        ('xgb', XGBRegressor(n_estimators=100, random_state=42)),
        ('lgbm', lgb.LGBMRegressor(n_estimators=100, random_state=42))
    ],
    final_estimator=Ridge(alpha=10)
)
final_model = make_pipeline(preprocessor, stack)
final_model.fit(X, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3627
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 188
[LightGBM] [Info] Start training from score 180921.195890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000807 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3423
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 182
[LightGBM] [Info] Start training from score 180717.091610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is

## 9. Submission

In [22]:
preds = final_model.predict(test_processed)
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': preds
})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,123868.187521
1,1462,152624.425819
2,1463,193069.159051
3,1464,193254.514059
4,1465,183191.936447
