In [29]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
from housing_prices import feature_engineering

In [41]:
subdirectory = Path('housing-prices')
df_train = pd.read_csv(subdirectory / 'train.csv')
df_submit = pd.read_csv(subdirectory / 'test.csv')
df_train_fe = feature_engineering(df_train)
df_submit_fe = feature_engineering(df_submit)
df_train.dtypes.value_counts()
df_train.select_dtypes(include=['float', 'float64', 'float32']).columns.tolist()

# Print the list

Feature engineering housing transactions: rows 1460, columns before 81, columns after: 136
Feature engineering housing transactions: rows 1459, columns before 80, columns after: 135


['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [40]:
X = df_train_fe.drop(['SalePrice','Id'], axis=1)
y = df_train_fe['SalePrice']
ids = df_train_fe['Id']

X_submit = df_submit_fe.drop(['Id'], axis=1)
ids_submit = df_submit_fe['Id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(exclude=['object']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns),
        #('cat', OrdinalEncoder(), categorical_columns)
    ],
    remainder='passthrough'
)

fit_params = {}
if categorical_columns:
    # After preprocessing, categorical columns will be at positions after numerical columns
    # We need to indicate these positions to LightGBM
    categorical_indices = list(range(len(numerical_columns), len(numerical_columns) + len(categorical_columns)))
    fit_params = {
        'regressor__categorical_feature': categorical_indices
    }


feature_selector = RFECV(
    estimator=Ridge(alpha=1.0, random_state=42),
    step=0.1,
    cv=3,
    scoring='neg_mean_squared_error',
    min_features_to_select=5,
    n_jobs=1,
    verbose=0
)

lgbm_model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    importance_type='gain',
    verbose=-1,
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('regressor', lgbm_model)
])

param_distributions = {
    'regressor__n_estimators': [500],  # Fixed with early stopping
    'regressor__learning_rate': np.logspace(-3, -1, 10),  # Log scale from 0.001 to 0.1
    'regressor__num_leaves': np.arange(7, 127, 10),  # Powers of 2 minus 1 are optimal
    'regressor__max_depth': [-1, 5, 10, 15],
    'regressor__min_child_samples': [5, 10, 20, 50, 100],
    'regressor__subsample': np.linspace(0.6, 1.0, 5),
    'regressor__colsample_bytree': np.linspace(0.6, 1.0, 5),
    'regressor__reg_alpha': np.logspace(-3, 1, 5),  # Log scale for regularization
    'regressor__reg_lambda': np.logspace(-3, 1, 5),
}

grid_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=30,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    return_train_score=True,
    refit=True
)

In [16]:
pipeline.fit(X_train, y_train, **fit_params)

In [17]:
%%capture captured_output
# grid_search.fit(X, y, **fit_params)

In [18]:
y_pred = pipeline.predict(X_test)
y_submit = pipeline.predict(X_submit)



In [19]:
# y_pred = grid_search.predict(X_test)
# y_submit = grid_search.predict(X_submit)

In [20]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Squared Error: 812550280.1914
Root Mean Squared Error: 28505.2676
R² Score: 0.8941


In [21]:
def name_feature_importances(feature_importances, feature_names):
    features_with_importance = list(zip(feature_names, feature_importances))
    print("No of features:", len(features_with_importance))
    return sorted(features_with_importance, key=lambda x: x[1], reverse=True)

In [22]:
#fe_imp = grid_search.best_estimator_.named_steps['regressor'].feature_importances_
fe_imp = pipeline.named_steps['regressor'].feature_importances_
name_feature_importances(fe_imp, X_train.columns)

No of features: 133


[('Neighborhood', np.float64(34798072367598.0)),
 ('BsmtFinSF1', np.float64(8879049500904.0)),
 ('BsmtFullBath', np.float64(5074181170792.0)),
 ('YearRemodAdd', np.float64(2721156800228.0)),
 ('YearBuilt', np.float64(2247454841758.0)),
 ('Utilities_has_gas', np.float64(1944880381714.0)),
 ('RoofStyle', np.float64(1546900212314.0)),
 ('LotArea', np.float64(1544715322552.0)),
 ('OverallQual', np.float64(1189570101040.0)),
 ('LandSlope_is_gentle', np.float64(1177810494412.0)),
 ('Utilities_has_sewer', np.float64(1083579012544.0)),
 ('BsmtHalfBath', np.float64(847182698680.0)),
 ('2ndFlrSF', np.float64(635909794992.0)),
 ('RoofMatl', np.float64(622893544754.0)),
 ('BldgType', np.float64(592451985330.0)),
 ('LandSlope_is_severe', np.float64(572175492432.0)),
 ('HalfBath', np.float64(519608735230.0)),
 ('GrLivArea', np.float64(440993153342.0)),
 ('LotFrontage', np.float64(395626034642.0)),
 ('OverallCond', np.float64(359984013832.0)),
 ('BsmtUnfSF', np.float64(345683977792.0)),
 ('LandSlope_

In [None]:
submission = pd.DataFrame({
    'Id': ids_submit,
    'SalePrice': y_submit
})
submission.to_csv(subdirectory / 'housing-submission.csv', index=False)