In [189]:
!pip install lightgbm

import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error

import lightgbm as lgb




train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

y = np.log1p(train["SalePrice"])
X = train.drop("SalePrice", axis=1)



num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns
print(len(num_cols), len(cat_cols))




num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("power", PowerTransformer(method="yeo-johnson"))
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])



lgb_model = lgb.LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.01,
    max_depth=-1,
    num_leaves=32,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)



model = Pipeline([
    ("preprocess", preprocessor),
    ("lgbm", lgb_model)
])





kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[val_idx]

    model.fit(X_tr, y_tr)
    preds = model.predict(X_va)

    rmse = np.sqrt(mean_squared_error(y_va, preds))
    rmse_scores.append(rmse)

    print(f"Fold {fold} RMSE: {rmse:.4f}")

print("\nMean CV RMSE:", np.mean(rmse_scores))



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\ASUS\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 11.6 MB/s  0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
37 43
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3483
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 194
[LightGBM] [Info] Start training from score 12.030658




Fold 1 RMSE: 0.1401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3482
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 197
[LightGBM] [Info] Start training from score 12.016898




Fold 2 RMSE: 0.1188
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001648 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3469
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 193
[LightGBM] [Info] Start training from score 12.022759




Fold 3 RMSE: 0.1626
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3476
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 195
[LightGBM] [Info] Start training from score 12.027933




Fold 4 RMSE: 0.1317
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3477
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 192
[LightGBM] [Info] Start training from score 12.022040
Fold 5 RMSE: 0.1105

Mean CV RMSE: 0.13275524605368377




In [190]:
model.fit(X, y)
test_preds = np.expm1(model.predict(test))
submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": test_preds
})

submission.to_csv("submission_lightgbm.csv", index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3746
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 203
[LightGBM] [Info] Start training from score 12.024057


