In [30]:
 ## Load processed data (X, y, X_test)

import joblib
import numpy as np
import pandas as pd  

X       = joblib.load('../data/X.joblib')
y       = joblib.load('../data/y.joblib')
X_test  = joblib.load('../data/X_test.joblib')
test_ids = joblib.load('../data/test_ids.joblib')

print("X shape     :", X.shape)
print("y shape     :", y.shape)
print("X_test shape:", X_test.shape)
print("test_ids len:", len(test_ids))

X shape     : (20598, 648)
y shape     : (20598,)
X_test shape: (4750, 648)
test_ids len: 4750


In [31]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape


((16478, 648), (4120, 648), (16478,), (4120,))

In [32]:
from sklearn.metrics import mean_squared_error
import numpy as np

def rmsle_from_log(y_true_log, y_pred_log):
    """
    y_true_log ve y_pred_log: log1p(price)
    """
    return np.sqrt(mean_squared_error(y_true_log, y_pred_log))


In [33]:
def evaluate_model(model, X_tr, y_tr, X_val, y_val):
    X_tr_np = np.asarray(X_tr)
    X_val_np = np.asarray(X_val)

    model.fit(X_tr_np, y_tr)
    y_pred_val = model.predict(X_val_np)
    score = rmsle_from_log(y_val, y_pred_val)
    print(f"Validation RMSLE (log space): {score:.5f}")
    return score


In [34]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
evaluate_model(linreg, X_train, y_train, X_valid, y_valid)


Validation RMSLE (log space): 0.54573


np.float64(0.5457285919719191)

In [35]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    n_jobs=-1,
    random_state=42
)

evaluate_model(rf, X_train, y_train, X_valid, y_valid)


Validation RMSLE (log space): 0.46350


np.float64(0.46349694579058415)

In [36]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

evaluate_model(gbr, X_train, y_train, X_valid, y_valid)


Validation RMSLE (log space): 0.48617


np.float64(0.48617396604061125)

In [26]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="rmse",
    random_state=42
)

evaluate_model(xgb, X_train, y_train, X_valid, y_valid)


Validation RMSLE (log space): 0.43614


np.float64(0.43614375007619105)

In [37]:
import re

def clean_feature_names(df):
    clean_cols = {}
    for c in df.columns:
        new_c = c
        # Remove/replace problematic characters
        new_c = re.sub(r'[^A-Za-z0-9_]+', '_', new_c)
        clean_cols[c] = new_c
    df = df.rename(columns=clean_cols)
    return df

# Apply to X, X_train, X_valid, X_test
X         = clean_feature_names(X)
X_train   = clean_feature_names(X_train)
X_valid   = clean_feature_names(X_valid)
X_test    = clean_feature_names(X_test)


In [39]:
from lightgbm import LGBMRegressor

lgb = LGBMRegressor(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

lgb.fit(np.asarray(X), y)
evaluate_model(lgb, X_train, y_train, X_valid, y_valid)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4530
[LightGBM] [Info] Number of data points in the train set: 20598, number of used features: 166
[LightGBM] [Info] Start training from score 7.810944
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4423
[LightGBM] [Info] Number of data points in the train set: 16478, number of used features: 157
[LightGBM] [Info] Start training from score 7.812847
Validation RMSLE (log space): 0.42567




np.float64(0.42566923988279015)

In [44]:
# Final model
lgb_final = LGBMRegressor(
    n_estimators=800,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Train on all available training data
lgb_final.fit(np.asarray(X), y)

# Predict in log-price space
y_test_log_pred = lgb_final.predict(np.asarray(X_test))

# Convert log1p(price) back to price
y_test_pred = np.expm1(y_test_log_pred)

# Remove negative predictions if any
y_test_pred = np.clip(y_test_pred, 0, None)

# Check sizes
len(y_test_pred), len(test_ids)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4530
[LightGBM] [Info] Number of data points in the train set: 20598, number of used features: 166
[LightGBM] [Info] Start training from score 7.810944




(4750, 4750)

In [46]:
import pandas as pd

# 1. Load the original test IDs in the correct order
df_test_original = pd.read_csv('../data/test.csv', usecols=['id'])
test_ids = df_test_original['id']

print(f"ID type: {type(test_ids)}")
print(f"ID shape: {test_ids.shape}")

# 2. Build the submission DataFrame
submission_df = pd.DataFrame({
    'id': test_ids,
    'price': y_test_pred
})

# 3. Save as CSV
filename = 'submission_lightgbm_final.csv'
submission_df.to_csv(filename, index=False)

print(f"Submission file saved successfully: {filename}")
display(submission_df.head())


ID type: <class 'pandas.core.series.Series'>
ID shape: (4750,)
Submission file saved successfully: submission_lightgbm_final.csv


Unnamed: 0,id,price
0,536526,2030.742039
1,124137,4074.126222
2,164216,3767.660404
3,541629,3742.180347
4,572504,961.121108
