In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [4]:
# Read the raw training and test tables from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Keep the test-set identifiers aside: they are needed again when the
# submission file is assembled.
test_ID = test["Id"]

# Remove the "Id" column from both frames so it is not among the columns
# handled by the later cells.
train = train.drop(columns=["Id"])
test = test.drop(columns=["Id"])

In [6]:
# Fill missing values column by column. The replacement is always computed
# from `train` and then reused for `test`, so no test-set statistic is used.
for name, dtype in train.dtypes.items():
    column = train[name]

    # Text (object) columns take the most frequent training value;
    # every other column takes the training median.
    replacement = column.mode()[0] if dtype == "object" else column.median()

    # Re-assign the filled Series rather than filling the Series in place.
    train[name] = column.fillna(replacement)

    # Columns that exist only in train (e.g. the target) are skipped for test.
    if name in test.columns:
        test[name] = test[name].fillna(replacement)

In [7]:
# Integer-encode every text (object-dtype) column of `train`, then apply the
# same category -> code mapping to the matching column of `test`.
cat_cols = train.select_dtypes(include=["object"]).columns
le = LabelEncoder()
for col in cat_cols:
    # The encoder is re-fitted per column on the training values only;
    # afterwards `le.classes_` holds the sorted unique training categories
    # and each code is the position of a category in that array.
    train[col] = le.fit_transform(train[col])

    # `le.transform` raises ValueError as soon as test contains a category
    # that never occurred in train. Looking the values up through an Index
    # yields the same codes for known categories and -1 for unknown ones,
    # so an unseen category no longer aborts the notebook.
    test[col] = pd.Index(le.classes_).get_indexer(test[col])


In [8]:
# Target: log1p of the sale price (predictions are mapped back with expm1
# before the submission is written).
y = np.log1p(train["SalePrice"])

# Features: every remaining column of `train`.
X = train.drop(columns=["SalePrice"])

In [9]:
# Hyper-parameters gathered in one mapping so they can be inspected or
# tweaked in a single place before the estimator is built.
catboost_params = {
    "iterations": 5000,
    "learning_rate": 0.01,
    "depth": 7,
    "l2_leaf_reg": 3,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "random_seed": 42,   # fixed seed so repeated runs use the same setting
    "verbose": 500,      # progress line every 500 iterations (see the logs below)
}

# Unfitted estimator; it is cross-validated and then fitted in later cells.
cat_model = CatBoostRegressor(**catboost_params)


In [11]:
# 10-fold cross-validation with shuffling; the fixed random_state keeps the
# fold assignment identical between runs.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# The scorer reports *negative* RMSE (higher is better), so the sign is
# flipped to obtain one positive RMSE per fold. Because `y` is log1p of the
# sale price, these errors are on the log scale.
scores = -cross_val_score(cat_model, X, y, scoring="neg_root_mean_squared_error", cv=cv)

# The leading emoji (light bulb, U+1F4A1) was stored as mis-decoded bytes;
# it is written as an escape so the source stays ASCII and cannot be garbled
# again by a wrong file encoding.
print("\U0001F4A1 CatBoost CV RMSE:", np.mean(scores))

0:	learn: 0.3914716	total: 17.5ms	remaining: 1m 27s
500:	learn: 0.1061977	total: 836ms	remaining: 7.51s
1000:	learn: 0.0809210	total: 1.65s	remaining: 6.61s
1500:	learn: 0.0662770	total: 2.42s	remaining: 5.65s
2000:	learn: 0.0550362	total: 3.19s	remaining: 4.78s
2500:	learn: 0.0458209	total: 3.96s	remaining: 3.95s
3000:	learn: 0.0382634	total: 4.81s	remaining: 3.2s
3500:	learn: 0.0321660	total: 5.56s	remaining: 2.38s
4000:	learn: 0.0272773	total: 6.33s	remaining: 1.58s
4500:	learn: 0.0232190	total: 7.08s	remaining: 785ms
4999:	learn: 0.0197896	total: 7.83s	remaining: 0us
0:	learn: 0.3939708	total: 2.58ms	remaining: 12.9s
500:	learn: 0.1047581	total: 776ms	remaining: 6.97s
1000:	learn: 0.0789604	total: 1.56s	remaining: 6.24s
1500:	learn: 0.0638494	total: 2.32s	remaining: 5.41s
2000:	learn: 0.0524888	total: 3.07s	remaining: 4.6s
2500:	learn: 0.0442301	total: 3.83s	remaining: 3.82s
3000:	learn: 0.0376054	total: 4.57s	remaining: 3.05s
3500:	learn: 0.0321335	total: 5.33s	remaining: 2.28s
40

In [12]:
# Fit the estimator on all training rows; as the last expression of the cell
# the returned model object is also displayed.
cat_model.fit(X=X, y=y)

0:	learn: 0.3967110	total: 10.9ms	remaining: 54.4s
500:	learn: 0.1065860	total: 826ms	remaining: 7.41s
1000:	learn: 0.0820081	total: 1.59s	remaining: 6.37s
1500:	learn: 0.0670240	total: 2.43s	remaining: 5.67s
2000:	learn: 0.0567528	total: 3.38s	remaining: 5.07s
2500:	learn: 0.0480143	total: 4.16s	remaining: 4.16s
3000:	learn: 0.0412862	total: 4.95s	remaining: 3.3s
3500:	learn: 0.0353666	total: 5.74s	remaining: 2.46s
4000:	learn: 0.0302885	total: 6.51s	remaining: 1.63s
4500:	learn: 0.0263096	total: 7.3s	remaining: 809ms
4999:	learn: 0.0229336	total: 8.07s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x11d3b9370>

In [13]:
# The model was trained on log1p(SalePrice), so its raw predictions are on
# that log scale ...
preds = cat_model.predict(data=test)

# ... and expm1 (the inverse of log1p) maps them back to prices.
final_preds = np.expm1(preds)

In [14]:
# Assemble the submission table: the identifiers saved from the raw test file
# next to the back-transformed price predictions.
submission = pd.DataFrame({
    "Id": test_ID,
    "SalePrice": final_preds
})

# index=False keeps the DataFrame's row index out of the CSV.
submission.to_csv("catboost_submission.csv", index=False)

# The leading check mark (U+2705) was stored as mis-decoded bytes; it is
# written as an escape so the source stays ASCII and cannot be garbled again
# by a wrong file encoding.
print("\u2705 Submission file created: catboost_submission.csv")

✅ Submission file created: catboost_submission.csv
