In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor


In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [4]:
y = train_df["SalePrice"]
X = train_df.drop(["SalePrice"], axis=1)

In [5]:
combined = pd.concat([X, test_df], axis=0)

In [6]:
# Fill numeric missing values with median
for col in combined.select_dtypes(include=["int64", "float64"]).columns:
    combined[col] = combined[col].fillna(combined[col].median())

# Fill categorical missing values with 'None'
for col in combined.select_dtypes(include=["object"]).columns:
    combined[col] = combined[col].fillna("None")


In [7]:
label_encoders = {}

for col in combined.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])
    label_encoders[col] = le

In [8]:
X_processed = combined.iloc[:len(train_df), :]
X_test_processed = combined.iloc[len(train_df):, :]

In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, np.log(y), test_size=0.2, random_state=42
)

In [10]:
model = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)

model.fit(X_train, y_train)

In [11]:
val_preds = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print("Validation RMSE:", rmse)


Validation RMSE: 0.13829574298266367


In [12]:
model.fit(X_processed, np.log(y))

In [13]:
test_preds = np.exp(model.predict(X_test_processed))

In [14]:
submission = pd.DataFrame({
    "Id": test_df["Id"],
    "SalePrice": test_preds
})

submission.to_csv("submission.csv", index=False)
print("submission.csv created successfully!")

submission.csv created successfully!
