In [1]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from xgboost import XGBRegressor
import category_encoders as ce


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
train_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')


train_df.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = np.log(train_df['SalePrice'])  # Log transform target

for df in [train_df, test_df]:
    df['TotalSF'] = df['TotalBsmtSF'].fillna(0) + df['1stFlrSF'].fillna(0) + df['2ndFlrSF'].fillna(0)
    df['HasGarage'] = (df['GarageArea'].fillna(0) > 0).astype(int)

X = train_df.drop('SalePrice', axis=1)


low_card_cat_cols = [col for col in X.columns if X[col].dtype == "object" and X[col].nunique() < 10]
high_card_cat_cols = [col for col in X.columns if X[col].dtype == "object" and X[col].nunique() >= 10]
numerical_cols = [col for col in X.columns if X[col].dtype in ["int64", "float64"]] + ['TotalSF', 'HasGarage']
selected_cols = list(set(low_card_cat_cols + numerical_cols + high_card_cat_cols))
X = X[selected_cols].copy()


X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=2)


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

low_card_cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

high_card_cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', ce.TargetEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('low_card_cat', low_card_cat_transformer, low_card_cat_cols),
    ('high_card_cat', high_card_cat_transformer, high_card_cat_cols)
])


for col in low_card_cat_cols + high_card_cat_cols:
    X_train[col] = X_train[col].astype('category')
    X_valid[col] = X_valid[col].astype('category')

# Define the model with enable_categorical=True
model = XGBRegressor(random_state=2, enable_categorical=True, n_estimators=500, learning_rate=0.03, max_depth=3)


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])


preprocessor.fit(X_train, y_train)
X_valid_transformed = preprocessor.transform(X_valid)


pipeline.fit(
    X_train, y_train,
    model__eval_set=[(X_valid_transformed, y_valid)],
    model__verbose=False
)


preds_log = pipeline.predict(X_valid)
preds = np.exp(preds_log)
actual = np.exp(y_valid)

mae = mean_absolute_error(actual, preds)
rmsle = np.sqrt(mean_squared_log_error(actual, preds))
print(f"Validation MAE: {mae:.2f}")
print(f"Validation RMSLE: {rmsle:.6f}")


test_X = test_df[selected_cols].copy()
test_preds = np.exp(pipeline.predict(test_X))


submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': test_preds
})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")








Validation MAE: 15150.61
Validation RMSLE: 0.122890
Submission file created: submission.csv





**Finding best hyperparameters for the model**

In [3]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],  # You can adjust the range based on your results
    'learning_rate': [0.01, 0.03, 0.1],  # Experiment with a few values
    'max_depth': [3, 5, 7, 10]  # Limit the depth of the trees to avoid overfitting
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           scoring='neg_mean_absolute_error',  # We want to minimize MAE
                           cv=5,  # 5-fold cross-validation
                           verbose=1, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best hyperparameters found:", grid_search.best_params_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best hyperparameters found: {'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 500}
