# Prep

## Import libraries

In [120]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
import pandas as pd

## Prepare train/test sets

In [88]:
# Load the training data (expects train.csv in the working directory).
df = pd.read_csv('train.csv')

# Features: every column except the row identifier and the target.
X = df.drop(columns=['Id', 'SalePrice'])
# Target: the sale price to predict.
y = df['SalePrice']

# Hold out a test split (train_test_split's default test_size of 0.25).
# A fixed random_state makes the split -- and every metric computed from it --
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [89]:
# Dataset dimensions as (rows, columns).
print(df.shape)

# For every column with at least one missing value, report
# [missing count, missing fraction]. The fraction is taken over the actual row
# count rather than a hard-coded number, so it stays correct if the data changes.
n_rows = len(df)
null_counts = df.isnull().sum()  # one pass over the frame, column order preserved
[
    {col: [n_missing, n_missing / n_rows]}
    for col, n_missing in null_counts.items()
    if n_missing > 0
]

(1460, 81)


[{'LotFrontage': [259, 0.1773972602739726]},
 {'Alley': [1369, 0.9376712328767123]},
 {'MasVnrType': [872, 0.5972602739726027]},
 {'MasVnrArea': [8, 0.005479452054794521]},
 {'BsmtQual': [37, 0.025342465753424658]},
 {'BsmtCond': [37, 0.025342465753424658]},
 {'BsmtExposure': [38, 0.026027397260273973]},
 {'BsmtFinType1': [37, 0.025342465753424658]},
 {'BsmtFinType2': [38, 0.026027397260273973]},
 {'Electrical': [1, 0.0006849315068493151]},
 {'FireplaceQu': [690, 0.4726027397260274]},
 {'GarageType': [81, 0.05547945205479452]},
 {'GarageYrBlt': [81, 0.05547945205479452]},
 {'GarageFinish': [81, 0.05547945205479452]},
 {'GarageQual': [81, 0.05547945205479452]},
 {'GarageCond': [81, 0.05547945205479452]},
 {'PoolQC': [1453, 0.9952054794520548]},
 {'Fence': [1179, 0.8075342465753425]},
 {'MiscFeature': [1406, 0.963013698630137]}]

## Create pipelines

In [92]:
# Partition the feature columns by dtype: object-typed columns are handled as
# categoricals, every other column as a numeric.
cats = [name for name, kind in X.dtypes.items() if kind == 'object']
nums = [name for name, kind in X.dtypes.items() if kind != 'object']

# Categorical branch: replace missing entries with the literal string 'None',
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numerical branch: replace missing entries with the column mean.
numerical = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

print(X_train.columns)

# Apply each branch to its own column subset.
preprocessor = ColumnTransformer([
    ('cat', categorical, cats),
    ('num', numerical, nums),
])

# Gradient-boosted regressor with library defaults; its hyperparameters are
# addressed through the 'model' step name of the pipeline below.
boost_model = XGBRegressor()

# Full estimator: preprocessing followed by the regressor.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', boost_model),
])


Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [113]:
# Display the column index of the feature frame X (all columns of df except
# 'Id' and 'SalePrice').
X.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

## Hyperparameter tuning

In [109]:
# Candidate XGBoost hyperparameters. Every key carries the `model__` prefix so
# the search sets it on the pipeline step named 'model'.
param_grid = {
    # Number of boosted trees
    'model__n_estimators': [200, 500],
    # Maximum depth of each tree
    'model__max_depth': [3, 5, 10],
    # Shrinks the contribution of each tree
    'model__learning_rate': [0.05, 0.1],
    # Fraction of samples used to grow trees
    'model__subsample': [0.5, 1.0],
    # Fraction of features used per tree
    'model__colsample_bytree': [0.5, 1.0],
    # Minimum sum of instance weights in a leaf
    'model__min_child_weight': [1, 5],
    # L1 regularization
    'model__reg_alpha': [0, 0.1],
    # L2 regularization
    'model__reg_lambda': [1.0, 2.0],
}

# NOTE: despite its name, `random_search` is an exhaustive GridSearchCV over
# every combination in param_grid, not a RandomizedSearchCV. The name is kept
# because the evaluation cell reads `random_search.best_estimator_`.
# Candidates are ranked by negated RMSE using 2-fold cross-validation on the
# training split only.
random_search = GridSearchCV(
    estimator=my_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=2,
    verbose=10,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

Fitting 2 folds for each of 384 candidates, totalling 768 fits


  _data = np.array(data, dtype=dtype, copy=copy,


# Model evaluation

In [124]:

# Best pipeline found by the hyperparameter search.
best_model = random_search.best_estimator_

# Score it on the held-out test split.
y_pred = best_model.predict(X_test)
test_rmse = root_mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(test_rmse)
print(test_r2)

# RMSE expressed as a fraction of the mean SalePrice over the whole of df
# (not just the test split).
print(test_rmse / df['SalePrice'].mean())

22106.699485806068
0.9223085641860962
0.12218966040439351


# Submission

In [119]:
# Load the unlabeled competition data (expects test.csv in the working directory).
test_df = pd.read_csv('test.csv')

# Keep the identifiers for the output file; they are not a model feature.
IDs = test_df['Id']
X_sub = test_df.drop(columns=['Id'])

# Predict with the tuned pipeline; it applies the same preprocessing that was
# fitted on the training data.
y_sub = best_model.predict(X_sub)

# Assemble the two-column submission and persist it without the index column.
submission = pd.DataFrame({'Id': IDs, 'SalePrice': y_sub})
submission.to_csv('submission.csv', index=False)

# Preview the result. A notebook only renders the cell's last expression, so
# the preview has to come last; .head() avoids dumping the whole frame.
submission.head()