# Import Libraries

In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import cross_val_score, KFold
from xgboost import XGBRegressor
from sklearn.feature_selection import mutual_info_regression
from category_encoders import MEstimateEncoder
import optuna
import numpy as np

Let's set some defaults.

In [None]:
# Let pandas print up to 80 rows before truncating displayed frames/series.
pd.options.display.max_rows = 80
# Name of the evaluation metric passed to score_dataset() in later cells.
metric = 'RMSLE'

# Read and Preprocess Data

In [None]:
# Load both competition splits, using the Id column as the row index.
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
# Show the (rows, columns) shape of each split, one per line.
print(df_train.shape, df_test.shape, sep='\n')

The data has 80 columns (plus the target column we're trying to predict - *SalePrice*).

We'll concatenate training and test data to preprocess them together.

In [None]:
# Stack the training rows on top of the test rows so that one preprocessing pass covers both splits.
df = pd.concat([df_train, df_test])

## Inspect Values of Categorical Columns

In [None]:
# Print the distinct values of every object-typed (string) column for a visual sanity check.
object_columns = df.select_dtypes(include=['object']).columns
for name in object_columns:
    distinct_values = df[name].unique()
    print(f'{name}:\n{distinct_values}\n')

The *Exterior2nd* column contains some typos. Let's fix them.

In [None]:
# Misspelled Exterior2nd labels -> the spelling used by the category definitions.
exterior2nd_fixes = {
    'Wd Shng': 'WdShing',
    'CmentBd': 'CemntBd',
    'Brk Cmn': 'BrkComm',
}
df['Exterior2nd'] = df['Exterior2nd'].replace(exterior2nd_fixes)

There are values that are not consistent with the provided *data_description.txt* file. Let's fix these too.

In [None]:
# Per-column relabelling so the values match the names used in data_description.txt.
description_fixes = {
    'MSZoning': {'C (all)': 'C'},
    'Neighborhood': {'NAmes': 'Names'},
    'BldgType': {'2fmCon': '2FmCon', 'Duplex': 'Duplx', 'Twnhs': 'TwnhsI'},
}
for column, mapping in description_fixes.items():
    df[column] = df[column].replace(mapping)

## Inspect Values of Numerical Columns
Let's look at the range of numerical values to see if there are any problems.

In [None]:
# Minimum, maximum and mean of every numeric column, transposed so each column becomes a row.
df.select_dtypes(include='number').agg(['min', 'max', 'mean']).transpose()

We notice that the *GarageYrBlt* column which describes the year the garage was built has impossible values (as high as 2207).

Let's replace those values with the year the house was built.

In [None]:
# Keep garage years up to 2010; every other entry takes the house's YearBuilt.
# Missing garage years are replaced too, because NaN <= 2010 evaluates to False.
garage_year_ok = df['GarageYrBlt'].le(2010)
df['GarageYrBlt'] = df['GarageYrBlt'].mask(~garage_year_ok, df['YearBuilt'])

Finally, we compile all of these steps in a function for later reuse.

In [None]:
def clean(df):
    """Return a cleaned copy of the raw house-price frame.

    The following fixes are applied:
      * misspelled ``Exterior2nd`` labels are corrected;
      * ``MSZoning``, ``Neighborhood`` and ``BldgType`` labels are renamed to
        the spellings used by the category definitions in this notebook;
      * ``GarageYrBlt`` values that are not ``<= 2010`` are replaced with
        ``YearBuilt``. Missing garage years are replaced as well, because a
        NaN comparison is False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Exterior2nd``, ``MSZoning``,
        ``Neighborhood``, ``BldgType``, ``GarageYrBlt`` and ``YearBuilt``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is not silently modified.
    df = df.copy()

    label_fixes = {
        'Exterior2nd': {'Wd Shng': 'WdShing', 'CmentBd': 'CemntBd', 'Brk Cmn': 'BrkComm'},
        'MSZoning': {'C (all)': 'C'},
        'Neighborhood': {'NAmes': 'Names'},
        'BldgType': {'2fmCon': '2FmCon', 'Duplex': 'Duplx', 'Twnhs': 'TwnhsI'},
    }
    for column, mapping in label_fixes.items():
        df[column] = df[column].replace(mapping)

    df['GarageYrBlt'] = df['GarageYrBlt'].where(df['GarageYrBlt'] <= 2010, df['YearBuilt'])
    return df

## Encoding

3 of the numerical columns in the data (*MSSubClass*, *OverallQual*, and *OverallCond*) are actually categorical.

We define the nominal and ordinal features' categories and levels according to the data description.

In [None]:
# The nominal (unordered) categorical features
# Level lists shared by more than one column.
_road_surfaces = ['Grvl', 'Pave']
_conditions = ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe']
_exteriors = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']

# Levels of each nominal column, without the 'None' placeholder.
_nominal_levels = {
    "MSSubClass": [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190],
    "MSZoning": ['A', 'C', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'],
    "Street": _road_surfaces,
    "Alley": _road_surfaces,
    "LandContour": ['Lvl', 'Bnk', 'HLS', 'Low'],  # ordinal?
    "LotConfig": ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'],  # ordinal?
    "Neighborhood": ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'Names', 'NoRidge', 'NPkVill', 'NridgHt', 'NWAmes', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'],
    "Condition1": _conditions,
    "Condition2": _conditions,
    "BldgType": ['1Fam', '2FmCon', 'Duplx', 'TwnhsE', 'TwnhsI'],
    "HouseStyle": ['1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin', '2.5Unf', 'SFoyer', 'SLvl'],
    "RoofStyle": ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed'],
    "RoofMatl": ['ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll', 'Tar&Grv', 'WdShake', 'WdShngl'],
    "Exterior1st": _exteriors,
    "Exterior2nd": _exteriors,
    "MasVnrType": ['BrkCmn', 'BrkFace', 'CBlock', 'Stone'],
    "Foundation": ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'],
    "Heating": ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall'],
    "GarageType": ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd'],
    "MiscFeature": ['Elev', 'Gar2', 'Othr', 'Shed', 'TenC'],
    "SaleType": ['WD', 'CWD', 'VWD', 'New', 'COD', 'Con', 'ConLw', 'ConLI', 'ConLD', 'Oth'],
    "SaleCondition": ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family', 'Partial'],
}

# Unordered categorical dtype per column; 'None' always comes first and is the
# level used for missing values.
nominal_categories = {
    column: CategoricalDtype(categories=['None'] + levels)
    for column, levels in _nominal_levels.items()
}

# The ordinal (ordered) categorical features 
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = [*range(1, 11)]

_basement_finish_levels = ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
_ten_level_columns = ["OverallQual", "OverallCond"]
_five_level_columns = [
    "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
    "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC",
]

# Levels of each ordinal column in their declared order, without the 'None' placeholder.
_levels_in_order = {
    **dict.fromkeys(_ten_level_columns, ten_levels),
    **dict.fromkeys(_five_level_columns, five_levels),
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": _basement_finish_levels,
    "BsmtFinType2": _basement_finish_levels,
    "Functional": ["Sal", "Sev", "Maj2", "Maj1", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["FuseP", "FuseF", "Mix", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Put the 'None' level (used for missing values) in front of every list.
ordered_levels = {
    column: ['None', *levels] for column, levels in _levels_in_order.items()
}

And now we can convert the categorical columns to their respective categorical types.

In [None]:
def encode(df):
    """Return a copy of ``df`` with its categorical columns cast to categorical dtypes.

    Columns listed in the module-level ``nominal_categories`` are cast to the
    (unordered) dtype stored there. Columns listed in ``ordered_levels`` are
    cast to an ordered ``CategoricalDtype`` built from the stored level list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``nominal_categories`` and
        ``ordered_levels``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is not silently modified.
    df = df.copy()
    # Nominal categories
    for col, cats in nominal_categories.items():
        df[col] = df[col].astype(cats)
    # Ordinal categories
    for col, levels in ordered_levels.items():
        df[col] = df[col].astype(CategoricalDtype(levels, ordered=True))
    return df

# Cast the categorical columns of the combined frame to the dtypes declared above.
df = encode(df)

## Imputation
Let's first look at how many missing values we have and the columns where they're located.

In [None]:
# Missing entries per column; the SalePrice target is left out of the tally.
nan_by_col = df.isna().sum().drop(labels='SalePrice')
# Keep only the columns that actually contain missing entries.
cols_with_nan = nan_by_col.loc[nan_by_col.gt(0)]
print(f'There are {len(cols_with_nan)} columns with missing values:\n{cols_with_nan.sort_values(ascending=False)}')
print(f'Total number of missing entries: {nan_by_col.sum()}')

We will impute with the TA/Typical/Average/Normal level whenever the column has that level but lacks a *NA/None* level.

In [None]:
# Quality columns whose missing entries are filled with the 'TA' level.
cols_with_typical = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual']
df[cols_with_typical] = df[cols_with_typical].fillna('TA')
# Columns whose fill level has a column-specific name.
for column, fill_level in (('Functional', 'Typ'), ('Electrical', 'FuseA')):
    df[column] = df[column].fillna(fill_level)

All the values of the *Utilities* column are equal to *AllPub* except one, so we'll impute the missing entries with *AllPub*.

In [None]:
# Show how often each Utilities level occurs, then fill the gaps with 'AllPub'.
utilities_counts = df['Utilities'].value_counts()
print(utilities_counts)
df['Utilities'] = df['Utilities'].fillna('AllPub')

For the rest of the categorical columns, we'll just impute with *"None"*.

In [None]:
# Every column that still has a categorical dtype gets its gaps filled with the 'None' level.
categorical_columns = df.select_dtypes(include='category').columns
categorical_filled = df[categorical_columns].fillna('None')
df[categorical_columns] = categorical_filled

All that's left are numerical columns. We will impute 0 for these.

In [None]:
# The target column (SalePrice) is excluded from the numerical columns, so it is not filled.
numerical_columns = df.select_dtypes(include='number').columns.drop('SalePrice')
numerical_filled = df[numerical_columns].fillna(0)
df[numerical_columns] = numerical_filled

All of these steps are summarized in this function.

In [None]:
def impute(df):
    """Return a copy of ``df`` with missing values filled in.

    Fill rules, applied in this order:
      * ``ExterQual``, ``ExterCond``, ``HeatingQC``, ``KitchenQual`` -> ``'TA'``
      * ``Functional`` -> ``'Typ'``, ``Electrical`` -> ``'FuseA'``
      * ``Utilities`` -> ``'AllPub'``
      * every remaining gap in a categorical column -> ``'None'``
      * every gap in a numeric column other than ``SalePrice`` -> ``0``

    ``SalePrice`` itself is never filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose categorical columns already contain the fill levels above
        among their categories, and which has a numeric ``SalePrice`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.

    Raises
    ------
    KeyError
        If ``SalePrice`` is not among the numeric columns.
    """
    # Work on a copy so the caller's frame is not silently modified.
    df = df.copy()

    cols_with_typical = ['ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual']
    df[cols_with_typical] = df[cols_with_typical].fillna('TA')
    df['Functional'] = df['Functional'].fillna('Typ')
    df['Electrical'] = df['Electrical'].fillna('FuseA')

    df['Utilities'] = df['Utilities'].fillna('AllPub')

    # Runs after the column-specific fills, so only the leftover gaps become 'None'.
    categorical_columns = df.select_dtypes('category').columns
    df[categorical_columns] = df[categorical_columns].fillna('None')

    # The target must stay out of this list, so it is not filled with 0.
    numerical_columns = df.select_dtypes('number').columns.drop('SalePrice')
    df[numerical_columns] = df[numerical_columns].fillna(0)
    return df

In [None]:
def load_data(data_dir='../input/house-prices-advanced-regression-techniques'):
    """Read the competition CSVs and return the preprocessed train and test frames.

    Both splits are concatenated, passed through ``clean``, ``encode`` and
    ``impute`` in that order, and then separated again by their original
    ``Id`` index values.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding ``train.csv`` and ``test.csv``. Defaults to the
        Kaggle input path used throughout this notebook.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``. Both come from the combined frame, so both
        carry every column of that frame.
    """
    # Read data
    df_train = pd.read_csv(f'{data_dir}/train.csv', index_col='Id')
    df_test = pd.read_csv(f'{data_dir}/test.csv', index_col='Id')
    # Merge the splits so we can process them together
    df = pd.concat([df_train, df_test])
    # Preprocessing
    df = clean(df)
    df = encode(df)
    df = impute(df)
    # Reform splits by looking the original Id labels up in the processed frame
    df_train = df.loc[df_train.index, :]
    df_test = df.loc[df_test.index, :]
    return df_train, df_test

# Establish a Baseline Score

In [None]:
# Run the full clean -> encode -> impute pipeline and get back the train/test splits.
df_train, df_test = load_data()

In [None]:
def score_dataset(X, y, model=None, metric='MAE'):
    """Return a 5-fold cross-validated error of ``model`` on ``(X, y)``.

    Categorical columns are replaced by their integer category codes on a
    copy of ``X``; the frame passed in is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values. Must be strictly positive when ``metric == 'RMSLE'``
        because the logarithm of ``y`` is taken.
    model : estimator, optional
        Estimator handed to ``cross_val_score``. A fresh ``XGBRegressor()`` is
        created on each call when omitted.
    metric : str, optional
        ``'RMSLE'``: square root of the mean (over folds) squared error on
        ``log(y)``.
        ``'MAE'``: mean (over folds) absolute error.
        Any other value: mean (over folds) squared error on the raw target.

    Returns
    -------
    float
        The error; lower is better.
    """
    if model is None:
        # Built here rather than as a default argument, so that no estimator is
        # constructed at definition time and shared between calls.
        model = XGBRegressor()

    # Label encoding for categoricals, done on a copy to leave the caller's frame intact.
    X = X.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes

    scoring = "neg_mean_squared_error"
    if metric == 'RMSLE':
        y = np.log(y)
    elif metric == 'MAE':
        scoring = "neg_mean_absolute_error"

    score = cross_val_score(
        model, X, y, cv=5, scoring=scoring,
    )
    # scikit-learn reports negated errors; flip the sign back.
    score = -1 * score.mean()
    if metric == 'RMSLE':
        score = np.sqrt(score)
    return score

In [None]:
# Work on a copy so df_train keeps its SalePrice column; pop() removes the
# target from X_train and returns it.
X_train = df_train.copy()
y_train = X_train.pop("SalePrice")

# Cross-validated score of the preprocessed data before any feature work.
baseline_score = score_dataset(X_train, y_train, metric=metric)
print(f"Baseline score: {baseline_score:.5f} {metric}")

# Feature Selection

To find out which features are more useful for predicting the target variable, we calculate the MI (Mutual Information) score for each one of them.

In [None]:
def make_mi_scores(X, y):
    """Compute the mutual-information score of every column of ``X`` with ``y``.

    Object and categorical columns are replaced by integer codes (via
    ``factorize``) on a copy of ``X``. Every column that ends up with an
    integer dtype is flagged as discrete for ``mutual_info_regression``,
    which is called with ``random_state=0``.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by column name and sorted from
        highest to lowest.
    """
    features = X.copy()
    non_numeric = features.select_dtypes(["object", "category"]).columns
    for name in non_numeric:
        codes, _uniques = features[name].factorize()
        features[name] = codes

    # All discrete features should now have integer dtypes
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    scores = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return scores.sort_values(ascending=False)

In [None]:
# Fresh feature/target split taken from the preprocessed training frame.
X_train = df_train.copy()
y_train = X_train.pop("SalePrice")

# MI score per feature, displayed rounded to three decimals.
mi_scores = make_mi_scores(X_train, y_train)
mi_scores.round(3)

This function lets us drop any features with an MI score of 0.

In [None]:
def drop_uninformative(df, mi_scores):
    """Keep only the columns of ``df`` whose MI score is strictly positive.

    ``mi_scores`` is a Series indexed by column name. It is used as a boolean
    column selector, so it must have an entry for every column of ``df``.
    """
    is_informative = mi_scores.gt(0.0)
    return df.loc[:, is_informative]

In [None]:
# Fresh feature/target split taken from the preprocessed training frame.
X_train = df_train.copy()
y_train = X_train.pop("SalePrice")
# Score again after removing the features whose MI score is not positive.
X_train = drop_uninformative(X_train, mi_scores)
score_dataset(X_train, y_train, metric=metric)

# Feature Engineering

In [None]:
# Fresh feature/target split taken from the preprocessed training frame.
X_train = df_train.copy()
y_train = X_train.pop("SalePrice")

In [None]:
def label_encode(df):
    """Return a copy of ``df`` with each categorical column replaced by its integer category codes."""
    encoded = df.copy()
    categorical = encoded.select_dtypes(include=["category"]).columns
    for name in categorical:
        encoded[name] = encoded[name].cat.codes
    return encoded

In [None]:
def mathematical_transforms(df):
    """Build ratio features from area and room-count columns.

    Returns a new frame with two columns:
      * ``LivLotRatio``  = ``GrLivArea / LotArea``
      * ``Spaciousness`` = ``(1stFlrSF + 2ndFlrSF) / TotRmsAbvGrd``
    """
    living_to_lot = df['GrLivArea'] / df['LotArea']
    floor_area = df['1stFlrSF'] + df['2ndFlrSF']
    return pd.DataFrame().assign(
        LivLotRatio=living_to_lot,
        Spaciousness=floor_area / df['TotRmsAbvGrd'],
    )

In [None]:
def interactions(df):
    """Return one column per ``BldgType`` level holding ``GrLivArea`` for rows of that level and 0 elsewhere.

    Columns are named ``Bldg_<level>``.
    """
    building_dummies = pd.get_dummies(df["BldgType"], prefix="Bldg")
    # Row-wise scaling: every indicator row is multiplied by that row's living area.
    return building_dummies.multiply(df["GrLivArea"], axis="index")

In [None]:
def counts(df):
    """Return a one-column frame ``PorchTypes``: how many of the five porch/deck area columns are greater than zero in each row."""
    porch_columns = [
        "WoodDeckSF",
        "OpenPorchSF",
        "EnclosedPorch",
        "3SsnPorch",
        "ScreenPorch",
    ]
    has_area = df[porch_columns].gt(0.0)
    return pd.DataFrame().assign(PorchTypes=has_area.sum(axis=1))

In [None]:
def group_transforms(df):
    """Return a one-column frame ``MedNhbdArea``: the median ``GrLivArea`` of each row's ``Neighborhood``."""
    living_area_by_neighborhood = df.groupby("Neighborhood")["GrLivArea"]
    neighborhood_median = living_area_by_neighborhood.transform("median")
    return pd.DataFrame().assign(MedNhbdArea=neighborhood_median)

In [None]:
def pca_inspired(df):
    """Build two combined-area features.

    Returns a new frame with:
      * ``Feature1`` = ``GrLivArea + TotalBsmtSF``
      * ``Feature2`` = ``YearRemodAdd * TotalBsmtSF``
    """
    basement_area = df["TotalBsmtSF"]
    return pd.DataFrame().assign(
        Feature1=df["GrLivArea"] + basement_area,
        Feature2=df["YearRemodAdd"] * basement_area,
    )

In [None]:
class CrossFoldEncoder:
    """Apply a supervised encoder out-of-fold.

    ``encoder`` is an encoder class that is instantiated as
    ``encoder(cols=..., **kwargs)`` and exposes ``fit(X, y)`` and
    ``transform(X)``. Rows are split with an unshuffled 5-fold ``KFold``.
    """

    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=5)

    def fit_transform(self, X, y, cols):
        """Encode ``cols`` of ``X`` so that no row is encoded by an encoder fitted on it.

        For every split a new encoder is fitted on the first index set yielded
        by the splitter and used to transform the rows of the second one. The
        transformed pieces are concatenated and each column gets the suffix
        ``_encoded``. One fitted encoder per split is kept in
        ``self.fitted_encoders_``.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        held_out_parts = []
        for fit_rows, held_out_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            transformed = fold_encoder.transform(X.iloc[held_out_rows, :])
            held_out_parts.append(transformed[cols])
            self.fitted_encoders_.append(fold_encoder)
        X_encoded = pd.concat(held_out_parts)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    def transform(self, X):
        """Encode new data as the average of the encodings produced by the per-fold encoders."""
        from functools import reduce

        per_fold = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]
        # Element-wise sum over folds; a value missing in one fold counts as 0.
        total = reduce(
            lambda left, right: left.add(right, fill_value=0), per_fold
        )
        X_encoded = total / len(per_fold)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

## Create Final Feature Set

In [None]:
def create_features(df_train, df_test=None):
    """Build the final feature set for the training data and, optionally, the test data.

    Steps, in order:
      1. MI scores are computed on the training features only.
      2. If ``df_test`` is given, its features are appended to the training
         features so the engineered features use all available rows.
      3. Features without a positive MI score are dropped; the outputs of
         ``mathematical_transforms``, ``counts``, ``group_transforms`` and
         ``pca_inspired`` are joined on, then categoricals are label-encoded.
         (``interactions`` is currently not applied.)
      4. The splits are separated again and ``MSSubClass`` is target-encoded
         out-of-fold with ``MEstimateEncoder(m=1)``.

    Returns ``X_train`` when ``df_test`` is None, otherwise ``(X_train, X_test)``.
    """
    with_test = df_test is not None

    X_train = df_train.copy()
    y_train = X_train.pop("SalePrice")
    mi_scores = make_mi_scores(X_train, y_train)

    # Combine splits if test data is given, so that the engineered features
    # are computed from every available row; the splits are recreated below.
    if with_test:
        X_test = df_test.copy()
        X_test.pop("SalePrice")
        X_train = pd.concat([X_train, X_test])

    X_train = drop_uninformative(X_train, mi_scores)
    # Each builder sees the frame including the columns added by the previous ones.
    for build in (mathematical_transforms, counts, group_transforms, pca_inspired):
        X_train = X_train.join(build(X_train))
    X_train = label_encode(X_train)

    # Reform splits
    if with_test:
        X_test = X_train.loc[df_test.index, :]
        X_train = X_train.drop(df_test.index)

    # Target encoding
    encoder = CrossFoldEncoder(MEstimateEncoder, m=1)
    X_train = X_train.join(encoder.fit_transform(X_train, y_train, cols=["MSSubClass"]))
    if not with_test:
        return X_train
    X_test = X_test.join(encoder.transform(X_test))
    return X_train, X_test

# Reload the preprocessed splits, build the engineered training features and
# display their cross-validated score.
df_train, df_test = load_data()
X_train = create_features(df_train)
y_train = df_train["SalePrice"]
score_dataset(X_train, y_train, metric=metric)

# Hyperparameter Tuning

We will use optuna to optimize our XGBoost regressor's hyperparameters.

In [None]:
X_train = create_features(df_train)
y_train = df_train["SalePrice"]

# Hand-picked starting point for the regressor's hyperparameters.
xgb_params = {
    "max_depth": 6,            # maximum depth of each tree - try 2 to 10
    "learning_rate": 0.01,     # effect of each tree - try 0.0001 to 0.1
    "n_estimators": 1000,      # number of trees (that is, boosting rounds) - try 1000 to 8000
    "min_child_weight": 1,     # minimum number of houses in a leaf - try 1 to 10
    "colsample_bytree": 0.7,   # fraction of features (columns) per tree - try 0.2 to 1.0
    "subsample": 0.7,          # fraction of instances (rows) per tree - try 0.2 to 1.0
    "reg_alpha": 0.5,          # L1 regularization (like LASSO) - try 0.0 to 10.0
    "reg_lambda": 1.0,         # L2 regularization (like Ridge) - try 0.0 to 10.0
    "num_parallel_tree": 1,    # set > 1 for boosted random forests
}

xgb = XGBRegressor(**xgb_params)
score_dataset(X_train, y_train, xgb, metric=metric)

In [None]:
def objective(trial):
    """Optuna objective: sample XGBoost hyperparameters from ``trial`` and return their score.

    The score is ``score_dataset`` evaluated on the module-level ``X_train``
    and ``y_train`` with the module-level ``metric``.
    """
    sampled_params = {
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 8000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
    }
    candidate = XGBRegressor(**sampled_params)
    return score_dataset(X_train, y_train, candidate, metric=metric)

# Seed the (default, TPE) sampler so that a rerun draws the same random
# hyperparameter suggestions instead of starting a different search each time.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=50)
# Best hyperparameters found; used to build the final model.
xgb_params = study.best_params

# Train Model and Create Submission

In [None]:
X_train, X_test = create_features(df_train, df_test)
y_train = df_train["SalePrice"]

# XGB minimizes MSE, but competition loss is RMSLE.
# So the model is trained on log(SalePrice) and its predictions are mapped
# back to prices with exp.
log_prices = np.log(y_train)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, log_prices)
log_predictions = xgb.predict(X_test)
predictions = np.exp(log_predictions)

# One row per test Id, written without the frame's own index.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)