# Install & Import

In [1]:
! pip install catboost --quiet

In [2]:
from catboost import CatBoostRegressor
from datasets import load_dataset
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Utilities

In [3]:
def is_numeric(x):
    """Return True when ``x`` supports ``x + 1``, False otherwise.

    The check is duck-typed, so it accepts scalars as well as array-likes
    (e.g. a pandas Series) that implement addition with an integer.

    Parameters
    ----------
    x : object
        Value to probe.

    Returns
    -------
    bool
        True if ``x + 1`` evaluates without raising an ordinary exception.
    """
    try:
        x + 1
    except Exception:
        # Only ordinary errors mean "not numeric"; KeyboardInterrupt and
        # SystemExit are not subclasses of Exception and keep propagating.
        return False
    return True


def beautify_int(x):
    """Format a value for display as a thousands-separated integer string.

    Parameters
    ----------
    x : object
        Value to format.

    Returns
    -------
    str
        * ``str(x)`` for booleans (Python ``bool`` or ``np.bool_``),
        * ``""`` for ``None`` and NaN,
        * ``int(x)`` rendered with thousands separators for numbers
          (the fractional part is truncated by ``int``),
        * ``str(x)`` for anything that cannot be converted to an integer.
    """
    # np.bool_ exists in every NumPy release, unlike the np.bool alias
    if isinstance(x, (bool, np.bool_)):
        return str(x)
    if x is None:
        return ""
    try:
        if np.isnan(x):
            return ""
    except (TypeError, ValueError):
        # np.isnan rejects non-numeric input such as strings; show those unchanged
        return str(x)
    try:
        return f"{int(x):,.0f}"
    except (TypeError, ValueError, OverflowError):
        # e.g. infinities cannot be converted to int
        return str(x)

# Example

In [4]:
# Toy table with two 600 sq ft and two 1,700 sq ft houses; ModelPrediction is
# identical for houses of the same size.
example_columns = [
    "SquareFeet (Covariate)",
    "OverallCondition (Decision)",
    "SalePrice (Target)",
    "ModelPrediction",
]
example_rows = [
    (600, 4, 88_000, 90_000),
    (600, 7, 95_000, 90_000),
    (1_700, 5, 247_000, 260_000),
    (1_700, 8, 271_000, 260_000),
]
df_example = pd.DataFrame(example_rows, columns=example_columns)

In [5]:
# Show the example table with every cell passed through beautify_int (element-wise)
df_example.map(beautify_int)

Unnamed: 0,SquareFeet (Covariate),OverallCondition (Decision),SalePrice (Target),ModelPrediction
0,600,4,88000,90000
1,600,7,95000,90000
2,1700,5,247000,260000
3,1700,8,271000,260000


In [6]:
# Compare a constant prediction (the mean sale price) with the example model
example_actual = df_example["SalePrice (Target)"]
example_no_skill = [example_actual.mean()] * len(df_example)
example_model = df_example["ModelPrediction"]

print(f"MAE No-Skill Model: {mean_absolute_error(example_actual, example_no_skill):,.0f}")
print(f"MAE Model: {mean_absolute_error(example_actual, example_model):,.0f}")

MAE No-Skill Model: 83,750
MAE Model: 7,750


# Read and prepare data

In [7]:
# Load the training split and shuffle its rows; the fixed seed keeps the shuffled
# row order identical across re-runs.
df = (
    load_dataset("ttd22/house-price")["train"]
    .to_pandas()
    .sample(frac=1, replace=False, random_state=0)
    .rename(columns={"Id": "HouseId"})
)

# Columns that fail the is_numeric probe are treated as categorical;
# their missing values are replaced by the explicit label "void".
cat_columns = list(df.columns[~df.apply(is_numeric)])
df[cat_columns] = df[cat_columns].fillna("void")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


train.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

# Train models and save predictions

- Variable names

In [8]:
# Column the models are trained to predict
target = "SalePrice"

# Features that only the full model receives
decision_variables = ["OverallCond"]

# CatBoost monotonic constraint per feature: 1 requests a non-decreasing relationship
monotone_constraints = {"OverallCond": 1}

# Features shared by the covariate model and the full model
covariates = [
    # areas
    "LotArea", "MasVnrArea", "GrLivArea", "GarageArea", "PoolArea",
    "1stFlrSF", "2ndFlrSF", "TotalBsmtSF",
    # other characteristics
    "Foundation", "YearBuilt", "Neighborhood",
]

# Prefixes of the prediction columns written to `df` (one column per trained model)
full_model_prediction = "FullModel"
covariate_model_prediction = "CovariateModel"

# Categorical features of each model, kept in the order they appear in `cat_columns`
full_model_cat_features = [name for name in cat_columns if name in (covariates + decision_variables)]
covariate_model_cat_features = [name for name in cat_columns if name in covariates]

In [9]:
%%time

full_models = []
covariate_models = []

# Because we don't have many data points, we train 10 models with a 80/20 split
for ix_model in tqdm(range(10)):

    ix_trn, ix_tst = train_test_split(df.index, test_size=.20)

    full_model = CatBoostRegressor(
        monotone_constraints=monotone_constraints,
        silent=True
    ).fit(
        X=df.loc[ix_trn, covariates + decision_variables],
        y=df.loc[ix_trn, target],
        cat_features=full_model_cat_features,
    )

    covariate_model = CatBoostRegressor(
        silent=True,
    ).fit(
        X=df.loc[ix_trn, covariates],
        y=df.loc[ix_trn, target],
        cat_features=covariate_model_cat_features,
    )

    full_models.append(full_model)
    covariate_models.append(covariate_model)

    df.loc[ix_tst, f"IsTest_{ix_model}"] = True
    df.loc[ix_tst, f"{full_model_prediction}_{ix_model}"] = full_model.predict(df.loc[ix_tst, full_model.feature_names_])
    df.loc[ix_tst, f"{covariate_model_prediction}_{ix_model}"] = covariate_model.predict(df.loc[ix_tst, covariate_model.feature_names_])

100%|██████████| 10/10 [02:16<00:00, 13.64s/it]

CPU times: user 3min 46s, sys: 14.3 s, total: 4min 1s
Wall time: 2min 16s





# Matching

In [10]:
%%time

# We set the maximum difference of two houses to be considered similar at $1k
max_dist = 1_000
pairs = pd.DataFrame(columns=pd.MultiIndex.from_tuples([("ModelId", "")]))
enum = 0


# Take 5,000 pairs
while enum < 5_000:

    # Pick model at random
    ix_model = np.random.choice(range(len(full_models)))
    covariate_model = covariate_models[ix_model]
    full_model = full_models[ix_model]

    df_tst = df.loc[df.loc[:, f"IsTest_{ix_model}"]==True, :].copy()

    # Pick one house at random from the test set
    first = np.random.choice(df_tst.index)

    # Find all the houses that are similar to the 1st house
    first_covariate_model_prediction = df_tst.loc[first, f"{covariate_model_prediction}_{ix_model}"]
    is_similar = (
        df_tst
        .drop(first)[f"{covariate_model_prediction}_{ix_model}"]
        .between(first_covariate_model_prediction-max_dist/2, first_covariate_model_prediction+max_dist/2)
    )

    # If there is not at least 1 similar house, skip
    if sum(is_similar) == 0:
        continue

    # Pick one house at random among the similar ones
    second = is_similar[is_similar].sample().index[0]

    # Save relevant information about the current pair
    pairs.loc[enum, ("ModelId", "")] = ix_model
    pairs.loc[enum, ("SameDecision", "")] = (df_tst.loc[first, decision_variables] == df_tst.loc[second, decision_variables]).all()

    for metric_name, metric in zip(
        ["HouseId", covariate_model_prediction, "Decision", full_model_prediction, "Actual"],
        ["HouseId", f"{covariate_model_prediction}_{ix_model}"] + decision_variables + [f"{full_model_prediction}_{ix_model}", target]
    ):
        for first_or_second_name, first_or_second in zip(["1st", "2nd"], [first, second]):
            pairs.loc[enum, (metric_name, first_or_second_name)] = df_tst.loc[first_or_second, :][metric]

    for prediction_name, prediction in zip(
        ["CovariateModel", "FullModel"],
        [f"{covariate_model_prediction}_{ix_model}", f"{full_model_prediction}_{ix_model}"]
    ):
        pairs.loc[enum, ("MeanAbsoluteError", prediction_name)] = mean_absolute_error(
            df_tst.loc[[first, second], :][target], df_tst.loc[[first, second], :][prediction]
        )

    enum += 1


# Beautify dataframe
pairs_display = pairs.map(beautify_int)

CPU times: user 1min, sys: 52.9 ms, total: 1min
Wall time: 1min


In [11]:
# For illustrative purpose, cherry-pick a single pair and pivot

actual_1st = pairs.loc[:, ("Actual", "1st")]
actual_2nd = pairs.loc[:, ("Actual", "2nd")]
decision_1st = pairs.loc[:, ("Decision", "1st")]
decision_2nd = pairs.loc[:, ("Decision", "2nd")]
full_1st = pairs.loc[:, ("FullModel", "1st")]
full_2nd = pairs.loc[:, ("FullModel", "2nd")]
covariate_1st = pairs.loc[:, ("CovariateModel", "1st")]
covariate_2nd = pairs.loc[:, ("CovariateModel", "2nd")]
pair_mae_full = pairs.loc[:, ("MeanAbsoluteError", "FullModel")]
pair_mae_covariate = pairs.loc[:, ("MeanAbsoluteError", "CovariateModel")]

eligible_pair = (
    # 1st house sold above 200k
    (actual_1st > 200_000)
    # 1st house has the lower decision value and the lower full-model prediction
    & (decision_1st < decision_2nd)
    & (full_1st < full_2nd)
    # Full model is below the covariate model for the 1st house and above it for the 2nd
    & (full_1st < covariate_1st)
    & (full_2nd > covariate_2nd)
    # Full model is above the actual price for the 1st house and below it for the 2nd
    & (full_1st > actual_1st)
    & (full_2nd < actual_2nd)
    # Full model is closer to the actual price than the covariate model for both houses
    & (abs(full_1st - actual_1st) < abs(covariate_1st - actual_1st))
    & (abs(full_2nd - actual_2nd) < abs(covariate_2nd - actual_2nd))
    # Pair MAE of the full model is at most 75% of the covariate model's
    & (pair_mae_full <= pair_mae_covariate * .75)
)

# Fail with a clear message instead of the opaque error .sample() raises on an empty selection
if not eligible_pair.any():
    raise ValueError("No pair satisfies the cherry-picking criteria; relax them or draw more pairs.")

# Fixed seed so the same illustrative pair is shown on every run with the same `pairs`
pair_id = eligible_pair[eligible_pair].sample(random_state=0).index[0]

# Pivot the selected pair: one row per house, a blank spacer row, then the pair-level MAE
single_pair = pd.DataFrame(index=["1st", "2nd", "", "MeanAbsoluteError"])

for metric_name in ["HouseId", "CovariateModel", "Decision", "FullModel", "Actual"]:
    for first_or_second_name in ["1st", "2nd"]:
        single_pair.loc[first_or_second_name, metric_name] = pairs.loc[pair_id, (metric_name, first_or_second_name)]

    if metric_name in ["CovariateModel", "FullModel"]:
        single_pair.loc["MeanAbsoluteError", metric_name] = pairs.loc[pair_id, ("MeanAbsoluteError", metric_name)]

single_pair_display = single_pair.map(beautify_int)

# Display results

In [12]:
# Every split uses the same test_size on the same frame, so all splits have the same sizes.
# Report split 0 explicitly instead of relying on whatever value `ix_model` was left at by
# an earlier cell.
is_test_flag = df["IsTest_0"]

print(f"Number of data points: {len(df):,.0f}")
print(f"N samples in training: {is_test_flag.isna().sum():,.0f}")
print(f"N samples in test: {is_test_flag.sum():,.0f}")

Number of data points: 1,460
N samples in training: 1,168
N samples in test: 292


- Model performance

In [13]:
no_skill_model_maes = []
covariate_model_maes = []
full_model_maes = []

# Score every model on the rows flagged as its own test set
for ix_model in range(len(full_models)):
    in_test = df.loc[:, f"IsTest_{ix_model}"] == True
    df_tst = df.loc[in_test, :].copy()
    actual = df_tst[target]

    # No-skill baseline: the mean target of these test rows, repeated for every row
    constant_prediction = [actual.mean()] * len(df_tst)
    no_skill_model_maes.append(mean_absolute_error(actual, constant_prediction))

    covariate_model_maes.append(
        mean_absolute_error(actual, df_tst[f"{covariate_model_prediction}_{ix_model}"])
    )
    full_model_maes.append(
        mean_absolute_error(actual, df_tst[f"{full_model_prediction}_{ix_model}"])
    )

# Average each metric over the models
no_skill_model_mae = np.mean(no_skill_model_maes)
covariate_model_mae = np.mean(covariate_model_maes)
full_model_mae = np.mean(full_model_maes)

print(f"MAE No-Skill Model: {no_skill_model_mae:,.0f}")
print(f"MAE Covariate Model: {covariate_model_mae:,.0f}")
print(f"MAE Full Model: {full_model_mae:,.0f}")
print(f"Gain in MAE: {full_model_mae / covariate_model_mae - 1:.0%}")

MAE No-Skill Model: 58,810
MAE Covariate Model: 20,381
MAE Full Model: 18,595
Gain in MAE: -9%


- Feature importance (CatBoost default method)

In [14]:
# One importance Series per full model, indexed by feature name, then averaged over the models
per_model_importances = [
    pd.Series(model.feature_importances_, index=model.feature_names_)
    for model in full_models
]
feature_importances = sum(per_model_importances) / len(full_models)

# Largest first, rendered as percentages with two decimals
(
    feature_importances
    .sort_values(ascending=False)
    .apply("{:.2f}%".format)
    .rename("FeatureImportance")
)

Unnamed: 0,FeatureImportance
GrLivArea,21.54%
YearBuilt,14.56%
TotalBsmtSF,13.63%
Neighborhood,10.38%
1stFlrSF,8.75%
GarageArea,7.84%
LotArea,7.13%
Foundation,4.69%
2ndFlrSF,4.68%
OverallCond,3.64%


- Matching - Show single pair

In [15]:
# Show the pivoted pair that was selected and formatted in the matching section
single_pair_display

Unnamed: 0,HouseId,CovariateModel,Decision,FullModel,Actual
1st,291.0,241964.0,5.0,240285.0,233230.0
2nd,674.0,241584.0,7.0,248940.0,257500.0
,,,,,
MeanAbsoluteError,,12324.0,,7807.0,


- Matching - Show some pairs

In [16]:
# Preview the first pairs without the SameDecision and ModelId columns
hidden_columns = [("SameDecision", ""), ("ModelId", "")]
pairs_display.head().drop(columns=hidden_columns)

Unnamed: 0_level_0,HouseId,HouseId,CovariateModel,CovariateModel,Decision,Decision,FullModel,FullModel,Actual,Actual,MeanAbsoluteError,MeanAbsoluteError
Unnamed: 0_level_1,1st,2nd,1st,2nd,1st,2nd,1st,2nd,1st,2nd,CovariateModel,FullModel
0,691,74,145426,145516,5,7,143323,148822,141000,144900,2521,3123
1,1077,1207,134186,134582,8,4,141103,117041,170000,107000,31698,19469
2,617,329,176939,176669,5,6,175694,174403,183200,214500,22045,23801
3,154,1000,179373,178998,7,5,186304,178719,235000,206000,41314,37987
4,1266,51,172947,172938,5,6,167827,188984,183900,177000,7507,14028


- Matching - Summary

In [17]:
# A row is flagged as duplicated when an earlier row has the same model, the same 1st house
# and the same 2nd house (in that order)
pair_key_columns = [("ModelId", ""), ("HouseId", "1st"), ("HouseId", "2nd")]
is_duplicated_pair = pairs.loc[:, pair_key_columns].duplicated()

n_pairs_total = len(pairs)
n_pairs_duplicated = sum(is_duplicated_pair)

print(f"Number of pairs: {n_pairs_total:,.0f}")
print(f"Number of duplicated pairs: {n_pairs_duplicated:,.0f}")
print(f"Number of non-duplicated pairs: {n_pairs_total - n_pairs_duplicated:,.0f}")

Number of pairs: 5,000
Number of duplicated pairs: 2,320
Number of non-duplicated pairs: 2,680


In [18]:
def _summarize_pair_group(d):
    """Summarise one group of pairs as display strings.

    Parameters
    ----------
    d : pd.DataFrame
        Rows of `pairs` belonging to one value of the SameDecision column.

    Returns
    -------
    pd.Series
        Number of pairs, mean pair-level MAE of each model, and the relative
        difference of the full model's mean MAE versus the covariate model's.
    """
    # Computing the means up front keeps every f-string on a single line; a replacement
    # field that spans lines in a single-quoted f-string only parses on Python 3.12+.
    covariate_mae = d.loc[:, ("MeanAbsoluteError", "CovariateModel")].mean()
    full_mae = d.loc[:, ("MeanAbsoluteError", "FullModel")].mean()
    return pd.Series(
        {
            "Number of pairs": f"{len(d):,.0f}",
            "MAE Covariate Model": f"{covariate_mae:,.0f}",
            "MAE Full Model": f"{full_mae:,.0f}",
            "Gain in MAE": f"{full_mae / covariate_mae - 1:.0%}",
        }
    )


matching = (
    pairs
    .groupby(("SameDecision", ""))
    .apply(_summarize_pair_group, include_groups=False)
    .rename({True: "Same decision", False: "Different decision"})
    .sort_index(ascending=False)
)
matching.index.name = None
matching

Unnamed: 0,Number of pairs,MAE Covariate Model,MAE Full Model,Gain in MAE
Same decision,1888,16220,15882,-2%
Different decision,3112,16927,14515,-14%
