In [1]:
import pandas as pd
ames_housing = pd.read_csv("../../../datasets/ames_housing_no_missing.csv")

target_name = "SalePrice"
data, target = ames_housing.drop(columns=target_name), ames_housing[target_name]
target = (target > 200_000).astype(int)

In [4]:
data.dtypes.value_counts()

object     43
int64      33
float64     3
dtype: int64

In [47]:
numerical_features = [
  "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
  "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
  "GrLivArea", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
  "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
  "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
import time

preprocessor = ColumnTransformer([
    ('numerical', StandardScaler(), numerical_features)])

model = make_pipeline(preprocessor, LogisticRegression())

start = time.time()
cv_results = cross_validate(model, data, target, cv=10)
elapsed_time = time.time() - start

scores = cv_results["test_score"]

print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} ± {scores.std():.3f} "
      f"with a fitting time of {elapsed_time:.3f}")

The mean cross-validation accuracy is: 0.892 ± 0.013 with a fitting time of 0.142


In [52]:
# With categorical columns
from sklearn.preprocessing import OneHotEncoder

preprocessor = ColumnTransformer([
    ('numerical', StandardScaler(), numerical_features), ('categorical', OneHotEncoder(handle_unknown='ignore'), data.select_dtypes(object).columns)])

model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

start = time.time()
cv_results = cross_validate(model, data, target, cv=10)
elapsed_time = time.time() - start

scores1 = cv_results["test_score"]

print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} ± {scores.std():.3f} "
      f"with a fitting time of {elapsed_time:.3f}")
print("The model using all features is performing better "
    f"{sum(scores < scores1)} "
  "times out of 10 than the model using only numerical features.")

The mean cross-validation accuracy is: 0.892 ± 0.013 with a fitting time of 1.005
The model using all features is performing better 7 times out of 10 than the model using only numerical features.
