In [1]:
import pandas as pd
# Read the house-prices CSV, treating "?" as a missing value, and discard the
# "Id" column right away.
ames_housing = pd.read_csv(
    "../datasets/house_prices.csv", na_values="?"
).drop(columns="Id")

target_name = "SalePrice"
# Features are every column except the sale price.
data = ames_housing.drop(columns=target_name)
# Binary target: 1 when the sale price is strictly above 200_000, otherwise 0.
target = (ames_housing[target_name] > 200_000).astype(int)

In [3]:
# Summarize the feature table: per-column non-null counts and dtypes.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [4]:
# Preview the first rows of the feature table.
data.head()


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [2]:
from sklearn.compose import make_column_selector as selector

# Partition the feature columns by dtype: `object` columns are treated as
# categorical, columns of any other dtype as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# Calling a selector on the frame returns the list of matching column names.
categorical_columns = categorical_columns_selector(data)
numerical_columns = numerical_columns_selector(data)

In [3]:
# Preview only the columns picked up by the non-object dtype selector.
data[numerical_columns].head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,548,0,61,0,0,0,0,0,2,2008
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,460,298,0,0,0,0,0,0,5,2007
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,608,0,42,0,0,0,0,0,9,2008
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,642,0,35,272,0,0,0,0,2,2006
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,836,192,84,0,0,0,0,0,12,2008


In [4]:
# Explicit list of the columns fed to the models as numerical features.
# The order matters: it determines the column order of the transformed matrix.
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea",
    "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars", "GarageArea",
    "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch",
    "PoolArea", "MiscVal",
]

In [5]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer, MissingIndicator

# Numeric-only baseline: mean-imputed values are concatenated with
# missing-value indicator columns, the result is standardized, and a logistic
# regression is fit on top.
model = make_pipeline(
    make_union(
        SimpleImputer(strategy="mean"),
        MissingIndicator(),
    ),
    StandardScaler(),
    LogisticRegression(max_iter=500),
)

# 5-fold cross-validation restricted to the selected numerical features.
cv_result0 = cross_validate(
    estimator=model,
    X=data[numerical_features],
    y=target,
    cv=5,
)

In [6]:
# Display the raw cross-validation results (fit/score times and test scores).
cv_result0

{'fit_time': array([0.01204896, 0.01158977, 0.01072764, 0.0111115 , 0.01188731]),
 'score_time': array([0.00305557, 0.00298762, 0.00297427, 0.00296187, 0.00310659]),
 'test_score': array([0.89726027, 0.90753425, 0.90068493, 0.86643836, 0.90410959])}

In [7]:
# Per-fold test scores of the numeric-only model, summarized as mean +/- std.
scores0 = cv_result0["test_score"]
print(
    "The mean cross-validation accuracy is: "
    f"{scores0.mean():.3f} +/- {scores0.std():.3f}"
)

The mean cross-validation accuracy is: 0.895 +/- 0.015


In [11]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: fill missing entries with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
# NOTE: despite its name this pipeline does no scaling; the name is kept so
# that any other cell referring to it keeps working.
scaler_imputer_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Numerical branch: mean-imputed values concatenated with missing-value
# indicator columns, then standardized.
numerical_imputer = make_pipeline(
    make_union(
        SimpleImputer(missing_values=np.nan, strategy="mean"),
        MissingIndicator(missing_values=np.nan),
    ),
    StandardScaler(),
)

# ColumnTransformer expects a column *specification* for each transformer
# (names, positions, slice, boolean mask or callable), not the data itself.
# Passing `data[...]` DataFrames here was a bug: it is not a valid column
# specification and it would embed the dataset inside the estimator.
preprocessor = ColumnTransformer([
    ("cat", scaler_imputer_transformer, categorical_columns),
    ("num", numerical_imputer, numerical_features),
])

# Rebinds `model`: from this cell on it is the mixed categorical + numerical
# pipeline rather than the numeric-only one.
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
    

In [13]:
# Categorical columns: most-frequent imputation followed by one-hot encoding.
cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Numerical columns: mean imputation alongside missing-value indicator
# columns, followed by standardization.
num_transformer = make_pipeline(
    make_union(
        SimpleImputer(missing_values=np.nan, strategy="mean"),
        MissingIndicator(missing_values=np.nan),
    ),
    StandardScaler(),
)

# Route each group of columns (given by name) to its own transformer.
preprocessor = ColumnTransformer(
    [
        ("num-preprocessor", num_transformer, numerical_features),
        ("cat-preprocessor", cat_transformer, categorical_columns),
    ]
)
model2 = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

In [14]:
# 5-fold cross-validation of the full pipeline on the whole feature table;
# the ColumnTransformer inside `model2` picks the columns it needs.
cv_result = cross_validate(estimator=model2, X=data, y=target, cv=5)
cv_result

{'fit_time': array([0.11260724, 0.1158967 , 0.14876199, 0.12924623, 0.11455417]),
 'score_time': array([0.01083231, 0.01075196, 0.01074409, 0.0108006 , 0.01089644]),
 'test_score': array([0.92465753, 0.92123288, 0.93493151, 0.87328767, 0.93493151])}

In [15]:
# Per-fold test scores of the categorical + numerical model, as mean +/- std.
scores1 = cv_result["test_score"]
print(
    "The mean cross-validation accuracy is: "
    f"{scores1.mean():.3f} +/- {scores1.std():.3f}"
)

The mean cross-validation accuracy is: 0.918 +/- 0.023


In [31]:
# Gain in mean accuracy from adding the categorical features
# (full model minus numeric-only model).
print(scores1.mean() - scores0.mean()) 

0.022602739726027443
