# Wrap Up Quiz


In [1]:
import pandas as pd
import numpy as np
# Disable jedi autocompleter
%config Completer.use_jedi = False

In [2]:
df = pd.read_csv("data/house_prices.csv")
df.sample(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1344,1345,60,RL,85.0,11103,Pave,?,IR1,Lvl,AllPub,...,0,?,?,?,0,7,2007,New,Partial,155835
234,235,60,RL,?,7851,Pave,?,Reg,Lvl,AllPub,...,0,?,?,?,0,5,2010,WD,Normal,216500
801,802,30,RM,40.0,4800,Pave,?,Reg,Lvl,AllPub,...,0,?,?,?,0,7,2007,WD,Normal,109900
246,247,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,...,0,?,?,?,0,4,2006,WD,Normal,137000
468,469,20,RL,98.0,11428,Pave,?,IR1,Lvl,AllPub,...,0,?,?,?,0,5,2007,WD,Normal,250000


In [3]:
df = df.drop(columns='Id')
target = df['SalePrice']
data = df.drop(columns='SalePrice')


The column `SalePrice` contains the target variable. Note that we instructed pandas to treat the character '?' as a marker for cells with missing values.

Furthermore, we ignore the column named "`Id`" because unique identifiers are usually useless in the context of predictive modeling.

We did not encounter any regression problem yet. Therefore, we will convert the regression target into a classification target to predict whether or not an house is expensive. "Expensive isdefined as a sale price greater than $200,000.


In [4]:
target = (target > 200_000).astype(int)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSSubClass     1460 non-null   int64 
 1   MSZoning       1460 non-null   object
 2   LotFrontage    1460 non-null   object
 3   LotArea        1460 non-null   int64 
 4   Street         1460 non-null   object
 5   Alley          1460 non-null   object
 6   LotShape       1460 non-null   object
 7   LandContour    1460 non-null   object
 8   Utilities      1460 non-null   object
 9   LotConfig      1460 non-null   object
 10  LandSlope      1460 non-null   object
 11  Neighborhood   1460 non-null   object
 12  Condition1     1460 non-null   object
 13  Condition2     1460 non-null   object
 14  BldgType       1460 non-null   object
 15  HouseStyle     1460 non-null   object
 16  OverallQual    1460 non-null   int64 
 17  OverallCond    1460 non-null   int64 
 18  YearBuilt      1460 non-null

In [6]:
df['SalePrice'].value_counts()

140000    20
135000    17
145000    14
155000    14
190000    13
          ..
84900      1
424870     1
415298     1
62383      1
34900      1
Name: SalePrice, Length: 663, dtype: int64

In [7]:
# Number of features available to predict whether or not a house is expensive
len(data.columns)

79

In [8]:
# numerical and categorical features
df.dtypes.value_counts()

object    46
int64     34
dtype: int64

We consider the following numerical columns:

In [9]:
numerical_features = [
  "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
  "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
  "GrLivArea", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
  "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
  "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

In [10]:
data[numerical_features].dtypes

LotFrontage      object
LotArea           int64
MasVnrArea       object
BsmtFinSF1        int64
BsmtFinSF2        int64
BsmtUnfSF         int64
TotalBsmtSF       int64
1stFlrSF          int64
2ndFlrSF          int64
LowQualFinSF      int64
GrLivArea         int64
BedroomAbvGr      int64
KitchenAbvGr      int64
TotRmsAbvGrd      int64
Fireplaces        int64
GarageCars        int64
GarageArea        int64
WoodDeckSF        int64
OpenPorchSF       int64
EnclosedPorch     int64
3SsnPorch         int64
ScreenPorch       int64
PoolArea          int64
MiscVal           int64
dtype: object

As we can see from above, two of the numerical features; `LotFrontage` and `MasVnrArea` are actually objects. To convert them to numericals we will need first to replace string values to int or float:

In [11]:
data['LotFrontage'] = data['LotFrontage'].replace('?', np.nan).astype(float)

In [12]:
data['MasVnrArea'] = data['MasVnrArea'].replace('?', np.nan).astype(float)

In [13]:
data_numerical = data[numerical_features]
data_numerical.dtypes

LotFrontage      float64
LotArea            int64
MasVnrArea       float64
BsmtFinSF1         int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BedroomAbvGr       int64
KitchenAbvGr       int64
TotRmsAbvGrd       int64
Fireplaces         int64
GarageCars         int64
GarageArea         int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
MiscVal            int64
dtype: object

Now create a predictive model that uses these numerical columns as input data. Your predictive model should be a pipeline composed of a standard scaler, a mean imputer (cf. `sklearn.impute.SimpleImputer(strategy="mean")`) and a `sklearn.linear_model.LogisticRegression`.

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [18]:
model = make_pipeline(StandardScaler(), 
                      SimpleImputer(missing_values=np.nan, strategy='mean'), 
                      LogisticRegression())

In [19]:
# Cross-validation

from sklearn.model_selection import cross_validate

res = cross_validate(model, data_numerical, target, cv=5)

scores = res['test_score']
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

The mean cross-validation accuracy is: 0.895 +/- 0.011
