In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the labelled training data and the test data from the working directory.
train, X_test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Show the trailing columns (position 72 onward); the frame has 81 columns in total.
train.iloc[:, 72:]

Unnamed: 0,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,,,,0,2,2008,WD,Normal,208500
1,,,,0,5,2007,WD,Normal,181500
2,,,,0,9,2008,WD,Normal,223500
3,,,,0,2,2006,WD,Abnorml,140000
4,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...
1455,,,,0,8,2007,WD,Normal,175000
1456,,MnPrv,,0,2,2010,WD,Normal,210000
1457,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,,,,0,4,2010,WD,Normal,142125


In [4]:
from sklearn.model_selection import train_test_split

# Separate the target (SalePrice) from the predictors.
# NOTE(review): every other column, including `Id`, stays in X — confirm that is intended.
y = train['SalePrice']
X = train.drop(columns='SalePrice')

# Hold out 25% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1121,
)

In [5]:
# Names of every object-dtype (categorical) column in the training split
object_cols = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype == "object"
]

# Keep only the columns whose set of values is identical in both splits
good_label_cols = [
    name
    for name in object_cols
    if set(X_train[name]) == set(X_valid[name])
]

# Every remaining categorical column is treated as problematic
bad_label_cols = list(set(object_cols) - set(good_label_cols))

# Remove the problematic columns from both splits
X_train = X_train.drop(columns=bad_label_cols)
X_valid = X_valid.drop(columns=bad_label_cols)

In [7]:
# Summary statistics, one row per numeric column; show the first 11 rows.
X_train.describe().T.head(11)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1095.0,739.581735,412.416975,1.0,392.5,739.0,1094.5,1460.0
MSSubClass,1095.0,56.630137,42.147725,20.0,20.0,50.0,70.0,190.0
LotFrontage,906.0,70.100442,24.104099,21.0,59.0,70.0,80.0,313.0
LotArea,1095.0,10574.406393,10951.131951,1300.0,7447.5,9452.0,11489.0,215245.0
OverallQual,1095.0,6.085845,1.364853,1.0,5.0,6.0,7.0,10.0
OverallCond,1095.0,5.578082,1.123569,1.0,5.0,5.0,6.0,9.0
YearBuilt,1095.0,1971.548858,30.163163,1872.0,1954.0,1973.0,2001.0,2010.0
YearRemodAdd,1095.0,1985.078539,20.459244,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1087.0,101.528059,174.826509,0.0,0.0,0.0,163.0,1378.0
BsmtFinSF1,1095.0,430.004566,428.544331,0.0,0.0,378.0,694.5,2260.0


In [9]:
# Boolean mask over the columns: True where a column has at least one null.
missing = X_train.isnull().sum().gt(0)

In [10]:
# Null counts, restricted to the columns flagged in `missing`.
X_train.isnull().sum().loc[missing]

LotFrontage      189
Alley           1037
MasVnrArea         8
BsmtQual          28
BsmtExposure      29
BsmtFinType1      28
BsmtFinType2      29
FireplaceQu      517
GarageType        56
GarageYrBlt       56
GarageFinish      56
Fence            884
dtype: int64

In [14]:
# Names of all columns that contain at least one missing value
nan_cols = X_train.columns[X_train.isnull().any().to_numpy()].tolist()
print(nan_cols)

['LotFrontage', 'Alley', 'MasVnrArea', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'Fence']


In [15]:
# Column names of every numeric-dtype feature in the training split
numerical_features = list(X_train.select_dtypes(include='number').columns)

print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

There are 37 numerical features: 

['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [16]:
# Column names of every non-numeric feature in the training split
categorical_features = list(X_train.select_dtypes(exclude='number').columns)

print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

There are 24 categorical features: 

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofStyle', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'CentralAir', 'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish', 'PavedDrive', 'Fence']


Now, on to preprocessing. For numeric columns, we first fill the missing values with the column mean using SimpleImputer and then scale the features with MinMaxScaler. For categorical columns, we use SimpleImputer to fill the missing values with the mode of each column and then one-hot encode them with OneHotEncoder. Most importantly, we do all of this in a pipeline. Let's import everything:

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

In [18]:
# Numeric columns: fill missing values with the column mean, then rescale
# with MinMaxScaler.
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

# Dense one-hot encoder that ignores categories unseen during fit.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new name first and fall back to the old one on
# releases that do not know it yet.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode.
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', one_hot_encoder)
])

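By default, Pipeline objects have fit and transform methods, which can be used to transform an input array. Below, a ColumnTransformer applies the numeric pipeline to the numeric columns and the categorical pipeline to the categorical columns: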

In [19]:
from sklearn.compose import ColumnTransformer

# Send each group of columns through its own preprocessing pipeline.
full_processor = ColumnTransformer(
    [
        ('number', numeric_pipeline, numerical_features),
        ('category', categorical_pipeline, categorical_features),
    ]
)

In [20]:
# Fit the column transformer on the training split and display the transformed array.
full_processor.fit_transform(X_train)

array([[0.12816998, 0.17647059, 0.13356164, ..., 0.        , 1.        ,
        0.        ],
       [0.16518163, 0.05882353, 0.06506849, ..., 0.        , 1.        ,
        0.        ],
       [0.45579164, 0.23529412, 0.29109589, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.36257711, 0.        , 0.1681522 , ..., 0.        , 1.        ,
        0.        ],
       [0.42015079, 0.        , 0.16780822, ..., 0.        , 1.        ,
        0.        ],
       [0.51953393, 0.82352941, 0.01027397, ..., 0.        , 1.        ,
        0.        ]])

Combine preprocessing and modeling (the estimator) in a single pipeline. Order is crucial: the preprocessing step must come before the model.

In [22]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

# Lasso regressor with an initial regularisation strength of 0.1
lasso = Lasso(alpha=0.1)

# Chain the preprocessing step and the model into one estimator
lasso_pipeline = Pipeline(
    [
        ('preprocess', full_processor),
        ('model', lasso),
    ]
)

In [37]:
# Fit preprocessing and model on the training split; assigning to `_` keeps
# the fitted pipeline's repr out of the cell output.
_ = lasso_pipeline.fit(X_train, y_train)

In [24]:
# Predict on the hold-out split and report the mean absolute error.
preds = lasso_pipeline.predict(X_valid)
mean_absolute_error(y_true=y_valid, y_pred=preds)

19775.341796114233

In [26]:
# Score the fitted pipeline on the validation split.
# NOTE(review): presumably R^2, the default `score` of scikit-learn regressors — confirm.
lasso_pipeline.score(X_valid, y_valid)

0.79485193538996

We will now do a grid search. The main hyperparameter for Lasso is $\alpha$. We will search values between 0 and 1 in steps of 0.05.
We will also use cv=10, which specifies the number of cross-validation folds (plain KFold for a regressor such as Lasso; StratifiedKFold applies only to classifiers). The default is 5.

In [27]:
from sklearn.model_selection import GridSearchCV

# Candidate alphas 0.05, 0.10, ..., 0.95.
# The leading 0 of the arange is dropped: scikit-learn advises against fitting
# Lasso with alpha=0 (it is plain least squares and the solver is not meant
# for it). Slicing keeps the remaining grid values bit-identical.
param_dict = {'model__alpha': np.arange(0, 1, 0.05)[1:]}

# 10-fold cross-validated search, scored by negated mean absolute error
# (scikit-learn maximises scores, so the error is negated).
search = GridSearchCV(lasso_pipeline, param_dict, 
                      cv=10, 
                      scoring='neg_mean_absolute_error')

_ = search.fit(X_train, y_train)

In [28]:
# The search is scored with 'neg_mean_absolute_error', so abs() turns the
# best score back into a positive mean absolute error.
print('Best score:', abs(search.best_score_))

Best score: 18124.680421519995


In [29]:
# Parameter setting that achieved the best cross-validated score.
print('Best alpha:', search.best_params_)

Best alpha: {'model__alpha': 0.9500000000000001}


The best $\alpha$ is close to the upper end of the interval, so we will now redefine the search range
to run from 1 up to (but not including) 201 in steps of 5.

In [33]:
# Wider grid: alpha = 1, 6, 11, ..., 196
param_dict = {'model__alpha': np.arange(1, 201, 5)}

# Same 10-fold search as before, scored by negated mean absolute error
search = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=param_dict,
    cv=10,
    scoring='neg_mean_absolute_error',
)

_ = search.fit(X_train, y_train)

In [35]:
# The search is scored with 'neg_mean_absolute_error', so abs() turns the
# best score back into a positive mean absolute error.
print('Best score:', abs(search.best_score_))

Best score: 17143.590547138254


In [36]:
# Parameter setting that achieved the best cross-validated score.
print('Best alpha:', search.best_params_)

Best alpha: {'model__alpha': 131}


In [38]:
# Rebuild the pipeline with the alpha selected by the most recent grid search.
# Reading it from `search.best_params_` (rather than hand-copying the printed
# value) keeps this cell in sync if the data or the grid ever change.
best_alpha = search.best_params_['model__alpha']
lasso = Lasso(alpha=best_alpha)

lasso_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', lasso)
])

In [41]:
# Refit the rebuilt pipeline on the training split.
_ = lasso_pipeline.fit(X_train, y_train)

# Predict on the hold-out split and report the mean absolute error.
preds = lasso_pipeline.predict(X_valid)
mean_absolute_error(y_true=y_valid, y_pred=preds)

18690.487087904432

In [45]:
# Sanity check: the validation targets and the predictions should have the same shape.
y_valid.shape, preds.shape

((365,), (365,))

In [48]:
# Side-by-side table of the actual validation targets and the pipeline's
# predictions (the duplicated `d = d = ...` assignment was a typo).
d = {'Y_valid': y_valid, 'Predicted': preds}
pd.DataFrame(d)

Unnamed: 0,Y_valid,Predicted
119,163990,177707.822660
938,239799,233882.295253
1124,163900,167949.963028
1255,127500,133213.399031
748,260400,276708.660981
...,...,...
39,82000,63527.777452
111,180000,170632.159697
623,168500,179170.100771
1240,224900,203388.842042
