In [1]:
import pandas as pd
import numpy as np

In [2]:
from pandas_profiling import ProfileReport

In [3]:
df = pd.read_csv('train.csv')

In [4]:
pd.set_option('display.max_rows', 100)

## Separate features by type

In [5]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Select categorical features

In [6]:
cols_cat = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
        'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
        'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 
        'CentralAir', 'Electrical', 'Functional', 'GarageType', 'PavedDrive', 'MiscFeature', 'SaleType', 'SaleCondition']

In [7]:
df[cols_cat] = df[cols_cat].astype('string')

### Select numeric features

In [8]:
df.select_dtypes(exclude = ['object', 'string']).columns

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [9]:
cols_num = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF','EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice']

In [10]:
type(cols_num)

list

### Select ordinal categories (remove Id)

In [11]:
cols_ord = [col for col in df.columns if col not in cols_cat + cols_num]

In [12]:
cols_ord.remove("Id")

In [13]:
df[cols_ord] = df[cols_ord].astype('string')

In [14]:
cols_ord

['ExterQual',
 'ExterCond',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence']

## Impute NaNs

In [15]:
df_nans = df.isna().sum().reset_index().rename(columns={0:'num_nans', 'index':'col_names'})

In [16]:
cols_with_nans = df_nans[df_nans.num_nans != 0].col_names.to_list()

In [17]:
df[cols_with_nans].dtypes

LotFrontage     float64
Alley            string
MasVnrType       string
MasVnrArea      float64
BsmtQual         string
BsmtCond         string
BsmtExposure     string
BsmtFinType1     string
BsmtFinType2     string
Electrical       string
FireplaceQu      string
GarageType       string
GarageYrBlt     float64
GarageFinish     string
GarageQual       string
GarageCond       string
PoolQC           string
Fence            string
MiscFeature      string
dtype: object

In [18]:
cols_cat_with_nans = list(set(cols_with_nans) & set(cols_cat))

In [19]:
cols_ord_with_nans = list(set(cols_with_nans) & set(cols_ord))

In [20]:
cols_num_with_nans = list(set(cols_with_nans) & set(cols_num))

 ### Fill missing values in categorical features

In [21]:
df[cols_cat_with_nans] = df[cols_cat_with_nans].fillna('NoInfo')

### Fill missing values in GarageYrBlt with YearBuilt. The missing values are from variables with no garage taken into account in other feature.

In [22]:
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['YearBuilt'])

### Fill missing values in LotFrontage by using LotArea

In [23]:
df['LotFrontage'] = df['LotFrontage'].fillna(np.sqrt(df["LotArea"])/np.mean(np.sqrt(df["LotArea"])/(df["LotFrontage"])))

### Fill missing values in ordinal features with NoInfo

In [24]:
df[cols_ord_with_nans] = df[cols_ord_with_nans].fillna('NoInfo')

### Eliminate the rows with remaining NaNs

In [25]:
df = df.dropna(axis=0, how='any')

## Encode Categorical features with dummy variables

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
enc_df = pd.get_dummies(df, columns=cols_cat, prefix=cols_cat, drop_first=True)

## Encode ordinal categories
### Create train and test subsets for Ordinal Encoding

In [28]:
data = df[cols_ord+['SalePrice']].values
x_enc = df[cols_ord]
y_enc = df['SalePrice']
print('Input', x_enc.shape)
print('Output', y_enc.shape)

Input (1452, 13)
Output (1452,)


### Fit and encode

In [29]:
map_ord = [['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'No', 'Mn', 'Av', 'Gd'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'Unf', 'RFn', 'Fin'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
           ['NoInfo', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']]           

In [30]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
ordinal_enc = OrdinalEncoder(categories = map_ord)
ordinal_enc.fit(x_enc)

In [31]:
ordinal_enc.categories_

[array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'No', 'Mn', 'Av', 'Gd'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Unf', 'RFn', 'Fin'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NoInfo', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'], dtype=object)]

In [32]:
x_enc = ordinal_enc.transform(x_enc)

In [33]:
enc_df[cols_ord] = x_enc

In [34]:
y_enc = np.log(1+enc_df['SalePrice'])

In [35]:
print(y_enc, enc_df['SalePrice'])

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1455    12.072547
1456    12.254868
1457    12.493133
1458    11.864469
1459    11.901590
Name: SalePrice, Length: 1452, dtype: float64 0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1452, dtype: int64


## Separate train and test subsets(and transform target variable)

In [36]:
from sklearn.model_selection import train_test_split
data = enc_df.to_numpy()
Id = enc_df.pop('Id')
y = np.log(1+enc_df.pop('SalePrice'))
X = enc_df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 

In [37]:
pd.set_option('display.max_rows', 500)
X.shape

(1452, 235)

# Feature Selection with RandomForestRegressor and Recursive Feature Elimination

In [38]:
%%time
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
clf = RandomForestRegressor(n_estimators = 100, random_state=42)
sel = RFE(estimator=clf, n_features_to_select=0.1, step=1)
sel.fit(X_train, y_train)

CPU times: user 12min 1s, sys: 602 ms, total: 12min 1s
Wall time: 12min 1s


In [43]:
selected_feat= X_train.columns[(sel.get_support())]
len(selected_feat)

23

In [44]:
selected_feat

Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'BsmtQual', 'BsmtFinSF1',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'KitchenQual', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'OpenPorchSF', 'MoSold', 'GarageType_Detchd'],
      dtype='object')

In [45]:
sel_train = X_train[selected_feat]
sel_test = X_test[selected_feat]

# Train Regression Models with Selected Features

### LinearRegression

In [46]:
from sklearn.linear_model import LinearRegression
linreg=LinearRegression()
linreg.fit(sel_train,y_train)
y_linreg=linreg.predict(sel_test)

### Ridge Regression

In [47]:
from sklearn.linear_model import Ridge
ridgereg=Ridge()
ridgereg.fit(sel_train,y_train)
y_ridge=ridgereg.predict(sel_test)

### Lasso Regression

In [49]:
from sklearn.linear_model import Lasso
lassoreg=Lasso()
lassoreg.fit(sel_train,y_train)
y_lasso=lassoreg.predict(sel_test)

### Random Forest Regression

In [50]:
from sklearn.ensemble import RandomForestRegressor
rfreg=RandomForestRegressor()
rfreg.fit(sel_train,y_train)
y_rf=rfreg.predict(sel_test)

# Model Evaluation with MSE and $R^2$

In [53]:
from sklearn.metrics import mean_squared_error, r2_score
linmse=mean_squared_error(y_test,y_linreg)
ridgemse=mean_squared_error(y_test,y_ridge)
lassomse=mean_squared_error(y_test, y_lasso)
rfmse=mean_squared_error(y_test,y_rf)
linr2=r2_score(y_test,y_linreg)
ridger2=r2_score(y_test,y_ridge)
lassor2=r2_score(y_test,y_lasso)
rfr2=r2_score(y_test,y_rf)
print('Linear Regression: MSE= ', linmse,'  | R2= ',linr2)
print('Ridge Regression:  MSE= ', ridgemse, ' | R2= ', ridger2)
print('Lasso Regression:  MSE= ', lassomse, ' | R2= ', lassor2)
print('Random Forest Reg: MSE= ', rfmse, ' | R2= ', rfr2)

Linear Regression: MSE=  0.02721808514074553   | R2=  0.8329168422123578
Ridge Regression:  MSE=  0.027194052952456582  | R2=  0.8330643681638383
Lasso Regression:  MSE=  0.035627424670156733  | R2=  0.7812945845767869
Random Forest Reg: MSE=  0.022976741132817245  | R2=  0.8589531025313283


## Probando

In [None]:
prof = ProfileReport(df)

In [None]:
#prof.to_file(output_file='profilehousetrain.html')