# House Price Prediction with scikit-learn


In [1]:
import pandas as pd
import numpy as np

import os

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline

from sklearn.linear_model import Ridge

## Dataset

In [2]:
# get train and test data set
data_loc = './data'

train_data_base = pd.read_csv(os.path.join(data_loc,'train.csv'), index_col='Id')
test_data_base = pd.read_csv(os.path.join(data_loc,'test.csv'), index_col='Id')

In [3]:
# split a validation set from the full train set
train_set, val_set = train_test_split(train_data_base, test_size=0.2, random_state=42)

# copy the test set for preprocessing
test_set = test_data_base.copy()

In [4]:
train_data_base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

## Preprocessing

The same preprocessing steps are applied independantly to the train, test and validation set.

The data set containes nummerical and categorical values. There are a few features that have a grading system in words, which can easily be ranked, like from "poor" to "excellent". These can be mapped to a numerical value.

In [5]:
# mapping categorical to numerical features
map1 = {'Reg':0, 'IR1': 1, 'IR2':2, 'IR3':3}
map2 = {'Po':0, 'Fa':1, 'TA':2, 'Gd': 3, 'Ex':4}
map3 = {'Gtl':0, 'Mod':1, 'Sev':2}
map4 = {'NA':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd': 4, 'Ex':5}
map5 = {'NA':0, 'No':1, 'Mn':2, 'Av':3, 'Gd': 4}
map6 = {'NA':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ': 4, 'ALQ':5, 'GLQ':6}
map7 = {'NA':0, 'Unf':1, 'RFn':2, 'Fin':3}
map8 = {'N':0, 'P':1, 'Y':2}
map9 = {'NA':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}

def MapNumerical(feature, mapping):
    train_set.loc[:, feature] = train_set[feature].map(mapping)
    val_set.loc[:, feature] = val_set[feature].map(mapping)
    test_set.loc[:, feature] = test_set[feature].map(mapping)

In [7]:
# assign the maps to the according categorical features
map_feature_lst = [('LotShape', map1), ('HeatingQC', map2), ('KitchenQual', map2), ('LandSlope', map3),
                   ('ExterQual', map2), ('ExterCond', map2), ('BsmtQual', map4), ('BsmtCond', map4), 
                   ('BsmtExposure', map5), ('BsmtFinType1', map6), ('BsmtFinType2', map6), ('FireplaceQu', map4),
                   ('GarageFinish', map7), ('GarageQual', map4), ('GarageCond', map4), ('PavedDrive', map8),
                   ('PoolQC', map9)]

for feature, mapping in map_feature_lst:
    MapNumerical(feature, mapping)

Also binary values can directely be encoded to 1/0 values.

In [31]:
binmap1 = {'Grvl':0, 'Pave':1}
binmap2 = {'N':0, 'Y':1}

MapNumerical('Street', binmap1)
MapNumerical('CentralAir', binmap2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [24]:
# get a list of all features which are now numerical
features_num = train_set.select_dtypes(include=['float64', 'int64']).columns.drop(['SalePrice']).tolist()

In [30]:
# define train, val and test data
X_train = train_set[features_num].to_numpy()
y_train = np.log(train_set['SalePrice']).to_numpy()

X_val = val_set[features_num].to_numpy()
y_val = np.log(val_set['SalePrice']).to_numpy()

X_test = test_set[features_num].to_numpy()

In [32]:
# set up the ML pipeline

# imputing strategy for missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# scale values
scaler = StandardScaler()

# regression
regressor = Ridge(alpha=50)

# pipeline
pipe = Pipeline([('imputer', imputer), ('scaler', scaler), ('regressor', regressor)])

In [33]:
# cross validation
cv_scores = (-cross_val_score(pipe, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))**0.5

print(cv_scores)
print(cv_scores.mean())

[0.12875578 0.15933174 0.12557816 0.18588798 0.26371593 0.1637568
 0.12494256 0.11219543 0.1553962  0.10757099]
0.15271315770076668


In [34]:
pipe.fit(X_train,y_train)

y_predict = np.exp(pipe.predict(X_test))

In [36]:
submission = pd.DataFrame(y_predict, columns=['SalePrice'])
submission['Id'] = test_data_base.index
submission['Id'].astype('int')

submission.to_csv('submission.csv', index=None)