# Predicting sale price of houses

The aim of this project is to build a machine learning model to predict the sale price of houses, based on multiple explanatory variables describing aspects of these houses.
The dataset used for this project is available on [Kaggle.com](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data).


# House price prediction : Wrap up

This notebook concludes our project, which consists of the following steps:
- 1\.  Data analysis
- 2\.  Feature engineering
- 3\.  Feature selection
- 4\.  Model building

In the following, we gather all the previous steps, from feature engineering to model building.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

import joblib

import warnings
warnings.simplefilter(action='ignore')

In [2]:
# Load the raw house-price dataset (path is relative to the notebook's working directory).
data = pd.read_csv('./House_Price/houseprice.csv')

# Quick sanity check: dimensions first, then a preview of the first rows.
print(data.shape)
data.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 1. Separating the dataset into train and test set

In [3]:
#SEED = 0

# Hold out 10% of the rows as a test set; random_state is fixed so the split is reproducible.
# NOTE: the full frame, including the "SalePrice" column, is passed as X. Later cells
# read X_train["SalePrice"] / X_test["SalePrice"], so the target column must stay in X here.
X_train, X_test, y_train, y_test = train_test_split(data, data["SalePrice"], test_size=0.1, random_state=0)

print(X_train.shape, X_test.shape)
X_train.head()

(1314, 81) (146, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
930,931,20,RL,73.0,8925,Pave,,IR1,HLS,AllPub,...,0,,,,0,7,2009,WD,Normal,201000
656,657,20,RL,72.0,10007,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,8,2008,WD,Normal,145500
45,46,120,RL,61.0,7658,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2010,WD,Normal,319900
1348,1349,20,RL,,16196,Pave,,IR3,Low,AllPub,...,0,,,,0,8,2007,WD,Normal,215000
55,56,20,RL,100.0,10175,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2008,WD,Normal,180500


In [4]:
# Read the list of selected feature names.
# Presumably produced by the earlier feature-selection step, one name per row — TODO confirm file layout.
features = pd.read_csv("selected_features.csv")

# Flatten the frame into a 1-D array of names; the reshape to (n_rows,) only
# succeeds when the CSV holds exactly one column.
features = np.reshape(features.values.tolist(), (features.shape[0], ))

print('Number of features: ', len(features))

Number of features:  21


## 2. Engineering missing values

### categorical
Here we replace missing values in the categorical features with the label "Missing".

In [5]:

# Selected object-dtype (categorical) features with at least one missing value in the training set.
list_cat_na = [col for col in features if X_train[col].isnull().sum() > 0 and X_train[col].dtypes == "O"]
list_cat_na

['MasVnrType',
 'BsmtQual',
 'BsmtExposure',
 'FireplaceQu',
 'GarageType',
 'GarageFinish']

In [6]:
# Treat "missing" as its own category. The column list was derived from the training
# set only; test-set columns outside this list are not filled by this cell.
X_train[list_cat_na] = X_train[list_cat_na].fillna('Missing')
X_test[list_cat_na] = X_test[list_cat_na].fillna('Missing')

In [7]:
# Verify that no missing values remain in the imputed training columns.
X_train[list_cat_na].isnull().sum()

MasVnrType      0
BsmtQual        0
BsmtExposure    0
FireplaceQu     0
GarageType      0
GarageFinish    0
dtype: int64

### numerical

In [8]:
# Selected non-object (numerical) features with at least one missing value in the training set.
list_num_var = [col for col in features if X_train[col].isnull().sum() > 0 and X_train[col].dtypes != 'O']

In [9]:
# An empty list here means no numerical imputation is needed for the selected features.
list_num_var

[]

## 3. Temporal variables : elapsed time

In [10]:
def elapsed_years(df, var):
    """Return a copy of ``df`` where ``var`` holds the years elapsed until sale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "YrSold" column and the year column ``var``.
    var : str
        Name of the year column to convert (e.g. a build or remodel year).

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``var`` is replaced by ``YrSold - var``.
        The input frame is left untouched.
    """
    # Work on a copy so the caller's frame is not silently modified.
    df = df.copy()

    df[var] = df['YrSold'] - df[var]

    return df

In [11]:
# Replace the remodel year by the number of years between remodel and sale.
# Not idempotent: the result is assigned back, so re-running this cell subtracts
# from YrSold a second time. Run it once per fresh kernel.
X_train = elapsed_years(X_train, 'YearRemodAdd')
X_test = elapsed_years(X_test, 'YearRemodAdd')

### numerical variables transformation

In [12]:
# Natural-log transform of the listed numerical columns (target included),
# applied to the training and test frames alike.
log_columns = ['LotFrontage', '1stFlrSF', 'GrLivArea', 'SalePrice']
for frame in (X_train, X_test):
    for col in log_columns:
        frame[col] = np.log(frame[col])

## 4. Categorical variables : treatment of rare labels and encoding

In [14]:
# Selected features stored as object dtype, i.e. the categorical variables still to be encoded.
cat_vars = [col for col in features if X_train[col].dtype == 'O']

cat_vars

['MSZoning',
 'Neighborhood',
 'RoofStyle',
 'MasVnrType',
 'BsmtQual',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'PavedDrive']

In [19]:
def find_frequent_label(df, col, tolerance):
    """Return the labels of ``col`` whose share of rows exceeds ``tolerance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col`` and a "SalePrice" column (used only for counting).
    col : str
        Name of the categorical column to inspect.
    tolerance : float
        Fraction of rows a label must strictly exceed to count as frequent.

    Returns
    -------
    pandas.Index
        Labels whose relative frequency is greater than ``tolerance``.
    """
    # Group by the `col` argument itself rather than by a name taken from the
    # enclosing scope, so the function works for any caller.
    # The frame is only read, so no defensive copy is needed.
    label_share = df.groupby(col)["SalePrice"].count() / len(df)
    return label_share[label_share > tolerance].index
    
for var in cat_vars:
    # Frequent labels are learned from the training set only (share of rows above 1%).
    freq_lb = find_frequent_label(X_train, var, 0.01)
    print(var)
    print(freq_lb, "\n")
    
    # Every label outside the frequent set is collapsed into "rare" in both splits.
    X_train[var] = np.where(X_train[var].isin(freq_lb), X_train[var], "rare")
    X_test[var] = np.where(X_test[var].isin(freq_lb), X_test[var], "rare")
    

MSZoning
Index(['FV', 'RH', 'RL', 'RM'], dtype='object', name='MSZoning') 

Neighborhood
Index(['Blmngtn', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',
       'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NWAmes',
       'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW',
       'Somerst', 'StoneBr', 'Timber'],
      dtype='object', name='Neighborhood') 

RoofStyle
Index(['Gable', 'Hip'], dtype='object', name='RoofStyle') 

MasVnrType
Index(['BrkFace', 'None', 'Stone'], dtype='object', name='MasVnrType') 

BsmtQual
Index(['Ex', 'Fa', 'Gd', 'Missing', 'TA'], dtype='object', name='BsmtQual') 

BsmtExposure
Index(['Av', 'Gd', 'Missing', 'Mn', 'No'], dtype='object', name='BsmtExposure') 

HeatingQC
Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='HeatingQC') 

CentralAir
Index(['N', 'Y'], dtype='object', name='CentralAir') 

KitchenQual
Index(['Ex', 'Fa', 'Gd', 'TA'], dtype='object', name='KitchenQual') 

FireplaceQu
Index(['Ex', 'Fa', 'Gd', 'Miss

In [20]:
def replace_cat(train, test, col):
    """Encode a categorical column as integers ordered by mean target.

    The labels of ``col`` are ranked by their mean "SalePrice" in ``train``
    (lowest mean gets 0). That mapping is then applied in place to ``col`` in
    both ``train`` and ``test``. The column name and the mapping are printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing ``col`` and "SalePrice"; modified in place.
    test : pandas.DataFrame
        Test frame containing ``col``; modified in place.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    int
        Always 1 (kept unchanged for existing callers).
    """
    # Use the `col` argument itself rather than a name taken from the enclosing
    # scope, so the function works for any caller.
    ordered_labels = train.groupby(col)["SalePrice"].mean().sort_values().index

    ordinal_label = {k: i for i, k in enumerate(ordered_labels, 0)}

    # Labels that never occur in `train` have no entry in the mapping and become NaN.
    train[col] = train[col].map(ordinal_label)
    test[col] = test[col].map(ordinal_label)

    print(col)
    print(ordinal_label)
    print()
    return 1

# Encode every categorical feature: the mapping is derived from X_train and applied to both frames.
for var in cat_vars:
    replace_cat(X_train, X_test, var)



MSZoning
{'rare': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4}

Neighborhood
{'IDOTRR': 0, 'MeadowV': 1, 'BrDale': 2, 'Edwards': 3, 'BrkSide': 4, 'OldTown': 5, 'Sawyer': 6, 'SWISU': 7, 'NAmes': 8, 'Mitchel': 9, 'SawyerW': 10, 'rare': 11, 'NWAmes': 12, 'Gilbert': 13, 'Blmngtn': 14, 'CollgCr': 15, 'Crawfor': 16, 'ClearCr': 17, 'Somerst': 18, 'Timber': 19, 'StoneBr': 20, 'NridgHt': 21, 'NoRidge': 22}

RoofStyle
{'Gable': 0, 'rare': 1, 'Hip': 2}

MasVnrType
{'None': 0, 'rare': 1, 'BrkFace': 2, 'Stone': 3}

BsmtQual
{'Missing': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}

BsmtExposure
{'Missing': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}

HeatingQC
{'rare': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}

CentralAir
{'N': 0, 'Y': 1}

KitchenQual
{'Fa': 0, 'TA': 1, 'Gd': 2, 'Ex': 3}

FireplaceQu
{'Po': 0, 'Missing': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

GarageType
{'Missing': 0, 'rare': 1, 'Detchd': 2, 'Basment': 3, 'Attchd': 4, 'BuiltIn': 5}

GarageFinish
{'Missing': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}

PavedDrive

## 5. Feature scaling

In [21]:
# Re-extract the targets from the frames: their "SalePrice" column was log-transformed
# after the split, whereas the y_train / y_test returned by train_test_split were not.
y_train = X_train['SalePrice']
y_test = X_test['SalePrice']

scaler = MinMaxScaler()

# The scaler is fitted on the selected features of the training set only.
scaler.fit(X_train[features])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [23]:
# Scale both splits with the training-fitted scaler, keeping only the selected features.
# X_train / X_test are overwritten with the scaler output (no longer the original frames);
# NOTE(review): re-running this cell presumably requires rebuilding the frames first.
X_train = scaler.transform(X_train[features])
X_test = scaler.transform(X_test[features])

## 6. Linear model : Lasso

In [24]:
#SEED = 0

# Lasso regression on the scaled features; random_state is fixed for reproducibility.
model = Lasso(alpha=0.005, random_state=0)

# y_train holds the log-transformed sale price, so the model predicts in log space.
model.fit(X_train, y_train)

# Persist the fitted model to disk.
joblib.dump(model, 'lasso_regression.pkl')

['lasso_regression.pkl']

In [25]:
# Targets and predictions are in log space, so both are exponentiated to report
# the errors in the original price units.
for split_name, X_split, y_split in (('train', X_train, y_train),
                                     ('test', X_test, y_test)):
    pred = model.predict(X_split)

    actual_price = np.exp(y_split)
    predicted_price = np.exp(pred)
    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(actual_price, predicted_price)

    print('{} mse: {}'.format(split_name, int(mse)))
    print('{} rmse: {}'.format(split_name, int(sqrt(mse))))
    print('{} r2: {}'.format(split_name, r2_score(actual_price, predicted_price)))
    print("\n")

# The statistic reported is the median of the training prices, so the label says so
# (it previously read "Average").
print('Median house price: ', int(np.exp(y_train).median()))

train mse: 1095464701
train rmse: 33097
train r2: 0.8245524987165792


test mse: 1415749527
test rmse: 37626
test r2: 0.7939863537248245


Average house price:  163000
