# House Price Prediction Project
### 1. Problem Definition:
- Goal: predict the sale price (`SalePrice`) of each house

### 2. Feature Selection:
- Choose the features used to train the ML model
- Use `Feature Engineering` to identify which features are needed

### 3. Splitting the datasets

##### 3.1 dataset -> X, y
- `data` : dataset
- `X` : `data[features]`
- `y` : target variable `SalePrice`

##### 3.2 X, y -> X_train, y_train, X_valid, y_valid

### 4. Training Machine Learning Model

### 5. Model Evaluation

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Initial load with pandas' default integer index; `Id` comes through as an ordinary column.
data = pd.read_csv("./data/train.csv")

In [3]:
# Preview the first five rows: checks the column layout and shows missing values (e.g. Alley, PoolQC).
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Reload with `Id` as the row index so the identifier becomes the row label instead of a data column.
data = pd.read_csv("./data/train.csv", index_col = "Id")

In [5]:
# Confirm that `Id` is now the index rather than a regular column.
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# List the available column names to choose features from (the target `SalePrice` is one of them).
data.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

## 2. Feature Selection

In [17]:
# Numeric columns used as model inputs; the order here is the column order of X.
features = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]

## 3. Splitting dataset into X and y

In [21]:
# Feature matrix (selected columns) and target vector (SalePrice), both indexed by Id.
X, y = data[features], data["SalePrice"]

In [22]:
# Re-display the source frame; selecting X and y above does not modify `data`.
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [23]:
# Check that X contains only the selected feature columns, indexed by Id.
X.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,8450,2003,856,854,2,3,8
2,9600,1976,1262,0,2,3,6
3,11250,2001,920,866,2,3,6
4,9550,1915,961,756,1,3,7
5,14260,2000,1145,1053,2,4,9


In [24]:
# Check the target: one SalePrice value per Id.
y.head()

Id
1    208500
2    181500
3    223500
4    140000
5    250000
Name: SalePrice, dtype: int64

### X, y -> X_train, y_train, X_valid, y_valid

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Training split size: (rows, features) -- 80% of the rows in X.
X_train.shape

(1168, 7)

In [28]:
# Full feature matrix before splitting: (rows, selected features).
X.shape

(1460, 7)

In [29]:
# Whole dataset: same row count as X, but with every column.
data.shape

(1460, 80)

In [30]:
# Validation split size: the remaining 20% of the rows.
X_valid.shape

(292, 7)

## 4. Training Machine Learning Model

In [31]:
from sklearn.tree import DecisionTreeRegressor

# Baseline regressor with default hyperparameters; random_state is fixed so the fitted tree is reproducible.
dt_model = DecisionTreeRegressor(random_state = 1)

In [32]:
# Fit the decision tree on the training split only; the validation rows are kept for evaluation.
dt_model.fit(X_train, y_train)

In [44]:
# Decision-tree price predictions for the held-out validation rows.
y_preds = dt_model.predict(X_valid)

In [45]:
# Display the decision-tree predictions.
# NOTE(review): this prints every validation prediction; consider `y_preds[:5]`, as is done for rf_val_preds.
y_preds

array([335000., 140200., 119000., 207500., 112000.,  58500., 232600.,
       119500., 755000., 156932., 213500., 100000., 244000., 130000.,
       144900., 123000., 236000., 126000., 154500., 128000., 141500.,
       149000.,  62383., 186500., 139000., 135000., 195000.,  92000.,
       271000., 105500., 141000., 178000., 110000., 258000., 253293.,
       177000., 190000., 130000., 210000., 315750., 237500., 174000.,
       176000., 326000., 290000., 142600., 105500., 142500., 146500.,
       135000., 315750., 124000., 218000., 126000., 184000., 130000.,
       119500., 226700., 160000.,  87000., 142000.,  55993., 140000.,
       155000., 290000., 174000.,  97000., 205950., 147000., 207500.,
       176000., 109000., 139000., 194500., 115000., 230000., 132500.,
       109500., 325300.,  90000., 135960., 134900., 127000., 147000.,
       197000., 194700.,  86000., 202500., 198900.,  60000., 188000.,
       177000., 174000., 238000., 206000.,  80000., 190000., 223500.,
        88000., 1350

In [47]:
# Actual vs. decision-tree predicted prices side by side; rows keep the Id index of y_valid.
pd.DataFrame(
    {
        "y": y_valid,
        "y_preds": y_preds,
    }
)

Unnamed: 0_level_0,y,y_preds
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
530,200624,335000.0
492,133000,140200.0
460,110000,119000.0
280,192000,207500.0
656,88000,112000.0
...,...,...
327,324000,260400.0
441,555000,451950.0
1388,136000,107500.0
1324,82500,72500.0


In [48]:
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
)

# Second model: a random forest with default hyperparameters, seeded for reproducibility
# and trained on the same split as the decision tree.
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(X_train, y_train)

In [49]:
# Random-forest predictions for the same validation rows, for comparison with y_preds.
rf_val_preds = rf_model.predict(X_valid)

In [50]:
# Show only the first five predictions to keep the output short.
rf_val_preds[:5]

array([271690., 155039., 122024., 188915.,  91147.])

### Predict with a new input

In [51]:
# Reference rows: shows the column order and typical values of the model inputs.
X_valid.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
530,32668,1957,2515,0,3,4,9
492,9490,1941,958,620,1,3,5
460,7015,1950,979,224,1,3,5
280,10005,1977,1156,866,2,4,8
656,1680,1971,525,567,1,3,6


In [52]:
# Describe the new house as a one-row DataFrame that carries the training column names.
# The forest was fitted on a DataFrame, so a bare nested list makes scikit-learn warn that
# X lacks valid feature names, and the values would be matched to columns purely by position.
new_house = pd.DataFrame(
    [[6969, 2021, 1000, 800, 4, 5, 8]],
    columns=features,  # LotArea, YearBuilt, 1stFlrSF, 2ndFlrSF, FullBath, BedroomAbvGr, TotRmsAbvGrd
)
rf_model.predict(new_house)



array([206158.4])

## 5. Model Evaluation

In [53]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def evaluate_predictions(label, y_true, y_pred):
    """Print and return MAE, MSE and R^2 for one model's predictions.

    Parameters
    ----------
    label : str
        Model name used in the report heading ("<label> Model Evaluation:").
    y_true : array-like
        Actual target values.
    y_pred : array-like
        Predicted target values, in the same row order as ``y_true``.

    Returns
    -------
    tuple of float
        ``(mae, mse, r2)``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"{label} Model Evaluation:")
    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("R-squared (R^2):", r2)
    return mae, mse, r2


# Decision Tree Model Evaluation
dt_mae, dt_mse, dt_r2 = evaluate_predictions("Decision Tree", y_valid, y_preds)

# Blank line between the two reports
print()

# Random Forest Model Evaluation
# Predictions are recomputed here so this cell does not depend on an earlier cell's variable.
rf_val_preds = rf_model.predict(X_valid)
rf_mae, rf_mse, rf_r2 = evaluate_predictions("Random Forest", y_valid, rf_val_preds)


Decision Tree Model Evaluation:
Mean Absolute Error (MAE): 33842.32876712329
Mean Squared Error (MSE): 3228225028.8561645
R-squared (R^2): 0.5325376245236801

Random Forest Model Evaluation:
Mean Absolute Error (MAE): 24069.385498858446
Mean Squared Error (MSE): 1489870679.523267
R-squared (R^2): 0.7842596223072973
