# House Prices - Advanced Regression Techniques
## Predict sales prices and practice feature engineering, RFs, and gradient boosting
This project is from [Kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview).

### Practice Skills
- Creative feature engineering 
- Advanced regression techniques like random forest and gradient boosting

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text

%matplotlib inline

## Exploratory Data Analysis (EDA)

In [15]:
# Read the training data from train.csv and preview the first five rows.
df_train = pd.read_csv(filepath_or_buffer="train.csv")
df_train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [16]:
# Show the dtype pandas assigned to each column of the loaded frame.
df_train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

## Data cleaning

In [17]:
# Normalise every column header to lower case, then preview two rows.
df_train.columns = df_train.columns.map(str.lower)
df_train.head(n=2)

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [18]:
# Translate the numeric mssubclass codes into their descriptive names.
mssubclass_values = {
    20: "1-STORY 1946 & NEWER ALL STYLES",
    30: "1-STORY 1945 & OLDER",
    40: "1-STORY W/FINISHED ATTIC ALL AGES",
    45: "1-1/2 STORY - UNFINISHED ALL AGES",
    50: "1-1/2 STORY FINISHED ALL AGES",
    60: "2-STORY 1946 & NEWER",
    70: "2-STORY 1945 & OLDER",
    75: "2-1/2 STORY ALL AGES",
    80: "SPLIT OR MULTI-LEVEL",
    85: "SPLIT FOYER",
    90: "DUPLEX - ALL STYLES AND AGES",
    120: "1-STORY PUD (Planned Unit Development) - 1946 & NEWER",
    150: "1-1/2 STORY PUD - ALL AGES",
    160: "2-STORY PUD - 1946 & NEWER",
    180: "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER",
    190: "2 FAMILY CONVERSION - ALL STYLES AND AGES",
}

# Only map while the column still holds the numeric codes. Once it contains
# the string labels, mapping again would match no dictionary key and replace
# the whole column with NaN, so a re-run of this cell must be a no-op.
if pd.api.types.is_numeric_dtype(df_train["mssubclass"]):
    df_train["mssubclass"] = df_train["mssubclass"].map(mssubclass_values)
df_train.head(2)


Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,2-STORY 1946 & NEWER,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,1-STORY 1946 & NEWER ALL STYLES,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [19]:
# Translate the numeric overallqual / overallcond ratings into their names.
# Both columns use the same 1-10 scale, so the labels are defined once.
overallqual_values = {
    10: "Very Excellent",
    9: "Excellent",
    8: "Very Good",
    7: "Good",
    6: "Above Average",
    5: "Average",
    4: "Below Average",
    3: "Fair",
    2: "Poor",
    1: "Very Poor",
}
# Kept as a separate dict under its original name for any cell that uses it.
overallcond_values = dict(overallqual_values)

# Only map while a column still holds the numeric ratings. Mapping the
# already-translated strings would match no key and turn the column into NaN,
# so re-running this cell must leave translated columns untouched.
for rating_col, rating_labels in (
    ("overallqual", overallqual_values),
    ("overallcond", overallcond_values),
):
    if pd.api.types.is_numeric_dtype(df_train[rating_col]):
        df_train[rating_col] = df_train[rating_col].map(rating_labels)
df_train.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,2-STORY 1946 & NEWER,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,1-STORY 1946 & NEWER ALL STYLES,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,2-STORY 1946 & NEWER,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,2-STORY 1945 & OLDER,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,2-STORY 1946 & NEWER,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [20]:
# Normalise every object-typed column: spaces become underscores and the
# text is lower-cased.
object_columns = df_train.columns[df_train.dtypes == "object"]
for column_name in object_columns:
    underscored = df_train[column_name].str.replace(" ", "_")
    df_train[column_name] = underscored.str.lower()

df_train.head(n=3)

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,2-story_1946_&_newer,rl,65.0,8450,pave,,reg,lvl,allpub,...,0,,,,0,2,2008,wd,normal,208500
1,2,1-story_1946_&_newer_all_styles,rl,80.0,9600,pave,,reg,lvl,allpub,...,0,,,,0,5,2007,wd,normal,181500
2,3,2-story_1946_&_newer,rl,68.0,11250,pave,,ir1,lvl,allpub,...,0,,,,0,9,2008,wd,normal,223500


In [21]:
# Summary statistics of the numerical columns, one row per column,
# rounded to whole numbers.
df_train.describe().T.round(0)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,1460.0,730.0,422.0,1.0,366.0,730.0,1095.0,1460.0
lotfrontage,1201.0,70.0,24.0,21.0,59.0,69.0,80.0,313.0
lotarea,1460.0,10517.0,9981.0,1300.0,7554.0,9478.0,11602.0,215245.0
yearbuilt,1460.0,1971.0,30.0,1872.0,1954.0,1973.0,2000.0,2010.0
yearremodadd,1460.0,1985.0,21.0,1950.0,1967.0,1994.0,2004.0,2010.0
masvnrarea,1452.0,104.0,181.0,0.0,0.0,0.0,166.0,1600.0
bsmtfinsf1,1460.0,444.0,456.0,0.0,0.0,384.0,712.0,5644.0
bsmtfinsf2,1460.0,47.0,161.0,0.0,0.0,0.0,0.0,1474.0
bsmtunfsf,1460.0,567.0,442.0,0.0,223.0,478.0,808.0,2336.0
totalbsmtsf,1460.0,1057.0,439.0,0.0,796.0,992.0,1298.0,6110.0


In [22]:
# The id column is not needed as a feature, so remove it.
# It is dropped by name rather than by slicing off the first column: slicing
# would silently remove another (real) column every time this cell is re-run,
# whereas drop(..., errors="ignore") is a no-op once id is gone.
df_train = df_train.drop(columns=["id"], errors="ignore")
df_train

Unnamed: 0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,2-story_1946_&_newer,rl,65.0,8450,pave,,reg,lvl,allpub,inside,...,0,,,,0,2,2008,wd,normal,208500
1,1-story_1946_&_newer_all_styles,rl,80.0,9600,pave,,reg,lvl,allpub,fr2,...,0,,,,0,5,2007,wd,normal,181500
2,2-story_1946_&_newer,rl,68.0,11250,pave,,ir1,lvl,allpub,inside,...,0,,,,0,9,2008,wd,normal,223500
3,2-story_1945_&_older,rl,60.0,9550,pave,,ir1,lvl,allpub,corner,...,0,,,,0,2,2006,wd,abnorml,140000
4,2-story_1946_&_newer,rl,84.0,14260,pave,,ir1,lvl,allpub,fr2,...,0,,,,0,12,2008,wd,normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,2-story_1946_&_newer,rl,62.0,7917,pave,,reg,lvl,allpub,inside,...,0,,,,0,8,2007,wd,normal,175000
1456,1-story_1946_&_newer_all_styles,rl,85.0,13175,pave,,reg,lvl,allpub,inside,...,0,,mnprv,,0,2,2010,wd,normal,210000
1457,2-story_1945_&_older,rl,66.0,9042,pave,,reg,lvl,allpub,inside,...,0,,gdprv,shed,2500,5,2010,wd,normal,266500
1458,1-story_1946_&_newer_all_styles,rl,68.0,9717,pave,,reg,lvl,allpub,inside,...,0,,,,0,4,2010,wd,normal,142125


In [23]:
# Split the data 60/20/20 into train / validation / test:
# hold out 20% for test first, then take 25% of the remaining 80%
# (= 20% of the full data) for validation. Both splits use a fixed seed.
df_train_full, df_train_test = train_test_split(df_train, test_size=0.2, random_state=11)
df_train_train, df_train_val = train_test_split(df_train_full, test_size=0.25, random_state=11)

# Extract the targets with a fresh 0..n-1 index. The feature frames get their
# index reset as well, so keeping the shuffled pre-split index here would
# misalign any index-based operation between targets and features.
y_train = df_train_train["saleprice"].reset_index(drop=True)
y_val = df_train_val["saleprice"].reset_index(drop=True)

In [24]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original one.
df_train_train, df_train_val, df_train_test = (
    split_frame.reset_index(drop=True)
    for split_frame in (df_train_train, df_train_val, df_train_test)
)

In [25]:
# Remove the target column from the train and validation feature frames.
# drop(..., errors="ignore") is used instead of `del` so that re-running this
# cell is a no-op rather than a KeyError.
df_train_train = df_train_train.drop(columns=["saleprice"], errors="ignore")
df_train_val = df_train_val.drop(columns=["saleprice"], errors="ignore")
# NOTE(review): df_train_test still contains saleprice after this cell;
# confirm it is separated out before the test frame is used as model input.

In [26]:
# Replace every missing value in the train and validation features with 0.
df_train_train, df_train_val = (
    feature_frame.fillna(value=0)
    for feature_frame in (df_train_train, df_train_val)
)

In [27]:
# encode categorical variable
# Convert each feature frame into a list with one {column: value} dict per row.
dict_train, dict_val = (
    feature_frame.to_dict("records")
    for feature_frame in (df_train_train, df_train_val)
)