# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

## Install necessary packages

In [None]:
# !conda install numpy pandas matplotlib seaborn missingno scikit-learn pandas_profiling -y

In [None]:
# !conda install -c conda-forge xgboost -y

## Import necessary packages

In [3]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

## Read in data

In [4]:
train_file_path = "./../../data/hpc_train.csv"
test_file_path = "./../../data/hpc_test.csv"

In [5]:
train = pd.read_csv(train_file_path, 
                    index_col='Id')
train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [6]:
test = pd.read_csv(test_file_path, 
                   index_col='Id')
test

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [7]:
train.MSSubClass = train.MSSubClass.astype('category')

## Exploratory Data Analysis


### Missing values

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MSSubClass     1460 non-null   category
 1   MSZoning       1460 non-null   object  
 2   LotFrontage    1201 non-null   float64 
 3   LotArea        1460 non-null   int64   
 4   Street         1460 non-null   object  
 5   Alley          91 non-null     object  
 6   LotShape       1460 non-null   object  
 7   LandContour    1460 non-null   object  
 8   Utilities      1460 non-null   object  
 9   LotConfig      1460 non-null   object  
 10  LandSlope      1460 non-null   object  
 11  Neighborhood   1460 non-null   object  
 12  Condition1     1460 non-null   object  
 13  Condition2     1460 non-null   object  
 14  BldgType       1460 non-null   object  
 15  HouseStyle     1460 non-null   object  
 16  OverallQual    1460 non-null   int64   
 17  OverallCond    1460 non-null   in

In [17]:
cat_col_names_with_na = []
cat_col_names_without_na = []
for col_name in train.select_dtypes(exclude=['number']):
    print(f"Name: {col_name}\tdtype: {train[col_name].dtype}")
    print(f"nunique: {train[col_name].nunique()}")
    missing_count = train[col_name].isnull().sum()
    count = train[col_name].count()
    if missing_count:
        cat_col_names_with_na.append(col_name)
        print(f"Missing: {missing_count} / {count} -> {missing_count/(missing_count + count)*100}% ")    
    else:
        cat_col_names_without_na.append(col_name)
    display(train[col_name].value_counts(sort=False))
    
    print('-' * 30, '\n')

Name: MSSubClass	dtype: category
nunique: 15


20     536
30      69
40       4
45      12
50     144
60     299
70      60
75      16
80      58
85      20
90      52
120     87
160     63
180     10
190     30
Name: MSSubClass, dtype: int64

------------------------------ 

Name: MSZoning	dtype: object
nunique: 5


C (all)      10
FV           65
RH           16
RL         1151
RM          218
Name: MSZoning, dtype: int64

------------------------------ 

Name: Street	dtype: object
nunique: 2


Pave    1454
Grvl       6
Name: Street, dtype: int64

------------------------------ 

Name: Alley	dtype: object
nunique: 3


Pave          41
No alley    1369
Grvl          50
Name: Alley, dtype: int64

------------------------------ 

Name: LotShape	dtype: object
nunique: 4


IR2     41
IR3     10
Reg    925
IR1    484
Name: LotShape, dtype: int64

------------------------------ 

Name: LandContour	dtype: object
nunique: 4


Bnk      63
HLS      50
Low      36
Lvl    1311
Name: LandContour, dtype: int64

------------------------------ 

Name: Utilities	dtype: object
nunique: 2


NoSeWa       1
AllPub    1459
Name: Utilities, dtype: int64

------------------------------ 

Name: LotConfig	dtype: object
nunique: 5


FR2          47
Inside     1052
CulDSac      94
FR3           4
Corner      263
Name: LotConfig, dtype: int64

------------------------------ 

Name: LandSlope	dtype: object
nunique: 3


Mod      65
Sev      13
Gtl    1382
Name: LandSlope, dtype: int64

------------------------------ 

Name: Neighborhood	dtype: object
nunique: 25


NridgHt     77
BrkSide     58
NWAmes      73
Gilbert     79
SWISU       25
Edwards    100
OldTown    113
Blueste      2
SawyerW     59
Veenker     11
Mitchel     49
Crawfor     51
ClearCr     28
StoneBr     25
Somerst     86
NAmes      225
Timber      38
CollgCr    150
MeadowV     17
BrDale      16
IDOTRR      37
Blmngtn     17
NPkVill      9
NoRidge     41
Sawyer      74
Name: Neighborhood, dtype: int64

------------------------------ 

Name: Condition1	dtype: object
nunique: 9


Norm      1260
RRNn         5
PosA         8
RRAe        11
Feedr       81
Artery      48
RRNe         2
PosN        19
RRAn        26
Name: Condition1, dtype: int64

------------------------------ 

Name: Condition2	dtype: object
nunique: 8


Norm      1445
RRNn         2
PosA         1
RRAe         1
Feedr        6
Artery       2
PosN         2
RRAn         1
Name: Condition2, dtype: int64

------------------------------ 

Name: BldgType	dtype: object
nunique: 5


2fmCon      31
TwnhsE     114
Twnhs       43
1Fam      1220
Duplex      52
Name: BldgType, dtype: int64

------------------------------ 

Name: HouseStyle	dtype: object
nunique: 8


2.5Fin      8
1.5Fin    154
1Story    726
2.5Unf     11
SFoyer     37
2Story    445
1.5Unf     14
SLvl       65
Name: HouseStyle, dtype: int64

------------------------------ 

Name: RoofStyle	dtype: object
nunique: 6


Mansard       7
Shed          2
Gable      1141
Gambrel      11
Hip         286
Flat         13
Name: RoofStyle, dtype: int64

------------------------------ 

Name: RoofMatl	dtype: object
nunique: 8


Roll          1
CompShg    1434
WdShake       5
Tar&Grv      11
Membran       1
Metal         1
WdShngl       6
ClyTile       1
Name: RoofMatl, dtype: int64

------------------------------ 

Name: Exterior1st	dtype: object
nunique: 15


Wd Sdng    206
AsbShng     20
Plywood    108
HdBoard    222
CemntBd     61
Stone        2
CBlock       1
VinylSd    515
MetalSd    220
AsphShn      1
WdShing     26
BrkComm      2
Stucco      25
BrkFace     50
ImStucc      1
Name: Exterior1st, dtype: int64

------------------------------ 

Name: Exterior2nd	dtype: object
nunique: 16


Wd Sdng    197
AsbShng     20
Plywood    142
HdBoard    207
Wd Shng     38
Stone        5
CBlock       1
VinylSd    504
MetalSd    214
Other        1
AsphShn      3
Stucco      26
CmentBd     60
BrkFace     25
ImStucc     10
Brk Cmn      7
Name: Exterior2nd, dtype: int64

------------------------------ 

Name: MasVnrType	dtype: object
nunique: 5


BrkCmn      15
Stone      128
No           8
BrkFace    445
None       864
Name: MasVnrType, dtype: int64

------------------------------ 

Name: ExterQual	dtype: object
nunique: 4


Fa     14
TA    906
Gd    488
Ex     52
Name: ExterQual, dtype: int64

------------------------------ 

Name: ExterCond	dtype: object
nunique: 5


Fa      28
TA    1282
Gd     146
Ex       3
Po       1
Name: ExterCond, dtype: int64

------------------------------ 

Name: Foundation	dtype: object
nunique: 6


Slab       24
Stone       6
CBlock    634
PConc     647
Wood        3
BrkTil    146
Name: Foundation, dtype: int64

------------------------------ 

Name: BsmtQual	dtype: object
nunique: 5


Fa              35
No Basement     37
TA             649
Gd             618
Ex             121
Name: BsmtQual, dtype: int64

------------------------------ 

Name: BsmtCond	dtype: object
nunique: 5


Fa               45
No Basement      37
TA             1311
Gd               65
Po                2
Name: BsmtCond, dtype: int64

------------------------------ 

Name: BsmtExposure	dtype: object
nunique: 5


No Basement     38
Mn             114
Gd             134
No             953
Av             221
Name: BsmtExposure, dtype: int64

------------------------------ 

Name: BsmtFinType1	dtype: object
nunique: 7


Rec            133
ALQ            220
Unf            430
No Basement     37
BLQ            148
GLQ            418
LwQ             74
Name: BsmtFinType1, dtype: int64

------------------------------ 

Name: BsmtFinType2	dtype: object
nunique: 7


Rec              54
ALQ              19
Unf            1256
No Basement      38
BLQ              33
GLQ              14
LwQ              46
Name: BsmtFinType2, dtype: int64

------------------------------ 

Name: Heating	dtype: object
nunique: 6


Grav        7
Floor       1
OthW        2
GasA     1428
Wall        4
GasW       18
Name: Heating, dtype: int64

------------------------------ 

Name: HeatingQC	dtype: object
nunique: 5


Fa     49
TA    428
Gd    241
Ex    741
Po      1
Name: HeatingQC, dtype: int64

------------------------------ 

Name: CentralAir	dtype: object
nunique: 2


Y    1365
N      95
Name: CentralAir, dtype: int64

------------------------------ 

Name: Electrical	dtype: object
nunique: 5


FuseP       3
FuseF      27
SBrkr    1335
FuseA      94
Mix         1
Name: Electrical, dtype: int64

------------------------------ 

Name: KitchenQual	dtype: object
nunique: 4


Fa     39
TA    735
Gd    586
Ex    100
Name: KitchenQual, dtype: int64

------------------------------ 

Name: Functional	dtype: object
nunique: 7


Min2      34
Mod       15
Sev        1
Maj1      14
Min1      31
Typ     1360
Maj2       5
Name: Functional, dtype: int64

------------------------------ 

Name: FireplaceQu	dtype: object
nunique: 6


Fa               33
TA              313
No Fireplace    690
Gd              380
Ex               24
Po               20
Name: FireplaceQu, dtype: int64

------------------------------ 

Name: GarageType	dtype: object
nunique: 7


Attchd       870
Basment       19
Detchd       387
2Types         6
No Garage     81
BuiltIn       88
CarPort        9
Name: GarageType, dtype: int64

------------------------------ 

Name: GarageFinish	dtype: object
nunique: 4


Fin          352
Unf          605
RFn          422
No Garage     81
Name: GarageFinish, dtype: int64

------------------------------ 

Name: GarageQual	dtype: object
nunique: 6


Fa             48
TA           1311
Gd             14
Ex              3
No Garage      81
Po              3
Name: GarageQual, dtype: int64

------------------------------ 

Name: GarageCond	dtype: object
nunique: 6


Fa             35
TA           1326
Gd              9
Ex              2
No Garage      81
Po              7
Name: GarageCond, dtype: int64

------------------------------ 

Name: PavedDrive	dtype: object
nunique: 3


Y    1340
N      90
P      30
Name: PavedDrive, dtype: int64

------------------------------ 

Name: PoolQC	dtype: object
nunique: 4


Fa            2
No Pool    1453
Gd            3
Ex            2
Name: PoolQC, dtype: int64

------------------------------ 

Name: Fence	dtype: object
nunique: 5


GdPrv         59
No Fence    1179
MnPrv        157
GdWo          54
MnWw          11
Name: Fence, dtype: int64

------------------------------ 

Name: MiscFeature	dtype: object
nunique: 5


Shed      49
Gar2       2
Othr       2
TenC       1
No      1406
Name: MiscFeature, dtype: int64

------------------------------ 

Name: SaleType	dtype: object
nunique: 9


ConLD       9
WD       1267
CWD         4
New       122
ConLw       5
Oth         3
ConLI       5
Con         2
COD        43
Name: SaleType, dtype: int64

------------------------------ 

Name: SaleCondition	dtype: object
nunique: 6


AdjLand       4
Normal     1198
Partial     125
Abnorml     101
Alloca       12
Family       20
Name: SaleCondition, dtype: int64

------------------------------ 



In [10]:
cat_col_names_without_na

['MSSubClass',
 'MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MSSubClass     1460 non-null   category
 1   MSZoning       1460 non-null   object  
 2   LotFrontage    1201 non-null   float64 
 3   LotArea        1460 non-null   int64   
 4   Street         1460 non-null   object  
 5   Alley          91 non-null     object  
 6   LotShape       1460 non-null   object  
 7   LandContour    1460 non-null   object  
 8   Utilities      1460 non-null   object  
 9   LotConfig      1460 non-null   object  
 10  LandSlope      1460 non-null   object  
 11  Neighborhood   1460 non-null   object  
 12  Condition1     1460 non-null   object  
 13  Condition2     1460 non-null   object  
 14  BldgType       1460 non-null   object  
 15  HouseStyle     1460 non-null   object  
 16  OverallQual    1460 non-null   int64   
 17  OverallCond    1460 non-null   in

In [12]:
cat_col_names_with_na

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [13]:
train.Alley.fillna('No alley', inplace=True)
train.MasVnrType.fillna('No', inplace=True)
train.BsmtQual.fillna('No Basement', inplace=True)
train.BsmtCond.fillna('No Basement', inplace=True)
train.BsmtExposure.fillna('No Basement', inplace=True)
train.BsmtFinType1.fillna('No Basement', inplace=True)
train.BsmtFinType2.fillna('No Basement', inplace=True)
train.FireplaceQu.fillna('No Fireplace', inplace=True)
train.GarageType.fillna('No Garage', inplace=True)
train.GarageFinish.fillna('No Garage', inplace=True)
train.GarageQual.fillna('No Garage', inplace=True)
train.GarageCond.fillna('No Garage', inplace=True)
train.PoolQC.fillna('No Pool', inplace=True)
train.Fence.fillna('No Fence', inplace=True)
train.MiscFeature.fillna('No', inplace=True)

In [14]:
train.Electrical.fillna(train.Electrical.mode()[0], inplace=True)

### Data types

#### object -> categorical

In [15]:
train.MSZoning = train.MSZoning.astype("category", 
                                       categories=['A', 
                                                   'C', 
                                                   'FV', 
                                                   'I', 
                                                   'RH', 
                                                   'RL', 
                                                   'RP', 
                                                   'RM'], 
                                       ordered=False)

train.MSSubClass = train.MSSubClass.astype("category", 
                                           categories=[20, 30, 
                                                       40, 45, 
                                                       50, 60, 
                                                       70, 75, 
                                                       80, 85, 
                                                       90, 120, 
                                                       150, 160, 
                                                       180, 190], 
                                           ordered=False)

train.Street = train.Street.astype("category", ordered=False)

train.Alley = train.Alley.astype("category", 
                                 ordered=False)

train.LotShape = train.LotShape.astype("category", 
                                       categories=['Reg', 
                                                   'IR1', 
                                                   'IR2', 
                                                   'IR3'], 
                                       ordered=False)

train.LandContour = train.LandContour.astype("category", 
                                       categories=[''], 
                                       ordered=True)

train. = train..astype("category", 
                                       categories=[], 
                                       ordered=True)

train. = train..astype("category", 
                                       categories=[], 
                                       ordered=True)

train. = train..astype("category", 
                                       categories=[], 
                                       ordered=True)

train. = train..astype("category", 
                                       categories=[], 
                                       ordered=True)

train. = train..astype("category", 
                                       categories=[], 
                                       ordered=True)

train. = train..astype("category", 
                                       categories=[], 
                                       ordered=True)


SyntaxError: invalid syntax (<ipython-input-15-95db40fb5b25>, line 39)