# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

## Install necessary packages

In [1]:
# !conda install numpy pandas matplotlib seaborn missingno scikit-learn pandas_profiling -y

In [2]:
# !conda install -c conda-forge xgboost -y

## Import necessary packages

In [3]:
import numpy as np
import pandas as pd

from pandas.api.types import CategoricalDtype

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

## Read in data

In [4]:
train_file_path = "./../../data/hpc_train.csv"
test_file_path = "./../../data/hpc_test.csv"

In [5]:
pd.options.display.max_columns = None

In [6]:
train = pd.read_csv(train_file_path, 
                    index_col='Id')
train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


In [7]:
test = pd.read_csv(test_file_path, 
                   index_col='Id')

In [8]:
train.MSSubClass = train.MSSubClass.astype('category')

train.OverallQual = train.OverallQual.astype(CategoricalDtype(categories=list(range(1, 11)),
                                                              ordered=True))

train.OverallCond = train.OverallCond.astype(CategoricalDtype(categories=list(range(1, 11)), 
                                                              ordered=True))

In [9]:
train.YearBuilt = pd.to_datetime(train.YearBuilt, format='%Y')

train.YearRemodAdd = pd.to_datetime(train.YearRemodAdd, format='%Y')

train.GarageYrBlt = pd.to_datetime(train.GarageYrBlt, format='%Y')

train.MoSold = pd.to_datetime(train.MoSold, format='%M')
train.YrSold = pd.to_datetime(train.YrSold, format='%Y')

In [10]:
yes_no_map = {'Y': True, 'N': False}

train.CentralAir = train.CentralAir.map(yes_no_map).astype('bool')

## Exploratory Data Analysis


### Missing values

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   MSSubClass     1460 non-null   category      
 1   MSZoning       1460 non-null   object        
 2   LotFrontage    1201 non-null   float64       
 3   LotArea        1460 non-null   int64         
 4   Street         1460 non-null   object        
 5   Alley          91 non-null     object        
 6   LotShape       1460 non-null   object        
 7   LandContour    1460 non-null   object        
 8   Utilities      1460 non-null   object        
 9   LotConfig      1460 non-null   object        
 10  LandSlope      1460 non-null   object        
 11  Neighborhood   1460 non-null   object        
 12  Condition1     1460 non-null   object        
 13  Condition2     1460 non-null   object        
 14  BldgType       1460 non-null   object        
 15  HouseStyle     1460 n

In [12]:
cat_col_names_with_na = []
cat_col_names_without_na = []
for col_name in train.select_dtypes(include=['object']):
    print(f"Name: {col_name}\tdtype: {train[col_name].dtype}")
    print(f"nunique: {train[col_name].nunique()}")
    missing_count = train[col_name].isnull().sum()
    count = train[col_name].count()
    if missing_count:
        cat_col_names_with_na.append(col_name)
        print(f"Missing: {missing_count} / {count} -> {missing_count/(missing_count + count)*100}% ")    
    else:
        cat_col_names_without_na.append(col_name)
    display(train[col_name].value_counts(sort=False))
    
    print('-' * 30, '\n')

Name: MSZoning	dtype: object
nunique: 5


RM          218
C (all)      10
RH           16
FV           65
RL         1151
Name: MSZoning, dtype: int64

------------------------------ 

Name: Street	dtype: object
nunique: 2


Pave    1454
Grvl       6
Name: Street, dtype: int64

------------------------------ 

Name: Alley	dtype: object
nunique: 2
Missing: 1369 / 91 -> 93.76712328767123% 


Pave    41
Grvl    50
Name: Alley, dtype: int64

------------------------------ 

Name: LotShape	dtype: object
nunique: 4


IR1    484
Reg    925
IR3     10
IR2     41
Name: LotShape, dtype: int64

------------------------------ 

Name: LandContour	dtype: object
nunique: 4


Lvl    1311
HLS      50
Low      36
Bnk      63
Name: LandContour, dtype: int64

------------------------------ 

Name: Utilities	dtype: object
nunique: 2


NoSeWa       1
AllPub    1459
Name: Utilities, dtype: int64

------------------------------ 

Name: LotConfig	dtype: object
nunique: 5


Inside     1052
CulDSac      94
FR2          47
Corner      263
FR3           4
Name: LotConfig, dtype: int64

------------------------------ 

Name: LandSlope	dtype: object
nunique: 3


Mod      65
Gtl    1382
Sev      13
Name: LandSlope, dtype: int64

------------------------------ 

Name: Neighborhood	dtype: object
nunique: 25


Blueste      2
Crawfor     51
Veenker     11
StoneBr     25
BrkSide     58
Somerst     86
Gilbert     79
Blmngtn     17
MeadowV     17
Timber      38
BrDale      16
Mitchel     49
SawyerW     59
CollgCr    150
Sawyer      74
NPkVill      9
OldTown    113
ClearCr     28
NridgHt     77
NoRidge     41
NAmes      225
Edwards    100
IDOTRR      37
SWISU       25
NWAmes      73
Name: Neighborhood, dtype: int64

------------------------------ 

Name: Condition1	dtype: object
nunique: 9


PosA         8
PosN        19
Norm      1260
RRAe        11
RRNe         2
Feedr       81
Artery      48
RRAn        26
RRNn         5
Name: Condition1, dtype: int64

------------------------------ 

Name: Condition2	dtype: object
nunique: 8


PosA         1
PosN         2
Norm      1445
RRAe         1
Feedr        6
Artery       2
RRAn         1
RRNn         2
Name: Condition2, dtype: int64

------------------------------ 

Name: BldgType	dtype: object
nunique: 5


1Fam      1220
2fmCon      31
TwnhsE     114
Duplex      52
Twnhs       43
Name: BldgType, dtype: int64

------------------------------ 

Name: HouseStyle	dtype: object
nunique: 8


2.5Unf     11
2.5Fin      8
1Story    726
SFoyer     37
1.5Unf     14
SLvl       65
1.5Fin    154
2Story    445
Name: HouseStyle, dtype: int64

------------------------------ 

Name: RoofStyle	dtype: object
nunique: 6


Shed          2
Mansard       7
Hip         286
Gambrel      11
Flat         13
Gable      1141
Name: RoofStyle, dtype: int64

------------------------------ 

Name: RoofMatl	dtype: object
nunique: 8


WdShngl       6
CompShg    1434
Metal         1
Membran       1
ClyTile       1
WdShake       5
Tar&Grv      11
Roll          1
Name: RoofMatl, dtype: int64

------------------------------ 

Name: Exterior1st	dtype: object
nunique: 15


WdShing     26
Plywood    108
HdBoard    222
VinylSd    515
Stone        2
BrkComm      2
Wd Sdng    206
BrkFace     50
MetalSd    220
ImStucc      1
CBlock       1
AsbShng     20
Stucco      25
CemntBd     61
AsphShn      1
Name: Exterior1st, dtype: int64

------------------------------ 

Name: Exterior2nd	dtype: object
nunique: 16


Wd Shng     38
HdBoard    207
VinylSd    504
Brk Cmn      7
Stone        5
Wd Sdng    197
Plywood    142
BrkFace     25
CmentBd     60
MetalSd    214
ImStucc     10
CBlock       1
AsbShng     20
Stucco      26
AsphShn      3
Other        1
Name: Exterior2nd, dtype: int64

------------------------------ 

Name: MasVnrType	dtype: object
nunique: 4
Missing: 8 / 1452 -> 0.547945205479452% 


BrkCmn      15
Stone      128
None       864
BrkFace    445
Name: MasVnrType, dtype: int64

------------------------------ 

Name: ExterQual	dtype: object
nunique: 4


TA    906
Gd    488
Ex     52
Fa     14
Name: ExterQual, dtype: int64

------------------------------ 

Name: ExterCond	dtype: object
nunique: 5


TA    1282
Gd     146
Po       1
Ex       3
Fa      28
Name: ExterCond, dtype: int64

------------------------------ 

Name: Foundation	dtype: object
nunique: 6


Stone       6
Slab       24
PConc     647
CBlock    634
BrkTil    146
Wood        3
Name: Foundation, dtype: int64

------------------------------ 

Name: BsmtQual	dtype: object
nunique: 4
Missing: 37 / 1423 -> 2.5342465753424657% 


TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

------------------------------ 

Name: BsmtCond	dtype: object
nunique: 4
Missing: 37 / 1423 -> 2.5342465753424657% 


TA    1311
Gd      65
Po       2
Fa      45
Name: BsmtCond, dtype: int64

------------------------------ 

Name: BsmtExposure	dtype: object
nunique: 4
Missing: 38 / 1422 -> 2.6027397260273974% 


Mn    114
Av    221
Gd    134
No    953
Name: BsmtExposure, dtype: int64

------------------------------ 

Name: BsmtFinType1	dtype: object
nunique: 6
Missing: 37 / 1423 -> 2.5342465753424657% 


BLQ    148
ALQ    220
Unf    430
GLQ    418
Rec    133
LwQ     74
Name: BsmtFinType1, dtype: int64

------------------------------ 

Name: BsmtFinType2	dtype: object
nunique: 6
Missing: 38 / 1422 -> 2.6027397260273974% 


BLQ      33
ALQ      19
Unf    1256
GLQ      14
Rec      54
LwQ      46
Name: BsmtFinType2, dtype: int64

------------------------------ 

Name: Heating	dtype: object
nunique: 6


Floor       1
OthW        2
Grav        7
Wall        4
GasW       18
GasA     1428
Name: Heating, dtype: int64

------------------------------ 

Name: HeatingQC	dtype: object
nunique: 5


TA    428
Gd    241
Po      1
Ex    741
Fa     49
Name: HeatingQC, dtype: int64

------------------------------ 

Name: Electrical	dtype: object
nunique: 5
Missing: 1 / 1459 -> 0.0684931506849315% 


FuseP       3
FuseF      27
FuseA      94
Mix         1
SBrkr    1334
Name: Electrical, dtype: int64

------------------------------ 

Name: KitchenQual	dtype: object
nunique: 4


TA    735
Gd    586
Ex    100
Fa     39
Name: KitchenQual, dtype: int64

------------------------------ 

Name: Functional	dtype: object
nunique: 7


Maj1      14
Typ     1360
Mod       15
Min2      34
Maj2       5
Min1      31
Sev        1
Name: Functional, dtype: int64

------------------------------ 

Name: FireplaceQu	dtype: object
nunique: 5
Missing: 690 / 770 -> 47.26027397260274% 


TA    313
Gd    380
Po     20
Ex     24
Fa     33
Name: FireplaceQu, dtype: int64

------------------------------ 

Name: GarageType	dtype: object
nunique: 6
Missing: 81 / 1379 -> 5.5479452054794525% 


CarPort      9
BuiltIn     88
2Types       6
Attchd     870
Detchd     387
Basment     19
Name: GarageType, dtype: int64

------------------------------ 

Name: GarageFinish	dtype: object
nunique: 3
Missing: 81 / 1379 -> 5.5479452054794525% 


Unf    605
RFn    422
Fin    352
Name: GarageFinish, dtype: int64

------------------------------ 

Name: GarageQual	dtype: object
nunique: 5
Missing: 81 / 1379 -> 5.5479452054794525% 


TA    1311
Gd      14
Po       3
Ex       3
Fa      48
Name: GarageQual, dtype: int64

------------------------------ 

Name: GarageCond	dtype: object
nunique: 5
Missing: 81 / 1379 -> 5.5479452054794525% 


TA    1326
Gd       9
Po       7
Ex       2
Fa      35
Name: GarageCond, dtype: int64

------------------------------ 

Name: PavedDrive	dtype: object
nunique: 3


P      30
Y    1340
N      90
Name: PavedDrive, dtype: int64

------------------------------ 

Name: PoolQC	dtype: object
nunique: 3
Missing: 1453 / 7 -> 99.52054794520548% 


Gd    3
Ex    2
Fa    2
Name: PoolQC, dtype: int64

------------------------------ 

Name: Fence	dtype: object
nunique: 4
Missing: 1179 / 281 -> 80.75342465753424% 


GdPrv     59
MnWw      11
GdWo      54
MnPrv    157
Name: Fence, dtype: int64

------------------------------ 

Name: MiscFeature	dtype: object
nunique: 4
Missing: 1406 / 54 -> 96.30136986301369% 


Shed    49
Othr     2
TenC     1
Gar2     2
Name: MiscFeature, dtype: int64

------------------------------ 

Name: SaleType	dtype: object
nunique: 9


Oth         3
New       122
ConLD       9
Con         2
WD       1267
COD        43
CWD         4
ConLI       5
ConLw       5
Name: SaleType, dtype: int64

------------------------------ 

Name: SaleCondition	dtype: object
nunique: 6


Family       20
Partial     125
Normal     1198
Alloca       12
AdjLand       4
Abnorml     101
Name: SaleCondition, dtype: int64

------------------------------ 



In [13]:
cat_col_names_without_na

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   MSSubClass     1460 non-null   category      
 1   MSZoning       1460 non-null   object        
 2   LotFrontage    1201 non-null   float64       
 3   LotArea        1460 non-null   int64         
 4   Street         1460 non-null   object        
 5   Alley          91 non-null     object        
 6   LotShape       1460 non-null   object        
 7   LandContour    1460 non-null   object        
 8   Utilities      1460 non-null   object        
 9   LotConfig      1460 non-null   object        
 10  LandSlope      1460 non-null   object        
 11  Neighborhood   1460 non-null   object        
 12  Condition1     1460 non-null   object        
 13  Condition2     1460 non-null   object        
 14  BldgType       1460 non-null   object        
 15  HouseStyle     1460 n

In [15]:
cat_col_names_with_na

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [16]:
train.Alley.fillna('NA', inplace=True)
train.BsmtQual.fillna('NA', inplace=True)
train.BsmtCond.fillna('NA', inplace=True)
train.BsmtExposure.fillna('NA', inplace=True)
train.BsmtFinType1.fillna('NA', inplace=True)
train.BsmtFinType2.fillna('NA', inplace=True)
train.FireplaceQu.fillna('NA', inplace=True)
train.GarageType.fillna('NA', inplace=True)
train.GarageFinish.fillna('NA', inplace=True)
train.GarageQual.fillna('NA', inplace=True)
train.GarageCond.fillna('NA', inplace=True)
train.PoolQC.fillna('NA', inplace=True)
train.Fence.fillna('NA', inplace=True)
train.MiscFeature.fillna('NA', inplace=True)

In [17]:
train.MasVnrType.fillna(train.MasVnrType.mode()[0], inplace=True)

train.Electrical.fillna(train.Electrical.mode()[0], inplace=True)

### Data types

#### object -> categorical

In [19]:
train.MSZoning = train.MSZoning.astype(CategoricalDtype(categories=['A', 'C', 
                                                                    'FV', 'I',
                                                                    'RH','RL', 
                                                                    'RP', 'RM'], 
                                                        ordered=False))

train.MSSubClass = train.MSSubClass.astype(CategoricalDtype(categories=[20, 30, 
                                                                        40, 45, 
                                                                        50, 60, 
                                                                        70, 75, 
                                                                        80, 85, 
                                                                        90, 120, 
                                                                        150, 160,  
                                                                        180, 190], 
                                                            ordered=False))

train.Street = train.Street.astype("category")

train.Alley = train.Alley.astype("category")

train.LotShape = train.LotShape.astype("category")

train.LandContour = train.LandContour.astype("category")

train.Utilities = train.Utilities.astype(CategoricalDtype(categories=['ELO', 'NoSeWa', 
                                                                      'NoSewr', 'AllPub'], 
                                                          ordered=True))

train.LotConfig = train.LotConfig.astype("category")

train.LandSlope = train.LandSlope.astype("category")

train.Neighborhood = train.Neighborhood.astype("category")

train.Condition1 = train.Condition1.astype("category")

train.Condition2 = train.Condition2.astype("category")

train.BldgType = train.BldgType.astype("category")

train.HouseStyle = train.HouseStyle.astype("category")

train.RoofStyle = train.RoofStyle.astype("category")

train.RoofMatl = train.RoofMatl.astype("category")

Exterior_cat = CategoricalDtype(categories=
                                train.Exterior1st.unique().tolist() + 
                                ['Other', 'PreCast'], 
                                ordered=False)
train.Exterior1st = train.Exterior1st.astype(Exterior_cat)
train.Exterior2nd = train.Exterior2nd.astype(Exterior_cat)

MasVnrType_cat = CategoricalDtype(categories=
                                train.MasVnrType.unique().tolist() + 
                                ['CBlock'], 
                                ordered=False)
train.MasVnrType = train.MasVnrType.astype(MasVnrType_cat)

Exter_cat = CategoricalDtype(categories=['Po', 'Fa', 'TA', 'Gd', 'Ex'], 
                                ordered=True)
train.ExterQual = train.ExterQual.astype(Exter_cat)
train.ExterCond = train.ExterQual.astype(Exter_cat)
train.HeatingQC = train.HeatingQC.astype(Exter_cat)
train.KitchenQual = train.KitchenQual.astype(Exter_cat)


# TODO: what is the best - no or something?
Bsmt_car = CategoricalDtype(categories=['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 
                            ordered=True)
train.BsmtQual = train.BsmtQual.astype(Bsmt_car)
train.BsmtCond = train.BsmtCond.astype(Bsmt_car)
train.FireplaceQu = train.FireplaceQu.astype(Bsmt_car)
train.GarageQual = train.GarageQual.astype(Bsmt_car)
train.GarageCond = train.GarageCond.astype(Bsmt_car)
train.PoolQC = train.PoolQC.astype(Bsmt_car)


train.Foundation = train.Foundation.astype("category")

# train. = train..astype("category")

# train. = train..astype("category")

# train. = train..astype("category")

# train. = train..astype("category")

# train. = train..astype("category")

# train. = train..astype("category")

# train. = train..astype("category")

# train. = train..astype("category")

# train. = train..astype("category")


In [20]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   MSSubClass     1460 non-null   category      
 1   MSZoning       1450 non-null   category      
 2   LotFrontage    1201 non-null   float64       
 3   LotArea        1460 non-null   int64         
 4   Street         1460 non-null   category      
 5   Alley          1460 non-null   category      
 6   LotShape       1460 non-null   category      
 7   LandContour    1460 non-null   category      
 8   Utilities      1460 non-null   category      
 9   LotConfig      1460 non-null   category      
 10  LandSlope      1460 non-null   category      
 11  Neighborhood   1460 non-null   category      
 12  Condition1     1460 non-null   category      
 13  Condition2     1460 non-null   category      
 14  BldgType       1460 non-null   category      
 15  HouseStyle     1460 n