# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

## Install necessary packages

In [1]:
# !conda install numpy pandas matplotlib seaborn missingno scikit-learn pandas_profiling -y

In [2]:
# !conda install -c conda-forge xgboost -y

## Import necessary packages

In [3]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

## Read in data

In [4]:
train_file_path = "./../../data/hpc_train.csv"
test_file_path = "./../../data/hpc_test.csv"

In [5]:
pd.options.display.max_columns = None

In [6]:
train = pd.read_csv(train_file_path, 
                    index_col='Id')
train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


In [7]:
test = pd.read_csv(test_file_path, 
                   index_col='Id')

In [8]:
train.MSSubClass = train.MSSubClass.astype('category')

# TODO: ordeted category
train.OverallQual = train.OverallQual.astype('category')

train.OverallCond = train.OverallCond.astype('category')




In [9]:
train.YearBuilt = pd.to_datetime(train.YearBuilt, format='%Y')

train.YearRemodAdd = pd.to_datetime(train.YearRemodAdd, format='%Y')

train.GarageYrBlt = pd.to_datetime(train.GarageYrBlt, format='%Y')

# TODO: MoSold
train.YrSold = pd.to_datetime(train.YrSold, format='%Y')

## Exploratory Data Analysis


### Missing values

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   MSSubClass     1460 non-null   category      
 1   MSZoning       1460 non-null   object        
 2   LotFrontage    1201 non-null   float64       
 3   LotArea        1460 non-null   int64         
 4   Street         1460 non-null   object        
 5   Alley          91 non-null     object        
 6   LotShape       1460 non-null   object        
 7   LandContour    1460 non-null   object        
 8   Utilities      1460 non-null   object        
 9   LotConfig      1460 non-null   object        
 10  LandSlope      1460 non-null   object        
 11  Neighborhood   1460 non-null   object        
 12  Condition1     1460 non-null   object        
 13  Condition2     1460 non-null   object        
 14  BldgType       1460 non-null   object        
 15  HouseStyle     1460 n

In [11]:
cat_col_names_with_na = []
cat_col_names_without_na = []
for col_name in train.select_dtypes(exclude=['number']):
    print(f"Name: {col_name}\tdtype: {train[col_name].dtype}")
    print(f"nunique: {train[col_name].nunique()}")
    missing_count = train[col_name].isnull().sum()
    count = train[col_name].count()
    if missing_count:
        cat_col_names_with_na.append(col_name)
        print(f"Missing: {missing_count} / {count} -> {missing_count/(missing_count + count)*100}% ")    
    else:
        cat_col_names_without_na.append(col_name)
    display(train[col_name].value_counts(sort=False))
    
    print('-' * 30, '\n')

Name: MSSubClass	dtype: category
nunique: 15


20     536
30      69
40       4
45      12
50     144
60     299
70      60
75      16
80      58
85      20
90      52
120     87
160     63
180     10
190     30
Name: MSSubClass, dtype: int64

------------------------------ 

Name: MSZoning	dtype: object
nunique: 5


RH           16
RL         1151
C (all)      10
FV           65
RM          218
Name: MSZoning, dtype: int64

------------------------------ 

Name: Street	dtype: object
nunique: 2


Pave    1454
Grvl       6
Name: Street, dtype: int64

------------------------------ 

Name: Alley	dtype: object
nunique: 2
Missing: 1369 / 91 -> 93.76712328767123% 


Pave    41
Grvl    50
Name: Alley, dtype: int64

------------------------------ 

Name: LotShape	dtype: object
nunique: 4


IR3     10
Reg    925
IR2     41
IR1    484
Name: LotShape, dtype: int64

------------------------------ 

Name: LandContour	dtype: object
nunique: 4


Low      36
Lvl    1311
Bnk      63
HLS      50
Name: LandContour, dtype: int64

------------------------------ 

Name: Utilities	dtype: object
nunique: 2


AllPub    1459
NoSeWa       1
Name: Utilities, dtype: int64

------------------------------ 

Name: LotConfig	dtype: object
nunique: 5


CulDSac      94
Corner      263
FR3           4
FR2          47
Inside     1052
Name: LotConfig, dtype: int64

------------------------------ 

Name: LandSlope	dtype: object
nunique: 3


Sev      13
Gtl    1382
Mod      65
Name: LandSlope, dtype: int64

------------------------------ 

Name: Neighborhood	dtype: object
nunique: 25


Blmngtn     17
NoRidge     41
BrDale      16
StoneBr     25
NAmes      225
Somerst     86
NWAmes      73
Gilbert     79
SawyerW     59
Veenker     11
CollgCr    150
Blueste      2
Edwards    100
BrkSide     58
Sawyer      74
MeadowV     17
IDOTRR      37
Mitchel     49
OldTown    113
Timber      38
ClearCr     28
SWISU       25
Crawfor     51
NridgHt     77
NPkVill      9
Name: Neighborhood, dtype: int64

------------------------------ 

Name: Condition1	dtype: object
nunique: 9


RRAe        11
Artery      48
Feedr       81
PosA         8
PosN        19
RRNe         2
Norm      1260
RRNn         5
RRAn        26
Name: Condition1, dtype: int64

------------------------------ 

Name: Condition2	dtype: object
nunique: 8


RRAe         1
Artery       2
Feedr        6
PosA         1
PosN         2
Norm      1445
RRNn         2
RRAn         1
Name: Condition2, dtype: int64

------------------------------ 

Name: BldgType	dtype: object
nunique: 5


Twnhs       43
1Fam      1220
TwnhsE     114
2fmCon      31
Duplex      52
Name: BldgType, dtype: int64

------------------------------ 

Name: HouseStyle	dtype: object
nunique: 8


1Story    726
SFoyer     37
1.5Fin    154
1.5Unf     14
2Story    445
SLvl       65
2.5Unf     11
2.5Fin      8
Name: HouseStyle, dtype: int64

------------------------------ 

Name: OverallQual	dtype: category
nunique: 10


1       2
2       3
3      20
4     116
5     397
6     374
7     319
8     168
9      43
10     18
Name: OverallQual, dtype: int64

------------------------------ 

Name: OverallCond	dtype: category
nunique: 9


1      1
2      5
3     25
4     57
5    821
6    252
7    205
8     72
9     22
Name: OverallCond, dtype: int64

------------------------------ 

Name: YearBuilt	dtype: datetime64[ns]
nunique: 112


1970-01-01    24
1965-01-01    24
1948-01-01    14
1938-01-01     4
2009-01-01    18
              ..
1997-01-01    14
1980-01-01    10
1921-01-01     6
1904-01-01     1
1975-01-01     8
Name: YearBuilt, Length: 112, dtype: int64

------------------------------ 

Name: YearRemodAdd	dtype: datetime64[ns]
nunique: 61


1970-01-01    26
1965-01-01    19
2009-01-01    23
1992-01-01    17
1987-01-01    10
              ..
2007-01-01    76
2002-01-01    48
1997-01-01    25
1980-01-01    12
1975-01-01    10
Name: YearRemodAdd, Length: 61, dtype: int64

------------------------------ 

Name: RoofStyle	dtype: object
nunique: 6


Flat         13
Gambrel      11
Shed          2
Gable      1141
Mansard       7
Hip         286
Name: RoofStyle, dtype: int64

------------------------------ 

Name: RoofMatl	dtype: object
nunique: 8


CompShg    1434
Membran       1
WdShake       5
Roll          1
ClyTile       1
Metal         1
WdShngl       6
Tar&Grv      11
Name: RoofMatl, dtype: int64

------------------------------ 

Name: Exterior1st	dtype: object
nunique: 15


BrkComm      2
BrkFace     50
WdShing     26
ImStucc      1
AsbShng     20
CBlock       1
AsphShn      1
MetalSd    220
Stone        2
Plywood    108
HdBoard    222
CemntBd     61
Stucco      25
VinylSd    515
Wd Sdng    206
Name: Exterior1st, dtype: int64

------------------------------ 

Name: Exterior2nd	dtype: object
nunique: 16


Other        1
BrkFace     25
CmentBd     60
ImStucc     10
AsbShng     20
CBlock       1
AsphShn      3
MetalSd    214
Stone        5
Plywood    142
Wd Shng     38
HdBoard    207
Stucco      26
Brk Cmn      7
VinylSd    504
Wd Sdng    197
Name: Exterior2nd, dtype: int64

------------------------------ 

Name: MasVnrType	dtype: object
nunique: 4
Missing: 8 / 1452 -> 0.547945205479452% 


BrkFace    445
Stone      128
None       864
BrkCmn      15
Name: MasVnrType, dtype: int64

------------------------------ 

Name: ExterQual	dtype: object
nunique: 4


Fa     14
Gd    488
TA    906
Ex     52
Name: ExterQual, dtype: int64

------------------------------ 

Name: ExterCond	dtype: object
nunique: 5


Po       1
Fa      28
Gd     146
TA    1282
Ex       3
Name: ExterCond, dtype: int64

------------------------------ 

Name: Foundation	dtype: object
nunique: 6


CBlock    634
Wood        3
Stone       6
BrkTil    146
Slab       24
PConc     647
Name: Foundation, dtype: int64

------------------------------ 

Name: BsmtQual	dtype: object
nunique: 4
Missing: 37 / 1423 -> 2.5342465753424657% 


Fa     35
Gd    618
TA    649
Ex    121
Name: BsmtQual, dtype: int64

------------------------------ 

Name: BsmtCond	dtype: object
nunique: 4
Missing: 37 / 1423 -> 2.5342465753424657% 


Po       2
Fa      45
Gd      65
TA    1311
Name: BsmtCond, dtype: int64

------------------------------ 

Name: BsmtExposure	dtype: object
nunique: 4
Missing: 38 / 1422 -> 2.6027397260273974% 


No    953
Gd    134
Mn    114
Av    221
Name: BsmtExposure, dtype: int64

------------------------------ 

Name: BsmtFinType1	dtype: object
nunique: 6
Missing: 37 / 1423 -> 2.5342465753424657% 


Unf    430
LwQ     74
Rec    133
ALQ    220
BLQ    148
GLQ    418
Name: BsmtFinType1, dtype: int64

------------------------------ 

Name: BsmtFinType2	dtype: object
nunique: 6
Missing: 38 / 1422 -> 2.6027397260273974% 


Unf    1256
LwQ      46
Rec      54
ALQ      19
BLQ      33
GLQ      14
Name: BsmtFinType2, dtype: int64

------------------------------ 

Name: Heating	dtype: object
nunique: 6


Floor       1
Wall        4
OthW        2
GasA     1428
Grav        7
GasW       18
Name: Heating, dtype: int64

------------------------------ 

Name: HeatingQC	dtype: object
nunique: 5


Po      1
Fa     49
Gd    241
TA    428
Ex    741
Name: HeatingQC, dtype: int64

------------------------------ 

Name: CentralAir	dtype: object
nunique: 2


N      95
Y    1365
Name: CentralAir, dtype: int64

------------------------------ 

Name: Electrical	dtype: object
nunique: 5
Missing: 1 / 1459 -> 0.0684931506849315% 


Mix         1
FuseA      94
FuseF      27
SBrkr    1334
FuseP       3
Name: Electrical, dtype: int64

------------------------------ 

Name: KitchenQual	dtype: object
nunique: 4


Fa     39
Gd    586
TA    735
Ex    100
Name: KitchenQual, dtype: int64

------------------------------ 

Name: Functional	dtype: object
nunique: 7


Sev        1
Min1      31
Maj2       5
Typ     1360
Maj1      14
Min2      34
Mod       15
Name: Functional, dtype: int64

------------------------------ 

Name: FireplaceQu	dtype: object
nunique: 5
Missing: 690 / 770 -> 47.26027397260274% 


Po     20
Fa     33
Gd    380
TA    313
Ex     24
Name: FireplaceQu, dtype: int64

------------------------------ 

Name: GarageType	dtype: object
nunique: 6
Missing: 81 / 1379 -> 5.5479452054794525% 


2Types       6
Basment     19
CarPort      9
BuiltIn     88
Attchd     870
Detchd     387
Name: GarageType, dtype: int64

------------------------------ 

Name: GarageYrBlt	dtype: datetime64[ns]
nunique: 97
Missing: 81 / 1379 -> 5.5479452054794525% 


1970-01-01    20
1965-01-01    21
1948-01-01    11
1938-01-01     3
2009-01-01    21
              ..
1926-01-01     6
1997-01-01    19
1980-01-01    15
1921-01-01     3
1975-01-01     9
Name: GarageYrBlt, Length: 97, dtype: int64

------------------------------ 

Name: GarageFinish	dtype: object
nunique: 3
Missing: 81 / 1379 -> 5.5479452054794525% 


Unf    605
RFn    422
Fin    352
Name: GarageFinish, dtype: int64

------------------------------ 

Name: GarageQual	dtype: object
nunique: 5
Missing: 81 / 1379 -> 5.5479452054794525% 


Po       3
Fa      48
Gd      14
TA    1311
Ex       3
Name: GarageQual, dtype: int64

------------------------------ 

Name: GarageCond	dtype: object
nunique: 5
Missing: 81 / 1379 -> 5.5479452054794525% 


Po       7
Fa      35
Gd       9
TA    1326
Ex       2
Name: GarageCond, dtype: int64

------------------------------ 

Name: PavedDrive	dtype: object
nunique: 3


N      90
P      30
Y    1340
Name: PavedDrive, dtype: int64

------------------------------ 

Name: PoolQC	dtype: object
nunique: 3
Missing: 1453 / 7 -> 99.52054794520548% 


Fa    2
Gd    3
Ex    2
Name: PoolQC, dtype: int64

------------------------------ 

Name: Fence	dtype: object
nunique: 4
Missing: 1179 / 281 -> 80.75342465753424% 


MnWw      11
GdPrv     59
MnPrv    157
GdWo      54
Name: Fence, dtype: int64

------------------------------ 

Name: MiscFeature	dtype: object
nunique: 4
Missing: 1406 / 54 -> 96.30136986301369% 


Othr     2
Shed    49
Gar2     2
TenC     1
Name: MiscFeature, dtype: int64

------------------------------ 

Name: YrSold	dtype: datetime64[ns]
nunique: 5


2009-01-01    338
2006-01-01    314
2008-01-01    304
2010-01-01    175
2007-01-01    329
Name: YrSold, dtype: int64

------------------------------ 

Name: SaleType	dtype: object
nunique: 9


New       122
ConLI       5
Con         2
Oth         3
CWD         4
ConLw       5
ConLD       9
WD       1267
COD        43
Name: SaleType, dtype: int64

------------------------------ 

Name: SaleCondition	dtype: object
nunique: 6


Alloca       12
Partial     125
Normal     1198
AdjLand       4
Abnorml     101
Family       20
Name: SaleCondition, dtype: int64

------------------------------ 



In [12]:
cat_col_names_without_na

['MSSubClass',
 'MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'YrSold',
 'SaleType',
 'SaleCondition']

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   MSSubClass     1460 non-null   category      
 1   MSZoning       1460 non-null   object        
 2   LotFrontage    1201 non-null   float64       
 3   LotArea        1460 non-null   int64         
 4   Street         1460 non-null   object        
 5   Alley          91 non-null     object        
 6   LotShape       1460 non-null   object        
 7   LandContour    1460 non-null   object        
 8   Utilities      1460 non-null   object        
 9   LotConfig      1460 non-null   object        
 10  LandSlope      1460 non-null   object        
 11  Neighborhood   1460 non-null   object        
 12  Condition1     1460 non-null   object        
 13  Condition2     1460 non-null   object        
 14  BldgType       1460 non-null   object        
 15  HouseStyle     1460 n

In [14]:
cat_col_names_with_na

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [15]:
train.Alley.fillna('No alley', inplace=True)
train.MasVnrType.fillna('No', inplace=True)
train.BsmtQual.fillna('No Basement', inplace=True)
train.BsmtCond.fillna('No Basement', inplace=True)
train.BsmtExposure.fillna('No Basement', inplace=True)
train.BsmtFinType1.fillna('No Basement', inplace=True)
train.BsmtFinType2.fillna('No Basement', inplace=True)
train.FireplaceQu.fillna('No Fireplace', inplace=True)
train.GarageType.fillna('No Garage', inplace=True)
train.GarageFinish.fillna('No Garage', inplace=True)
train.GarageQual.fillna('No Garage', inplace=True)
train.GarageCond.fillna('No Garage', inplace=True)
train.PoolQC.fillna('No Pool', inplace=True)
train.Fence.fillna('No Fence', inplace=True)
train.MiscFeature.fillna('No', inplace=True)

In [16]:
train.Electrical.fillna(train.Electrical.mode()[0], inplace=True)

### Data types

#### object -> categorical

In [17]:
from pandas.api.types import CategoricalDtype

train.MSZoning = train.MSZoning.astype(CategoricalDtype(categories=['A', 'C', 
                                                                    'FV', 'I',
                                                                    'RH','RL', 
                                                                    'RP', 'RM'], 
                                                        ordered=False))

train.MSSubClass = train.MSSubClass.astype(CategoricalDtype(categories=[20, 30, 
                                                                        40, 45, 
                                                                        50, 60, 
                                                                        70, 75, 
                                                                        80, 85, 
                                                                        90, 120, 
                                                                        150, 160,  
                                                                        180, 190], 
                                                            ordered=False))

train.Street = train.Street.astype("category")

train.Alley = train.Alley.astype("category")

train.LotShape = train.LotShape.astype("category")

train.LandContour = train.LandContour.astype("category")

train.Utilities = train.Utilities.astype(CategoricalDtype(categories=['ELO', 'NoSeWa', 
                                                                      'NoSewr', 'AllPub'], 
                                                          ordered=True))

train.LotConfig = train.LotConfig.astype("category")

train.LandSlope = train.LandSlope.astype("category")

train.Neighborhood = train.Neighborhood.astype("category")

train.Condition1 = train.Condition1.astype("category")

train.Condition2 = train.Condition2.astype("category")

train.BldgType = train.BldgType.astype("category")

train.HouseStyle = train.HouseStyle.astype("category")

# train. = train..astype("category", 
#                                        categories=[], 
#                                        ordered=True)

# train. = train..astype("category", 
#                                        categories=[], 
#                                        ordered=True)

# train. = train..astype("category", 
#                                        categories=[], 
#                                        ordered=True)

# train. = train..astype("category", 
#                                        categories=[], 
#                                        ordered=True)

# train. = train..astype("category", 
#                                        categories=[], 
#                                        ordered=True)

# train. = train..astype("category", 
#                                        categories=[], 
#                                        ordered=True)

# train. = train..astype("category", 
#                                        categories=[], 
#                                        ordered=True)

# train. = train..astype("category", 
#                                        categories=[], 
#                                        ordered=True)

In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   MSSubClass     1460 non-null   category      
 1   MSZoning       1450 non-null   category      
 2   LotFrontage    1201 non-null   float64       
 3   LotArea        1460 non-null   int64         
 4   Street         1460 non-null   category      
 5   Alley          1460 non-null   category      
 6   LotShape       1460 non-null   category      
 7   LandContour    1460 non-null   category      
 8   Utilities      1460 non-null   category      
 9   LotConfig      1460 non-null   category      
 10  LandSlope      1460 non-null   category      
 11  Neighborhood   1460 non-null   category      
 12  Condition1     1460 non-null   category      
 13  Condition2     1460 non-null   category      
 14  BldgType       1460 non-null   category      
 15  HouseStyle     1460 n

In [19]:
train.describe(include='all')

  train.describe(include='all')
  train.describe(include='all')
  train.describe(include='all')
  train.describe(include='all')


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
count,1460.0,1450,1201.0,1460.0,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460.0,1460.0,1460,1460,1460,1460,1460,1460,1460.0,1452.0,1460,1460,1460,1460,1460,1460,1460,1460.0,1460,1460.0,1460.0,1460.0,1460,1460,1460,1460,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460,1460.0,1460,1460.0,1460,1460,1379,1460,1460.0,1460.0,1460,1460,1460,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460,1460,1460,1460.0,1460.0,1460,1460,1460,1460.0
unique,15.0,4,,,2,3,4,4,2,5,3,25,9,8,5,8,10.0,9.0,112,61,6,8,15,16,5.0,,4,5,6,5,5,5,7,,7,,,,6,5,2,5,,,,,,,,,,,4,,7,,6,7,97,4,,,6,6,3,,,,,,,4,5,5,,,5,9,6,
top,20.0,RL,,,Pave,No alley,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5.0,5.0,2006-01-01 00:00:00,1950-01-01 00:00:00,Gable,CompShg,VinylSd,VinylSd,,,TA,TA,PConc,TA,TA,No,Unf,,Unf,,,,GasA,Ex,Y,SBrkr,,,,,,,,,,,TA,,Typ,,No Fireplace,Attchd,2005-01-01 00:00:00,Unf,,,TA,TA,Y,,,,,,,No Pool,No Fence,No,,,2009-01-01 00:00:00,WD,Normal,
freq,536.0,1151,,,1454,1369,925,1311,1459,1052,1382,225,1260,1445,1220,726,397.0,821.0,67,178,1141,1434,515,504,864.0,,906,1282,647,649,1311,953,430,,1256,,,,1428,741,1365,1335,,,,,,,,,,,735,,1360,,690,870,65,605,,,1311,1326,1340,,,,,,,1453,1179,1406,,,338,1267,1198,
first,,,,,,,,,,,,,,,,,,,1872-01-01 00:00:00,1950-01-01 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1900-01-01 00:00:00,,,,,,,,,,,,,,,,,,2006-01-01 00:00:00,,,
last,,,,,,,,,,,,,,,,,,,2010-01-01 00:00:00,2010-01-01 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2010-01-01 00:00:00,,,,,,,,,,,,,,,,,,2010-01-01 00:00:00,,,
mean,,,70.049958,10516.828082,,,,,,,,,,,,,,,,,,,,,,103.685262,,,,,,,,443.639726,,46.549315,567.240411,1057.429452,,,,,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,,6.517808,,0.613014,,,,,1.767123,472.980137,,,,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,,,,43.489041,6.321918,,,,180921.19589
std,,,24.284752,9981.264932,,,,,,,,,,,,,,,,,,,,,,181.066207,,,,,,,,456.098091,,161.319273,441.866955,438.705324,,,,,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,,1.625393,,0.644666,,,,,0.747315,213.804841,,,,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,,,,496.123024,2.703626,,,,79442.502883
min,,,21.0,1300.0,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,0.0,,0.0,0.0,0.0,,,,,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,,2.0,,0.0,,,,,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,1.0,,,,34900.0
25%,,,59.0,7553.5,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,0.0,,0.0,223.0,795.75,,,,,882.0,0.0,0.0,1129.5,0.0,0.0,1.0,0.0,2.0,1.0,,5.0,,0.0,,,,,1.0,334.5,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,5.0,,,,129975.0
