# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

## Импорт пакетов

In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OrdinalEncoder
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

## Загрузка данных

In [2]:
# Read the training CSV into a DataFrame; the path is relative to the notebook's working directory.
hpc_train = pd.read_csv("./../../../data/hpc_train.csv")
hpc_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
# Column-by-column overview of the raw frame: non-null counts and dtypes.
hpc_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Target vector: the sale price column of the training frame.
y = hpc_train['SalePrice']
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [5]:
# Remove the target ('SalePrice', already stored in y) and the row identifier ('Id')
# from the feature frame. The result is rebound rather than dropped with inplace=True,
# so the frame object displayed by the earlier cells is not mutated behind the reader's back.
hpc_train = hpc_train.drop(columns=['SalePrice', 'Id'])
hpc_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


In [6]:
# Labels of all object-typed (string) columns, kept as a pandas Index.
categorical_features = hpc_train.select_dtypes(include='object').columns
categorical_features.tolist()

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [7]:
# Labels of all int64 / float64 columns, kept as a pandas Index.
numeric_features = hpc_train.select_dtypes(
    include=['int64', 'float64'],
).columns
numeric_features.tolist()

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [8]:
# Ordinal (ranked) categorical features and the integer score given to each level.
# Each key is a tuple of column names sharing one scale. Every scale is listed from its
# highest-scored level to its lowest, so zipping it with a descending range yields the
# scores (first level -> len - 1, last level -> 0). Where np.nan closes a scale, a missing
# value therefore gets score 0.
ordered_feature_mapping = {
    features: dict(zip(levels, range(len(levels) - 1, -1, -1)))
    for features, levels in [
        (('BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond',
          'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual'),
         ['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan]),
        (('BsmtExposure',),
         ['Gd', 'Av', 'Mn', 'No', np.nan]),
        (('BsmtFinType1', 'BsmtFinType2'),
         ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', np.nan]),
        (('Electrical',),
         ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix']),
        (('PoolQC',),
         ['Ex', 'Gd', 'TA', 'Fa', np.nan]),
        (('Fence',),
         ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', np.nan]),
        (('GarageFinish',),
         ['Fin', 'RFn', 'Unf', np.nan]),
        (('Utilities',),
         ['AllPub', 'NoSewr', 'NoSeWa', 'ELO']),
        (('Functional',),
         ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']),
        (('GarageType',),
         ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort', 'Detchd', np.nan]),
        (('PavedDrive',),
         ['Y', 'P', 'N']),
    ]
}
# Electrical is the one irregular scale: a missing value is scored 4, the same as 'SBrkr'.
ordered_feature_mapping[('Electrical',)][np.nan] = 4

ordered_feature_mapping

{('BsmtQual',
  'BsmtCond',
  'FireplaceQu',
  'GarageQual',
  'GarageCond',
  'ExterQual',
  'ExterCond',
  'HeatingQC',
  'KitchenQual'): {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0},
 ('BsmtExposure',): {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, nan: 0},
 ('BsmtFinType1', 'BsmtFinType2'): {'GLQ': 6,
  'ALQ': 5,
  'BLQ': 4,
  'Rec': 3,
  'LwQ': 2,
  'Unf': 1,
  nan: 0},
 ('Electrical',): {'SBrkr': 4,
  'FuseA': 3,
  'FuseF': 2,
  'FuseP': 1,
  'Mix': 0,
  nan: 4},
 ('PoolQC',): {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, nan: 0},
 ('Fence',): {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, nan: 0},
 ('GarageFinish',): {'Fin': 3, 'RFn': 2, 'Unf': 1, nan: 0},
 ('Utilities',): {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0},
 ('Functional',): {'Typ': 7,
  'Min1': 6,
  'Min2': 5,
  'Mod': 4,
  'Maj1': 3,
  'Maj2': 2,
  'Sev': 1,
  'Sal': 0},
 ('GarageType',): {'2Types': 6,
  'Attchd': 5,
  'Basment': 4,
  'BuiltIn': 3,
  'CarPort': 2,
  'Detchd': 1,
  nan: 0},
 ('PavedDrive',): {'Y': 2, 'P'

In [9]:
# Flatten the grouped scales into one {'col': ..., 'mapping': ...} entry per column.
# Columns that belong to the same group reference the same mapping dict.
ord_mapping = [
    {'col': column, 'mapping': scale}
    for columns, scale in ordered_feature_mapping.items()
    for column in columns
]

ord_mapping

[{'col': 'BsmtQual',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'BsmtCond',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'FireplaceQu',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'GarageQual',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'GarageCond',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'ExterQual',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'ExterCond',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'HeatingQC',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'KitchenQual',
  'mapping': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, nan: 0}},
 {'col': 'BsmtExposure',
  'mapping': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, nan: 0}},
 {'col': 'BsmtFinType1',
  'mapping': {'GLQ': 6,
   'ALQ': 5,
   'BLQ': 4,
   'Rec': 3,
   '

In [10]:
# Encode the ordinal columns with the explicit per-column integer mappings collected in
# ord_mapping; return_df=True asks the encoder to hand back a DataFrame.
# NOTE(review): hpc_train is rebound to the encoded frame, so re-running this cell would
# feed already-encoded columns back into the encoder — run it once per kernel session.
ord_encoder = OrdinalEncoder(mapping=ord_mapping, return_df=True)
hpc_train = ord_encoder.fit_transform(hpc_train)
hpc_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,3,Inside,...,0,0,0,0,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,3,FR2,...,0,0,0,0,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,3,Inside,...,0,0,0,0,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,3,Corner,...,0,0,0,0,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,3,FR2,...,0,0,0,0,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,3,Inside,...,0,0,0,0,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,3,Inside,...,0,0,0,3,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,3,Inside,...,0,0,0,4,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,3,Inside,...,0,0,0,0,,0,4,2010,WD,Normal


In [11]:
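# Categorical features (ordinal). NOTE: the rest of this cell is a commented-out, hand-written variant of the ordinal mappings and encoder defined in the cells above.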

# mapping_ex_gd_ta_fa_po_no = {'Ex': 5, 
#                              'Gd': 4, 
#                              'TA': 3, 
#                              'Fa': 2, 
#                              'Po': 1, 
#                              np.nan: 0}
# columns_ordered_ex_gd_ta_fa_po_no = ['BsmtQual', 
#                                      'BsmtCond', 
#                                      'FireplaceQu',
#                                      'GarageQual',
#                                      'GarageCond',
#                                      'ExterQual',
#                                      'ExterCond',
#                                      'HeatingQC',
#                                      'KitchenQual'
#                                     ]


# mapping_gd_av_mn_no_na = {'Gd': 4, 
#                           'Av': 3, 
#                           'Mn': 2, 
#                           'No': 1, 
#                           np.nan: 0}
# columns_ordered_gd_av_mn_no_na = ['BsmtExposure']


# mapping_glq_alq_blq_rec_lwq_unf_na  = {'GLQ': 6,
#                                        'ALQ': 5,
#                                        'BLQ': 4, 
#                                        'Rec': 3 , 
#                                        'LwQ': 2, 
#                                        'Unf': 1, 
#                                        np.nan: 0}
# columns_ordered_glq_alq_blq_rec_lwq_unf_na = ['BsmtFinType1', 
#                                               'BsmtFinType2']

# mapping_electrical = {'SBrkr': 4, 
#                       'FuseA': 3, 
#                       'FuseF': 2, 
#                       'FuseP': 1, 
#                       'Mix': 0,
#                      np.nan: 4}
# columns_ordered_electrical = ['Electrical']

# mapping_ex_gd_ta_fa_no = {'Ex': 4, 
#                           'Gd': 3, 
#                           'TA': 2, 
#                           'Fa': 1, 
#                           np.nan: 0}
# columns_ordered_ex_gd_ta_fa_no = ['PoolQC']

# mapping_gdprv_mnprv_gdwo_mnww_na = {'GdPrv': 4, 
#                                     'MnPrv': 3, 
#                                     'GdWo': 2, 
#                                     'MnWw': 1, 
#                                     np.nan: 0}
# columns_ordered_gdprv_mnprv_gdwo_mnww_na = ['Fence']

# mapping_garage_finish = {'Fin': 3,
#                          'RFn': 2,
#                          'Unf': 1,
#                          np.nan: 0}
# columns_ordered_garage_finish = ['GarageFinish']

# mapping_utils = {'AllPub': 3,
#                  'NoSewr': 2,
#                  'NoSeWa': 1,
#                  'ELO': 0}
# columns_ordered_utils = ['Utilities']

# mapping_functional = {'Typ': 7,
#                       'Min1': 6,
#                       'Min2': 5,
#                       'Mod': 4,
#                       'Maj1': 3, 
#                       'Maj2': 2, 
#                       'Sev': 1, 
#                       'Sal': 0}
# columns_ordered_functional = ['Functional']

# mapping_garage_type = {'2Types': 6,
#                        'Attchd': 5,
#                        'Basment': 4, 
#                        'BuiltIn': 3, 
#                        'CarPort': 2, 
#                        'Detchd': 1, 
#                        np.nan: 0}
# columns_ordered_garage_type = ['GarageType']

# mapping_paved_drive = {'Y': 2,
#                       'P': 1,
#                       'N': 0}
# columns_ordered_paved_drive = ['PavedDrive']

# ord_mapping = [
#     {'col': 'BsmtQual', 'mapping': mapping_ex_gd_ta_fa_po_no},
#     {'col': 'BsmtCond', 'mapping': mapping_ex_gd_ta_fa_po_no}, 
#     {'col': 'FireplaceQu', 'mapping': mapping_ex_gd_ta_fa_po_no},
#     {'col': 'GarageQual', 'mapping': mapping_ex_gd_ta_fa_po_no},
#     {'col': 'GarageCond', 'mapping': mapping_ex_gd_ta_fa_po_no},
#     {'col': 'ExterQual', 'mapping': mapping_ex_gd_ta_fa_po_no},
#     {'col': 'ExterCond', 'mapping': mapping_ex_gd_ta_fa_po_no},
#     {'col': 'HeatingQC', 'mapping': mapping_ex_gd_ta_fa_po_no},
#     {'col': 'KitchenQual', 'mapping': mapping_ex_gd_ta_fa_po_no},

#     {'col': 'BsmtExposure', 'mapping': mapping_gd_av_mn_no_na},
    
#     {'col': 'BsmtFinType1', 'mapping': mapping_glq_alq_blq_rec_lwq_unf_na},
#     {'col': 'BsmtFinType2', 'mapping': mapping_glq_alq_blq_rec_lwq_unf_na},
  
#     {'col': 'Electrical', 'mapping': mapping_electrical},
    
#     {'col': 'PoolQC', 'mapping': mapping_ex_gd_ta_fa_no},
    
#     {'col': 'Fence', 'mapping': mapping_gdprv_mnprv_gdwo_mnww_na},
    
#     {'col': 'GarageFinish', 'mapping': mapping_garage_finish},
    
#     {'col': 'Utilities', 'mapping': mapping_utils},
    
#     {'col': 'Functional', 'mapping': mapping_functional},
    
#     {'col': 'GarageType', 'mapping': mapping_garage_type},
    
#     {'col': 'PavedDrive', 'mapping': mapping_paved_drive}
# ]

# ord_encoder = OrdinalEncoder(mapping=ord_mapping, 
#                                  return_df=True)

# ordered_columns = ['BsmtQual', 
#                    'BsmtCond', 
#                    'FireplaceQu', 
#                    'GarageCond', 
#                    'BsmtExposure',
#                    'BsmtFinType1',
#                    'BsmtFinType2',
#                    'Electrical',
#                    'PoolQC',
#                    'Fence',
#                    'GarageFinish'
#                   ]


In [12]:
# Names of the columns that still hold at least one missing value after ordinal encoding.
cols_with_missing = hpc_train.columns[hpc_train.isna().any()].tolist()
cols_with_missing

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'GarageYrBlt',
 'MiscFeature']

In [13]:
# Replace missing values in the remaining string-typed columns with the explicit
# placeholder category 'Empty'.
s_imputer_empty = SimpleImputer(missing_values=np.nan,
                                strategy='constant',
                                fill_value='Empty')
cat_columns_with_nan = ['Alley',
                        'MasVnrType',
                        'MiscFeature']
df_with_nan = hpc_train[cat_columns_with_nan]

# fit() returns the imputer itself, so s_imputer_none is just another name for
# s_imputer_empty; the name is kept in case later cells refer to it.
s_imputer_none = s_imputer_empty.fit(df_with_nan)

# transform() returns a bare ndarray. Rebuild the frame with the source columns AND the
# source index: the write-back into hpc_train aligns on the index, so a default 0..n-1
# index would silently produce NaN whenever hpc_train is not indexed 0..n-1.
imputed_hpc_train = pd.DataFrame(s_imputer_empty.transform(df_with_nan),
                                 columns=cat_columns_with_nan,
                                 index=df_with_nan.index)
imputed_hpc_train

Unnamed: 0,Alley,MasVnrType,MiscFeature
0,Empty,BrkFace,Empty
1,Empty,,Empty
2,Empty,BrkFace,Empty
3,Empty,,Empty
4,Empty,BrkFace,Empty
...,...,...,...
1455,Empty,,Empty
1456,Empty,Stone,Empty
1457,Empty,,Shed
1458,Empty,,Empty


In [14]:
# Write the imputed string columns back into the main frame; the assignment aligns rows on the index.
hpc_train[cat_columns_with_nan] = imputed_hpc_train
hpc_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,0,Empty,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Empty,Reg,Lvl,3,FR2,...,0,0,0,0,Empty,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Empty,IR1,Lvl,3,Inside,...,0,0,0,0,Empty,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Empty,IR1,Lvl,3,Corner,...,0,0,0,0,Empty,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Empty,IR1,Lvl,3,FR2,...,0,0,0,0,Empty,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,0,Empty,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,3,Empty,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,4,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,0,Empty,0,4,2010,WD,Normal


In [15]:
# Re-check which columns still contain missing values after the string columns were imputed.
cols_with_missing = hpc_train.columns[hpc_train.isna().any()].tolist()
cols_with_missing

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [16]:
# Replace missing values in the remaining numeric columns with the constant 0.0.
s_imputer_zero = SimpleImputer(missing_values=np.nan,
                               strategy='constant',
                               fill_value=0.0)

num_columns_with_nan = ['LotFrontage',
                        'MasVnrArea',
                        'GarageYrBlt']
df_num_with_nan = hpc_train[num_columns_with_nan]

# fit() returns the imputer itself, so s_imputer_none is just another name for
# s_imputer_zero; the name is kept in case later cells refer to it.
s_imputer_none = s_imputer_zero.fit(df_num_with_nan)

# transform() returns a bare ndarray. Rebuild the frame with the source columns AND the
# source index: the write-back into hpc_train aligns on the index, so a default 0..n-1
# index would silently produce NaN whenever hpc_train is not indexed 0..n-1.
imputed_hpc_train = pd.DataFrame(s_imputer_zero.transform(df_num_with_nan),
                                 columns=num_columns_with_nan,
                                 index=df_num_with_nan.index)
imputed_hpc_train

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt
0,65.0,196.0,2003.0
1,80.0,0.0,1976.0
2,68.0,162.0,2001.0
3,60.0,0.0,1998.0
4,84.0,350.0,2000.0
...,...,...,...
1455,62.0,0.0,1999.0
1456,85.0,119.0,1978.0
1457,66.0,0.0,1941.0
1458,68.0,0.0,1950.0


In [17]:
# Write the imputed numeric columns back into the main frame; the assignment aligns rows on the index.
hpc_train[num_columns_with_nan] = imputed_hpc_train
hpc_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,0,Empty,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Empty,Reg,Lvl,3,FR2,...,0,0,0,0,Empty,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Empty,IR1,Lvl,3,Inside,...,0,0,0,0,Empty,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Empty,IR1,Lvl,3,Corner,...,0,0,0,0,Empty,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Empty,IR1,Lvl,3,FR2,...,0,0,0,0,Empty,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,0,Empty,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,3,Empty,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,4,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,Empty,Reg,Lvl,3,Inside,...,0,0,0,0,Empty,0,4,2010,WD,Normal


In [18]:
# Final check: this list is expected to be empty once both imputation steps have run.
cols_with_missing = hpc_train.columns[hpc_train.isna().any()].tolist()
cols_with_missing

[]

In [19]:
# Re-inspect dtypes and non-null counts after ordinal encoding and imputation.
hpc_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   int64  
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [20]:
# Categorical features (nominal): every column that is still object-typed at this point.
nominal_cat_features = hpc_train.select_dtypes(include='object').columns.tolist()
nominal_cat_features

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'Heating',
 'CentralAir',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [21]:
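# Categorical features (nominal). NOTE: the rest of this cell is a commented-out, hand-maintained list of the nominal columns; the active list is nominal_cat_features, derived with select_dtypes.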
# nominal_cat = ['Alley', 
#                'MasVnrType', 
#                'MiscFeature',
#                'MSZoning',
#                'Street',
#                'LotShape',   # ???
#                'LandContour',
#                'LotConfig',
#                'LandSlope',   # ???
#                'Neighborhood',   # ???
#                'Condition1',
#                'Condition2',
#                'BldgType',   # ???
#                'HouseStyle',
#                'RoofStyle',
#                'RoofMatl',
#                'Exterior1st',
#                'Exterior2nd',
#                'Foundation',
#                'Heating',
#                'CentralAir',   # ???
#                'SaleType',
#                'SaleCondition'   # ???
#               ]
# len(nominal_cat)

In [22]:
# One-hot encode the nominal columns. With use_cat_names=True the indicator columns are
# named <feature>_<category> (e.g. MSZoning_RL in the output below); return_df=True asks
# for a DataFrame result.
oh_encoder = OneHotEncoder(cols=nominal_cat_features, 
                           use_cat_names=True, 
                           return_df=True)
nominal_cat_encoded = oh_encoder.fit_transform(hpc_train[nominal_cat_features])
nominal_cat_encoded


  elif pd.api.types.is_categorical(cols):


Unnamed: 0,MSZoning_RL,MSZoning_RM,MSZoning_C (all),MSZoning_FV,MSZoning_RH,Street_Pave,Street_Grvl,Alley_Empty,Alley_Grvl,Alley_Pave,...,SaleType_CWD,SaleType_ConLw,SaleType_Con,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_Partial,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family
0,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1456,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1457,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1458,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [23]:
# Swap the raw nominal columns for their one-hot encoded counterparts.
# DataFrame.join returns a NEW frame, so the result must be assigned back: previously it
# was only displayed, leaving hpc_train without the nominal features and without the
# indicator columns. Chaining drop -> join (no inplace=True) rebinds hpc_train in one step.
hpc_train = hpc_train.drop(columns=nominal_cat_features).join(nominal_cat_encoded)
hpc_train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Utilities,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,SaleType_CWD,SaleType_ConLw,SaleType_Con,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_Partial,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family
0,60,65.0,8450,3,7,5,2003,2003,196.0,4,...,0,0,0,0,1,0,0,0,0,0
1,20,80.0,9600,3,6,8,1976,1976,0.0,3,...,0,0,0,0,1,0,0,0,0,0
2,60,68.0,11250,3,7,5,2001,2002,162.0,4,...,0,0,0,0,1,0,0,0,0,0
3,70,60.0,9550,3,7,5,1915,1970,0.0,3,...,0,0,0,0,0,1,0,0,0,0
4,60,84.0,14260,3,8,5,2000,2000,350.0,4,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,3,6,5,1999,2000,0.0,3,...,0,0,0,0,1,0,0,0,0,0
1456,20,85.0,13175,3,6,6,1978,1988,119.0,3,...,0,0,0,0,1,0,0,0,0,0
1457,70,66.0,9042,3,7,9,1941,2006,0.0,5,...,0,0,0,0,1,0,0,0,0,0
1458,20,68.0,9717,3,5,6,1950,1996,0.0,3,...,0,0,0,0,1,0,0,0,0,0


In [24]:
# Sanity check: list any columns that are still object-typed (an empty Index means none remain).
hpc_train.select_dtypes(include=['object']).columns

Index([], dtype='object')