# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

## Импорт пакетов

In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OrdinalEncoder
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

## Загрузка данных

In [16]:
# Read the competition training set into a DataFrame; the path is relative to
# the directory the notebook is run from.
train_csv_path = "./../../../data/hpc_train.csv"
hpc_train = pd.read_csv(filepath_or_buffer=train_csv_path)
hpc_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [17]:
# Print the dtype and non-null count of every column to see which features
# contain missing values (e.g. LotFrontage, Alley).
hpc_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [18]:
# Keep the regression target (sale price) in its own Series before the column
# is removed from the feature frame.
y = hpc_train['SalePrice']
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [19]:
# Remove the target ('SalePrice', already stored in `y`) and the 'Id' column
# (presumably just a row identifier — confirm) from the feature frame.
# The drop returns a new frame instead of mutating in place, and
# errors='ignore' makes the cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
hpc_train = hpc_train.drop(columns=['SalePrice', 'Id'], errors='ignore')
hpc_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


In [20]:
# Frequency of each 'Electrical' category in the raw data (missing values are
# not counted by value_counts by default).
hpc_train[['Electrical']].value_counts()

Electrical
SBrkr         1334
FuseA           94
FuseF           27
FuseP            3
Mix              1
dtype: int64

In [22]:
# Ordinal encoding of the ordered categorical features.
#
# Each `mapping_*` dict assigns an integer code to every level of one scale and
# gives missing values (np.nan) an explicit code as well. Each
# `columns_ordered_*` list names the columns that use that scale.

# Ex / Gd / TA / Fa / Po scale; missing -> 0.
mapping_ex_gd_ta_fa_po_no = {'Ex': 5,
                             'Gd': 4,
                             'TA': 3,
                             'Fa': 2,
                             'Po': 1,
                             np.nan: 0}
columns_ordered_ex_gd_ta_fa_po_no = ['BsmtQual',
                                     'BsmtCond',
                                     'FireplaceQu',
                                     'GarageCond']

# Gd / Av / Mn / No scale; missing -> 0.
mapping_gd_av_mn_no_na = {'Gd': 4,
                          'Av': 3,
                          'Mn': 2,
                          'No': 1,
                          np.nan: 0}
columns_ordered_gd_av_mn_no_na = ['BsmtExposure']

# GLQ / ALQ / BLQ / Rec / LwQ / Unf scale; missing -> 0.
mapping_glq_alq_blq_rec_lwq_unf_na = {'GLQ': 6,
                                      'ALQ': 5,
                                      'BLQ': 4,
                                      'Rec': 3,
                                      'LwQ': 2,
                                      'Unf': 1,
                                      np.nan: 0}
columns_ordered_glq_alq_blq_rec_lwq_unf_na = ['BsmtFinType1',
                                              'BsmtFinType2']

# Electrical system. Unlike the other scales, a missing value is NOT mapped to
# 0 here: it receives the same code as 'SBrkr', the most frequent category in
# the training data, i.e. it is imputed with the mode.
mapping_electrical = {'SBrkr': 4,
                      'FuseA': 3,
                      'FuseF': 2,
                      'FuseP': 1,
                      'Mix': 0,
                      np.nan: 4}
columns_ordered_electrical = ['Electrical']

# Ex / Gd / TA / Fa scale (no 'Po' level); missing -> 0.
mapping_ex_gd_ta_fa_no = {'Ex': 4,
                          'Gd': 3,
                          'TA': 2,
                          'Fa': 1,
                          np.nan: 0}
columns_ordered_ex_gd_ta_fa_no = ['PoolQC']

# Fence scale; missing -> 0.
mapping_gdprv_mnprv_gdwo_mnww_na = {'GdPrv': 4,
                                    'MnPrv': 3,
                                    'GdWo': 2,
                                    'MnWw': 1,
                                    np.nan: 0}
columns_ordered_gdprv_mnprv_gdwo_mnww_na = ['Fence']

# Garage finish scale; missing -> 0.
mapping_garage_finish = {'Fin': 3,
                         'RFn': 2,
                         'Unf': 1,
                         np.nan: 0}
columns_ordered_garage_finish = ['GarageFinish']

# Single source of truth: every (columns, mapping) pair, in the order in which
# the columns are handed to the encoder. `ord_mapping` and `ordered_columns`
# are derived from it so the three can no longer drift apart.
ordinal_groups = [
    (columns_ordered_ex_gd_ta_fa_po_no, mapping_ex_gd_ta_fa_po_no),
    (columns_ordered_gd_av_mn_no_na, mapping_gd_av_mn_no_na),
    (columns_ordered_glq_alq_blq_rec_lwq_unf_na, mapping_glq_alq_blq_rec_lwq_unf_na),
    (columns_ordered_electrical, mapping_electrical),
    (columns_ordered_ex_gd_ta_fa_no, mapping_ex_gd_ta_fa_no),
    (columns_ordered_gdprv_mnprv_gdwo_mnww_na, mapping_gdprv_mnprv_gdwo_mnww_na),
    (columns_ordered_garage_finish, mapping_garage_finish),
]

# One {'col': ..., 'mapping': ...} entry per encoded column, as passed to
# OrdinalEncoder(mapping=...).
ord_mapping = [
    {'col': column, 'mapping': level_codes}
    for columns, level_codes in ordinal_groups
    for column in columns
]

# Flat list of every ordinal-encoded column, in encoder order.
ordered_columns = [entry['col'] for entry in ord_mapping]

ord_encoder = OrdinalEncoder(mapping=ord_mapping,
                             return_df=True)

# This cell overwrites `hpc_train` with its encoded version, so running it a
# second time would feed integer codes back into mappings keyed by category
# labels. Fail loudly instead of encoding twice.
already_encoded = [
    column for column in ordered_columns
    if pd.api.types.is_numeric_dtype(hpc_train[column])
    and hpc_train[column].notna().any()
]
if already_encoded:
    raise RuntimeError(
        "Ordinal columns are already numeric (cell was run twice?): "
        f"{already_encoded}. Reload hpc_train before encoding again."
    )

hpc_train = ord_encoder.fit_transform(hpc_train)
hpc_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,0,0,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,0,0,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,0,0,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,0,0,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,0,0,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,0,0,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,0,3,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,0,4,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,0,0,,0,4,2010,WD,Normal


In [23]:
# Re-check 'Electrical' after ordinal encoding: the categories are now integer
# codes, and code 4 has one more row than 'SBrkr' had because the missing
# value is mapped to 4 as well.
hpc_train[['Electrical']].value_counts()

Electrical
4             1335
3               94
2               27
1                3
0                1
dtype: int64

In [8]:
# Spot-check the columns encoded with the Ex/Gd/TA/Fa/Po mapping: they should
# now hold integer codes (0 where the original value was missing).
hpc_train[columns_ordered_ex_gd_ta_fa_po_no]

Unnamed: 0,BsmtQual,BsmtCond,FireplaceQu,GarageCond
0,4,3,0,3
1,4,3,3,3
2,4,3,3,3
3,3,4,4,3
4,4,3,3,3
...,...,...,...,...
1455,4,3,3,3
1456,4,3,3,3
1457,3,4,4,3
1458,3,3,0,3


In [9]:
# Unordered categorical columns whose missing values are replaced by the
# explicit category 'Empty' (presumably NaN means the house lacks the feature
# — confirm against the data description).
categorical_columns_with_nan = ['Alley',
                                'MasVnrType',
                                'GarageType',
                                'MiscFeature']

s_imputer_empty = SimpleImputer(missing_values=np.nan,
                                strategy='constant',
                                fill_value='Empty')
df_with_nan = hpc_train[categorical_columns_with_nan]

# fit() returns the estimator itself, so `s_imputer_none` is only a second
# name for `s_imputer_empty`; it is kept in case other cells refer to it.
s_imputer_none = s_imputer_empty.fit(df_with_nan)

# transform() returns a bare array: rebuild the frame with the column labels
# and the row index of the input so it stays aligned with `hpc_train`.
imputed_hpc_train = pd.DataFrame(s_imputer_empty.transform(df_with_nan),
                                 columns=df_with_nan.columns,
                                 index=df_with_nan.index)
imputed_hpc_train

Unnamed: 0,Alley,MasVnrType,GarageType,MiscFeature
0,Empty,BrkFace,Attchd,Empty
1,Empty,,Attchd,Empty
2,Empty,BrkFace,Attchd,Empty
3,Empty,,Detchd,Empty
4,Empty,BrkFace,Attchd,Empty
...,...,...,...,...
1455,Empty,,Attchd,Empty
1456,Empty,Stone,Attchd,Empty
1457,Empty,,Attchd,Shed
1458,Empty,,Attchd,Empty


In [10]:
# One-hot encode the imputed nominal columns; with use_cat_names=True the new
# columns are named "<column>_<category>" (e.g. Alley_Empty, Alley_Grvl).
one_hot_columns = ['Alley', 'MasVnrType', 'GarageType', 'MiscFeature']
oh_encoder = OneHotEncoder(
    cols=one_hot_columns,
    use_cat_names=True,
    return_df=True,
)
df = oh_encoder.fit_transform(imputed_hpc_train)
df

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Alley_Empty,Alley_Grvl,Alley_Pave,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,MasVnrType_BrkCmn,MasVnrType_Empty,GarageType_Attchd,GarageType_Detchd,GarageType_BuiltIn,GarageType_CarPort,GarageType_Empty,GarageType_Basment,GarageType_2Types,MiscFeature_Empty,MiscFeature_Shed,MiscFeature_Gar2,MiscFeature_Othr,MiscFeature_TenC
0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
2,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1456,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1457,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
1458,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0


In [11]:
# Numeric columns that contain missing values; every NaN is replaced by 0.0.
# NOTE(review): 0.0 is far outside the range of real 'GarageYrBlt' years —
# confirm it is intended as a "no garage" marker.
s_imputer_zero = SimpleImputer(missing_values=np.nan,
                               strategy='constant',
                               fill_value=0.0)

num_columns_with_nan = ['LotFrontage',
                        'MasVnrArea',
                        'GarageYrBlt']
df_num_with_nan = hpc_train[num_columns_with_nan]

# fit() returns the estimator itself, so `s_imputer_none` is only a second
# name for `s_imputer_zero`. NOTE: this rebinds `s_imputer_none` and
# `imputed_hpc_train`, which the categorical imputation cell also assigns, so
# the one-hot encoding cell has to run before this one.
s_imputer_none = s_imputer_zero.fit(df_num_with_nan)

# transform() returns a bare array: rebuild the frame with the column labels
# and the row index of the input so it stays aligned with `hpc_train`.
imputed_hpc_train = pd.DataFrame(s_imputer_zero.transform(df_num_with_nan),
                                 columns=df_num_with_nan.columns,
                                 index=df_num_with_nan.index)
imputed_hpc_train

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt
0,65.0,196.0,2003.0
1,80.0,0.0,1976.0
2,68.0,162.0,2001.0
3,60.0,0.0,1998.0
4,84.0,350.0,2000.0
...,...,...,...
1455,62.0,0.0,1999.0
1456,85.0,119.0,1978.0
1457,66.0,0.0,1941.0
1458,68.0,0.0,1950.0


In [12]:
# Names of all int64/float64 columns. After the ordinal encoding step this
# also includes the encoded scale columns such as BsmtQual or Fence.
numeric_dtypes = ['int64', 'float64']
numeric_features = hpc_train.select_dtypes(include=numeric_dtypes).columns
numeric_features

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

In [13]:
# Names of the columns still stored as Python objects (strings), i.e. the
# categorical features that have not been numerically encoded yet.
object_typed_frame = hpc_train.select_dtypes(include=['object'])
categorical_features = object_typed_frame.columns
categorical_features

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional',
       'GarageType', 'GarageQual', 'PavedDrive', 'MiscFeature', 'SaleType',
       'SaleCondition'],
      dtype='object')