# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course)

## Импорт пакетов

In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OrdinalEncoder
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

## Загрузка данных

In [2]:
hpc_train = pd.read_csv("./../../../data/hpc_train.csv")
hpc_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
hpc_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
y = hpc_train.SalePrice
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [5]:
hpc_train.drop(columns=['SalePrice', 'Id'], inplace=True)
hpc_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


In [6]:
X = hpc_train.copy()
X

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


---
## Pipeline

In [7]:
ordered_feature_mapping = {
    ('BsmtQual', 
     'BsmtCond', 
     'FireplaceQu',
     'GarageQual',
     'GarageCond',
     'ExterQual',
     'ExterCond',
     'HeatingQC',
     'KitchenQual'): {'Ex': 5, 
                      'Gd': 4, 
                      'TA': 3, 
                      'Fa': 2, 
                      'Po': 1, 
                      np.nan: 0},
    
    ('BsmtExposure', ): {'Gd': 4, 
                     'Av': 3, 
                     'Mn': 2, 
                     'No': 1, 
                     np.nan: 0},
    
    ('BsmtFinType1', 
     'BsmtFinType2'): {'GLQ': 6,
                       'ALQ': 5,
                       'BLQ': 4, 
                       'Rec': 3, 
                       'LwQ': 2, 
                       'Unf': 1, 
                       np.nan: 0},
    
    ('Electrical', ): {'SBrkr': 4, 
                   'FuseA': 3, 
                   'FuseF': 2, 
                   'FuseP': 1, 
                   'Mix': 0,
                   np.nan: 4},
    
    ('PoolQC', ): {'Ex': 4, 
               'Gd': 3, 
               'TA': 2, 
               'Fa': 1, 
               np.nan: 0},
    
    ('Fence', ): {'GdPrv': 4,
              'MnPrv': 3, 
              'GdWo': 2, 
              'MnWw': 1, 
              np.nan: 0},
    
    ('GarageFinish', ): {'Fin': 3,
                     'RFn': 2,
                     'Unf': 1,
                     np.nan: 0},
    
    ('Utilities', ): {'AllPub': 3,
                  'NoSewr': 2,
                  'NoSeWa': 1,
                  'ELO': 0},
    
    ('Functional', ): {'Typ': 7,
                   'Min1': 6,
                   'Min2': 5,
                   'Mod': 4,
                   'Maj1': 3, 
                   'Maj2': 2, 
                   'Sev': 1, 
                   'Sal': 0},
    
    ('GarageType', ): {'2Types': 6,
                   'Attchd': 5,
                   'Basment': 4, 
                   'BuiltIn': 3, 
                   'CarPort': 2, 
                   'Detchd': 1, 
                   np.nan: 0},
    
    ('PavedDrive', ): {'Y': 2,
                   'P': 1,
                   'N': 0}
}

In [8]:
ord_mapping = []
ord_features = []
for feat_names, mapping in ordered_feature_mapping.items():
    for feat_name in feat_names:
        ord_mapping.append({'col': feat_name, 'mapping': mapping})
        ord_features.append(feat_name)

In [9]:
# Числовые признаки с nan
num_features_with_nan = ['LotFrontage', 
                        'MasVnrArea',
#                         'GarageYrBlt'
                        ]

In [10]:
# Категориальные признаки (номинальные) с nan
obj_features_with_nan = ['Alley', 
                        'MasVnrType',
                        'MiscFeature'
                        ]

In [11]:
# Категориальные признаки (номинальные) без nan
obj_features_without_nan = [
               'MSZoning',
               'Street',
               'LotShape',   # ???
               'LandContour',
               'LotConfig',
               'LandSlope',   # ???
               'Neighborhood',   # ???
               'Condition1',
               'Condition2',
               'BldgType',   # ???
               'HouseStyle',
               'RoofStyle',
               'RoofMatl',
               'Exterior1st',
               'Exterior2nd',
               'Foundation',
               'Heating',
               'CentralAir',   # ???
               'SaleType',
               'SaleCondition'   # ???
              ]

In [21]:
obj_features = obj_features_with_nan + obj_features_without_nan + ord_features
obj_features

['Alley',
 'MasVnrType',
 'MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'Foundation',
 'Heating',
 'CentralAir',
 'SaleType',
 'SaleCondition',
 'BsmtQual',
 'BsmtCond',
 'FireplaceQu',
 'GarageQual',
 'GarageCond',
 'ExterQual',
 'ExterCond',
 'HeatingQC',
 'KitchenQual',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'PoolQC',
 'Fence',
 'GarageFinish',
 'Utilities',
 'Functional',
 'GarageType',
 'PavedDrive']

In [25]:
ord_enc_step = ('ord_enc', 
                OrdinalEncoder(mapping=ord_mapping))


obj_cat_imp_step = ('str_cat_imp', 
                    SimpleImputer(missing_values=np.nan, 
                                  strategy='constant', 
                                  fill_value='Empty'))

num_cat_imp_step = ('num_cat_imp', 
                    SimpleImputer(missing_values=np.nan, 
                                  strategy='constant', 
                                  fill_value=0.0))

oh_enc_step = ('oh_enc', OneHotEncoder())

ord_transformer = Pipeline(steps=[
    ord_enc_step
])

obj_cat_transformer = Pipeline(steps=[
    obj_cat_imp_step,
    oh_enc_step
])



num_cat_transformer = Pipeline(steps=[
    num_cat_imp_step
])

preprocessor = ColumnTransformer(
    transformers=[
#         ('ord_cat', ord_transformer, ord_features),
        ('num_cat', num_cat_transformer, num_features_with_nan),
        ('obj_cat', obj_cat_transformer, obj_features),
    ])


from sklearn.ensemble import RandomForestRegressor
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
#         ('regressor', XGBRegressor(learning_rate = 0.01, 
#                                    max_depth = 5,
#                                    n_estimators = 500
#                                   ))
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=0))
    ])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0)
X_train

pipeline.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_cat',
                                                  Pipeline(steps=[('num_cat_imp',
                                                                   SimpleImputer(fill_value=0.0,
                                                                                 strategy='constant'))]),
                                                  ['LotFrontage',
                                                   'MasVnrArea']),
                                                 ('obj_cat',
                                                  Pipeline(steps=[('str_cat_imp',
                                                                   SimpleImputer(fill_value='Empty',
                                                                                 strategy='constant')),
                                                                  ('oh_enc',
                                                               

In [26]:
pd.pandas.set_option('display.max_columns', None)
df = pd.DataFrame(preprocessor.transform(X_train))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Columns: 261 entries, 0 to 260
dtypes: float64(261)
memory usage: 2.3 MB


In [27]:
y_pred = pipeline.predict(X_test)
print('MAE:', mean_absolute_error(y_test, y_pred))

MAE: 25520.781490541423


In [16]:
hpc_test = pd.read_csv("./../../../data/hpc_test.csv")
test = hpc_test.drop(columns=['Id'])
test

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,2Story,4,7,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.0,TA,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,546.0,546.0,GasA,Gd,Y,SBrkr,546,546,0,1092,0.0,0.0,1,1,3,1,TA,5,Typ,0,,,,,0.0,0.0,,,Y,0,0,0,0,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,5,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,252.0,Unf,0.0,294.0,546.0,GasA,TA,Y,SBrkr,546,546,0,1092,0.0,0.0,1,1,3,1,TA,6,Typ,0,,CarPort,1970.0,Unf,1.0,286.0,TA,TA,Y,0,24,0,0,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1Story,5,7,1960,1996,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,ALQ,1224.0,Unf,0.0,0.0,1224.0,GasA,Ex,Y,SBrkr,1224,0,0,1224,1.0,0.0,1,0,4,1,TA,7,Typ,1,TA,Detchd,1960.0,Unf,2.0,576.0,TA,TA,Y,474,0,0,0,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,SFoyer,5,5,1992,1992,Gable,CompShg,HdBoard,Wd Shng,,0.0,TA,TA,PConc,Gd,TA,Av,GLQ,337.0,Unf,0.0,575.0,912.0,GasA,TA,Y,SBrkr,970,0,0,970,0.0,1.0,1,0,3,1,TA,6,Typ,0,,,,,0.0,0.0,,,Y,80,32,0,0,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [17]:
# hpc_test_after_preproc = pd.DataFrame(preprocessor.fit_transform(hpc_test))
# hpc_test_after_preproc

In [18]:
y_pred_test = pipeline.predict(test)

In [19]:
output = pd.DataFrame({'Id': hpc_test.Id,
                     'SalePrice': y_pred_test})

output_file = f'submission.csv'

output.to_csv(output_file, index=False)

print(f'{output_file} saved!')

submission.csv saved!
