# House Prices - Advanced Regression Techniques

## Predict sales prices and practice feature engineering, RFs, and gradient boosting

## 1. 라이브러리 선언

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
!pip install lightgbm
from lightgbm import LGBMRegressor
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization
import numpy as np



## 2. EDA & 전처리

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
print('train_shape:', train.shape)
print('test_shape:', test.shape)

train_shape: (1460, 81)
test_shape: (1459, 80)


## 2-1. 결측치 확인

In [6]:
train.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

## 2-2. 결측치의 양이 40%가 넘는 칼럼 제거

In [7]:
num_null = train.isnull().sum() / len(train)
remove_col = num_null[num_null >= 0.4]
print(remove_col)

Alley          0.937671
FireplaceQu    0.472603
PoolQC         0.995205
Fence          0.807534
MiscFeature    0.963014
dtype: float64


In [8]:
train = train.drop(remove_col.keys(), axis = 1)
test = test.drop(remove_col.keys(), axis = 1)
print('train_shape:', train.shape)
print('test_shape:', test.shape)

train_shape: (1460, 76)
test_shape: (1459, 75)


In [9]:
X = train.drop(['SalePrice'], axis = 1)
y = train['SalePrice']
print('X_shape:', X.shape)
print('y_shape:', y.shape)

X_shape: (1460, 75)
y_shape: (1460,)


## 2-3. X 데이터와 test 데이터를 합친 뒤 칼럼을 문자와 숫자로 나누고 문자인 칼럼만 One-Hot Encoder 사용

In [10]:
boston = pd.concat((X, test), axis = 0)
print('boston_shape:', boston.shape)

boston_shape: (2919, 75)


In [11]:
boston_obj = boston.select_dtypes(include = 'object')
boston_num = boston.select_dtypes(exclude = 'object')
print('object_col:', boston_obj.columns)
print('num_col:', boston_num.columns)
print('object_shape:', boston_obj.shape)
print('num_shape:', boston_num.shape)

object_col: Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')
num_col: Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fire

In [12]:
boston_dum = pd.get_dummies(boston_obj,drop_first = True)
print('boston_dum_shape:', boston_dum.shape)

boston_dum_shape: (2919, 196)


In [13]:
boston = pd.concat([boston_dum,boston_num], axis = 1)
print('boston_shape:', boston.shape)
print('boston_len:', len(boston))

boston_shape: (2919, 233)
boston_len: 2919


In [14]:
train_one = boston[:len(X)]
test_one = boston[len(X):]
print('train_one_shape:', train_one.shape)
print('test_one_shape:', test_one.shape)
print('train_one_head:', train_one.head())
print('test_one_head:', test_one.head())

train_one_shape: (1460, 233)
test_one_shape: (1459, 233)
train_one_head:    MSZoning_FV  MSZoning_RH  MSZoning_RL  MSZoning_RM  Street_Pave  \
0            0            0            1            0            1   
1            0            0            1            0            1   
2            0            0            1            0            1   
3            0            0            1            0            1   
4            0            0            1            0            1   

   LotShape_IR2  LotShape_IR3  LotShape_Reg  LandContour_HLS  LandContour_Low  \
0             0             0             1                0                0   
1             0             0             1                0                0   
2             0             0             0                0                0   
3             0             0             0                0                0   
4             0             0             0                0                0   

   ...  GarageArea 

In [15]:
X = train_one

## 3. LGBMRegressor의 하이퍼파라미터를 BayesianOptimization으로 결정 

### 3-1. 하이퍼파라미터 범위

In [16]:
lgbm_parameter_bounds = {
    'learning_rate':(0.0001,0.01),
    'n_estimators' : (10,1000),
    'max_depth' : (1,10),
    'subsample' : (0.1,1),
    'colsample_bytree' : (0.5,1),
    'num_leaves': (24, 45),
    }

In [17]:
def lgbm_bo(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, num_leaves):
    lgbm_params = {
        'learning_rate' : float(learning_rate),
        'n_estimators' : int(n_estimators),
        'max_depth' : int(max_depth),
        'subsample' : float(subsample),
        'colsample_bytree' : float(colsample_bytree),
        'num_leaves' : int(num_leaves),
    }
    lgbm = LGBMRegressor(**lgbm_params)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)
    lgbm.fit(X_train, y_train)
    mse_score = cross_val_score(lgbm, X_valid, y_valid, scoring = 'neg_mean_squared_error', cv = 5)
    score = np.mean(mse_score)
    return score

### 3-2. 점수가 제일 높은 하이퍼파라미터로 LGBMRegressor 적용

In [19]:
BO_lgbm = BayesianOptimization(f = lgbm_bo, pbounds = lgbm_parameter_bounds, random_state = 0)

#학습 시작
BO_lgbm.maximize(init_points = 10, n_iter = 100)

max_params = BO_lgbm.max['params']
max_params['learning_rate'] = float(max_params['learning_rate'])
max_params['n_estimators'] = int(max_params['n_estimators'])
max_params['max_depth'] = int(max_params['max_depth'])
max_params['subsample'] = float(max_params['subsample'])
max_params['colsample_bytree'] = float(max_params['colsample_bytree'])
max_params['num_leaves'] = int(max_params['num_leaves'])
print(max_params)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.834e+0[0m | [0m 0.7744  [0m | [0m 0.00718 [0m | [0m 6.425   [0m | [0m 549.4   [0m | [0m 32.9    [0m | [0m 0.6813  [0m |
| [95m 2       [0m | [95m-1.157e+0[0m | [95m 0.7188  [0m | [95m 0.008929[0m | [95m 9.673   [0m | [95m 389.6   [0m | [95m 40.63   [0m | [95m 0.576   [0m |
| [0m 3       [0m | [0m-2.314e+0[0m | [0m 0.784   [0m | [0m 0.009263[0m | [0m 1.639   [0m | [0m 96.26   [0m | [0m 24.42   [0m | [0m 0.8494  [0m |
| [95m 4       [0m | [95m-9.565e+0[0m | [95m 0.8891  [0m | [95m 0.008713[0m | [95m 9.808   [0m | [95m 801.2   [0m | [95m 33.69   [0m | [95m 0.8025  [0m |
| [0m 5       [0m | [0m-1.523e+0[0m | [0m 0.5591  [0m | [0m 0.006435[0m | [0m 2.29    [0m | [0m 945.2   [0m | [0m 34.96   [0m 

| [0m 52      [0m | [0m-1.676e+0[0m | [0m 0.9508  [0m | [0m 0.00595 [0m | [0m 5.721   [0m | [0m 484.8   [0m | [0m 40.94   [0m | [0m 0.9447  [0m |
| [0m 53      [0m | [0m-8.499e+0[0m | [0m 0.5435  [0m | [0m 0.00674 [0m | [0m 4.441   [0m | [0m 823.1   [0m | [0m 41.28   [0m | [0m 0.8678  [0m |
| [0m 54      [0m | [0m-1.125e+0[0m | [0m 0.5314  [0m | [0m 0.005631[0m | [0m 2.112   [0m | [0m 780.3   [0m | [0m 41.31   [0m | [0m 0.6751  [0m |
| [0m 55      [0m | [0m-3.068e+0[0m | [0m 0.7347  [0m | [0m 0.001013[0m | [0m 4.977   [0m | [0m 957.9   [0m | [0m 35.04   [0m | [0m 0.5515  [0m |
| [0m 56      [0m | [0m-1.641e+0[0m | [0m 0.6326  [0m | [0m 0.003608[0m | [0m 6.579   [0m | [0m 484.9   [0m | [0m 41.89   [0m | [0m 0.7965  [0m |
| [0m 57      [0m | [0m-8.125e+0[0m | [0m 0.7255  [0m | [0m 0.007666[0m | [0m 9.251   [0m | [0m 622.1   [0m | [0m 34.02   [0m | [0m 0.3018  [0m |
| [0m 58      [0m | [0m-8

| [0m 103     [0m | [0m-4.39e+09[0m | [0m 0.5499  [0m | [0m 0.001641[0m | [0m 4.859   [0m | [0m 174.0   [0m | [0m 33.11   [0m | [0m 0.3893  [0m |
| [0m 104     [0m | [0m-2.235e+0[0m | [0m 0.7266  [0m | [0m 0.002696[0m | [0m 1.922   [0m | [0m 438.7   [0m | [0m 42.81   [0m | [0m 0.4977  [0m |
| [0m 105     [0m | [0m-1.604e+0[0m | [0m 0.749   [0m | [0m 0.008429[0m | [0m 6.054   [0m | [0m 350.9   [0m | [0m 32.64   [0m | [0m 0.2426  [0m |
| [0m 106     [0m | [0m-1.023e+0[0m | [0m 0.6241  [0m | [0m 0.008933[0m | [0m 9.55    [0m | [0m 708.2   [0m | [0m 26.94   [0m | [0m 0.4288  [0m |
| [0m 107     [0m | [0m-4.376e+0[0m | [0m 0.6074  [0m | [0m 0.000441[0m | [0m 2.408   [0m | [0m 864.3   [0m | [0m 43.27   [0m | [0m 0.198   [0m |
| [0m 108     [0m | [0m-2.567e+0[0m | [0m 0.6143  [0m | [0m 0.004972[0m | [0m 9.895   [0m | [0m 222.0   [0m | [0m 27.85   [0m | [0m 0.569   [0m |
| [0m 109     [0m | [0m-2

In [22]:
lgbm_tune = LGBMRegressor(learning_rate = 0.008952243102064422, max_depth = 9, n_estimators = 504, num_leaves = 31, subsample = 0.7749632785399652, colsample_bytree = 0.8238472358016229)
lgbm_tune.fit(X, y)

LGBMRegressor(colsample_bytree=0.8238472358016229,
              learning_rate=0.008952243102064422, max_depth=9, n_estimators=504,
              subsample=0.7749632785399652)

### 3-3. 결과 파일 생성

In [23]:
pred = lgbm_tune.predict(test_one)
sub = pd.read_csv('sample_submission.csv')
sub['SalePrice'] = pred
sub.to_csv('sub_lgbm_4.csv',index = False)

## 최종 Score: 0.13204