# 모델의 성능 높이기

1. 피처 엔지니어링 하기 
2. 다양한 모델에 대해 하이퍼 파라미터 튜닝하기
3. 다양한 하이퍼 파라미터에 대해 그리드 탐색을 시도해서 최적의 조합을 찾아보기
4. Baseline 커널에서 활용했던 블렌딩 방법 활용하기

평가 지표

- 정리하기
- 스코어 110000 넘기

1. 피처 엔지니어링은 잘 모르겠음. 일단 모델이랑 파라미터를 바꿔보기

In [3]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np
from tqdm import tqdm

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

# 1. 피처 엔지니어링
## 데이터 불러오기

In [73]:
# Directory holding the competition CSV files, built from the HOME environment variable.
data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'

# Full paths of the training and test CSV files.
train_data_path = join(data_dir, 'train.csv')
test_data_path = join(data_dir, 'test.csv') 

# Load both splits into DataFrames.
train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

In [79]:
train.head()

Unnamed: 0,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,201410,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,201502,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,201502,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,201406,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,201501,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


In [84]:
train.columns

Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')

## 전처리하기

In [75]:
# --- Training data ---
# Keep only the first six characters of each date value and store it as an int
# (presumably YYYYMM, judging by displayed values such as 201410 — confirm).
train['date'] = train['date'].apply(lambda raw: raw[:6]).astype(int)
# Move the target column out of the feature frame, then drop the row id.
y = train.pop('price')
del train['id']

# --- Test data ---
# Same date truncation and id removal as for the training frame.
test['date'] = test['date'].apply(lambda raw: raw[:6]).astype(int)
del test['id']

In [76]:
# Log-transform the target: y becomes log(1 + y).
y = np.log1p(y)

# 채점 함수 준비

In [8]:
from sklearn.model_selection import train_test_split
# RMSE점수 계산
from sklearn.metrics import mean_squared_error

# RMSE helper
def rmse(y_test, y_pred):
    """Return the root-mean-squared error after applying ``np.expm1`` to both inputs.

    Args:
        y_test: Reference target values.
        y_pred: Predicted values, same length as ``y_test``.

    Returns:
        The square root of ``mean_squared_error`` computed on the
        ``expm1``-transformed values.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

In [9]:
# Score several models at once
def get_scores(models, train, y):
    """Fit each model on one shared hold-out split and tabulate its RMSE.

    Args:
        models: Iterable of estimators exposing ``fit`` and ``predict``.
        train: Feature table passed to ``train_test_split``.
        y: Target values aligned with ``train``.

    Returns:
        A DataFrame indexed by model class name with a single ``RMSE`` column,
        sorted from highest to lowest RMSE. The table is empty when ``models``
        is empty.

    Note:
        Results are keyed by ``model.__class__.__name__``, so two models of the
        same class overwrite each other's entry. The split seed comes from the
        module-level ``random_state``.
    """
    # The split depends only on the data and the seed, so compute it once
    # instead of repeating it for every model.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2)

    scores = {}
    for model in tqdm(models):
        model.fit(X_train, y_train)
        scores[model.__class__.__name__] = rmse(y_test, model.predict(X_test))

    # Build the table after the loop: this avoids rebuilding it per model and
    # keeps the return value defined even when no model was given.
    return pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)

In [10]:
# Score a single model
def get_score(model, train, y):
    """Fit ``model`` on an 80/20 hold-out split and return its RMSE.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        train: Feature table passed to ``train_test_split``.
        y: Target values aligned with ``train``.

    Returns:
        The value of ``rmse`` on the held-out 20% of the data.

    Note:
        The split seed comes from the module-level ``random_state``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2)
    model.fit(X_train, y_train)
    return rmse(y_test, model.predict(X_test))

In [11]:
# 에버리지 블랜딩 점수 보기
from tqdm import tqdm

def AveragingBlendingGetScore(models, train, y):
    """Score an equal-weight average of several models on a hold-out split.

    Args:
        models: Sequence of dicts, each holding an estimator under the
            ``'model'`` key.
        train: Feature DataFrame; its ``.values`` array is what the models see.
        y: Target values aligned with ``train``.

    Returns:
        The value of ``rmse`` for the row-wise mean of the models' predictions
        on the held-out 20% of the data.
    """
    # Hold-out split, seeded by the module-level random_state
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.2, random_state=random_state)

    # Fit every member of the blend on the raw NumPy arrays
    for entry in tqdm(models):
        entry['model'].fit(X_train.values, y_train)

    # One column of predictions per model, averaged across columns
    per_model = [entry['model'].predict(X_test.values) for entry in models]
    blended = np.column_stack(per_model).mean(axis=1)
    return rmse(y_test, blended)

# 하이퍼파라미터 탐색

In [17]:
# 모델 
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from sklearn.model_selection import GridSearchCV

def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    """Run a 3-fold grid search and return one row per parameter combination.

    Args:
        model: Estimator handed to ``GridSearchCV``.
        train: Feature table to fit on.
        y: Target values aligned with ``train``.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.
        verbose: Verbosity forwarded to ``GridSearchCV``.
        n_jobs: Parallel job count forwarded to ``GridSearchCV``.

    Returns:
        A DataFrame with the parameter columns, ``score`` (mean test
        ``neg_mean_squared_error``) and ``RMSLE`` (square root of the negated
        score), sorted by ascending ``RMSLE``.

    Note:
        The ``RMSLE`` column is a log-scale error only if ``y`` is already
        log-transformed — the function itself does not transform it.
    """
    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=3,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    search.fit(train, y)

    # Collect the tried combinations together with their mean CV score
    cv_results = search.cv_results_
    table = pd.DataFrame(cv_results['params'])
    table['score'] = cv_results['mean_test_score']
    # The score is a negated MSE, so flip the sign before taking the root
    table['RMSLE'] = np.sqrt(-1 * table['score'])

    return table.sort_values(by=['RMSLE'])

# Seed shared by the train/test splits and the model constructors in this notebook.
random_state=2022

### 파라미터 기록

121744.07926966541
learning_rate	max_depth	n_estimators	num_leaves	   score	  RMSLE
0.05	        10	        500	            32	         -0.025869	 0.160837

LGBMRegressor(n_estimators=500, max_depth=10, learning_rate=0.05,
                         num_leaves=32, random_state=random_state)

max_depth	min_samples_leaf	min_samples_split	n_estimators	score	RMSLE
15	        8	                8	                200      	-0.034792	0.186525

RandomForestRegressor(max_depth=15,min_samples_leaf=8, min_samples_split=8,
                      n_estimators=200,random_state=random_state)

127521.80309206048
eta	max_depth	score	RMSLE
0.1	7	     -0.027732	0.16653
XGBRegressor(max_depth=7, eta=0.1, random_state=random_state)

119086.80687715334
learning_rate	max_depth	max_features	n_estimators	score	RMSLE
0.2	            6	        5 	            100         -0.029621	0.172109

GradientBoostingRegressor(max_depth=6, n_estimators=55,max_features=5,learning_rate=0.2,random_state=random_state)

### 그리드 서치

In [114]:
# Grid handed to the search; each hyperparameter currently lists a single value.
param_grid = {
   'learning_rate' : [0.05],
    'max_depth' : [10],
    'n_estimators' : [500],
    'num_leaves' : [32],
}

# Search over the grid with a LightGBM regressor seeded by random_state.
model = LGBMRegressor(random_state=random_state)
my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Unnamed: 0,learning_rate,max_depth,n_estimators,num_leaves,score,RMSLE
0,0.05,10,500,32,-0.026341,0.162299


# 결과 저장 함수

In [None]:
def save_submission(model, train, y, test, model_name, rmsle):
    """Fit ``model`` on all training data and write its test predictions to CSV.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        train: Training feature table.
        y: Training target values.
        test: Test feature table to predict on.
        model_name: Label embedded in the output file name.
        rmsle: Score label embedded in the output file name.

    Returns:
        None. The sample submission is read from the data directory, its
        ``price`` column is overwritten, and the result is saved next to it.
    """
    # Train on everything, then map the predictions back through expm1
    model.fit(train, y)
    prices = np.expm1(model.predict(test))

    # Use the sample submission as the template for the output file
    data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'
    template = pd.read_csv(join(data_dir, 'sample_submission.csv'))
    template['price'] = prices

    # File name records which model and score produced the submission
    submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format(data_dir, model_name, rmsle)
    template.to_csv(submission_csv_path, index=False)
    print(f'{submission_csv_path}저장 완료!')

In [316]:
def ab_save_submission(model, train, y, test, rmsle):
    """Fit a blend of models on all training data and save the averaged prediction.

    Args:
        model: Either a list/tuple describing the blend, or any other value
            (e.g. a single estimator), in which case the module-level
            ``models`` list is used — this keeps existing calls working.
            Blend entries may be ``{'model': estimator}`` dicts or bare
            estimators.
        train: Training feature DataFrame; its ``.values`` array is used.
        y: Training target values.
        test: Test feature DataFrame; its ``.values`` array is used.
        rmsle: Score label embedded in the output file name.

    Returns:
        None. The sample submission is read from the data directory, its
        ``price`` column is overwritten, and the result is saved next to it.

    Raises:
        ValueError: If the blend contains no models.
    """
    # Earlier versions ignored `model` and always read the global `models`;
    # keep that fallback unless an explicit collection is passed in.
    blend = model if isinstance(model, (list, tuple)) else models
    # Accept both dict-wrapped and bare estimators.
    estimators = [m['model'] if isinstance(m, dict) else m for m in blend]
    if not estimators:
        raise ValueError('ab_save_submission needs at least one model to blend')

    for estimator in tqdm(estimators):
        estimator.fit(train.values, y)

    # One column per model; average them, then undo the transform via expm1
    predictions = np.column_stack([
        estimator.predict(test.values) for estimator in estimators
    ])
    predictions = np.mean(predictions, axis=1)
    predictions = np.expm1(predictions)

    # Load the sample submission to use as a template
    data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'
    submission_path = join(data_dir, 'sample_submission.csv')
    submission = pd.read_csv(submission_path)

    # Overwrite the template's price column (the original assigned the
    # undefined name `prediction` here)
    submission['price'] = predictions

    # Save
    submission_csv_path = '{}/submission_AB_RMSLE_{}.csv'.format(data_dir, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    print(f'{submission_csv_path}저장 완료!')

# 모델 점수 확인

In [115]:
# Check the hold-out score of the model built with the parameters found above
model = LGBMRegressor(n_estimators=500, max_depth=10, learning_rate=0.05,
                         num_leaves=32, random_state=random_state)

get_score(model, train, y)

121744.07926966541

In [69]:
# Score an averaging blend of the tuned models
gboost = GradientBoostingRegressor(max_depth=6, n_estimators=55,max_features=5,learning_rate=0.2,random_state=random_state)
xgboost = XGBRegressor(max_depth=7, eta=0.1, random_state=random_state)
lightgbm = LGBMRegressor(n_estimators=500, max_depth=10, learning_rate=0.05,
                         num_leaves=32, random_state=random_state)

# AveragingBlendingGetScore looks each estimator up under the 'model' key
models = [{'model':gboost}, {'model':xgboost}, {'model':lightgbm}]

# Show the score
AveragingBlendingGetScore(models, train, y)

100%|██████████| 3/3 [00:03<00:00,  1.10s/it]


119878.35614767023

In [67]:
# Score several models in one go
gboost = GradientBoostingRegressor(max_depth=6, n_estimators=55,max_features=5,learning_rate=0.2,random_state=random_state)
xgboost = XGBRegressor(max_depth=5, eta=0.1, random_state=random_state)
lightgbm = LGBMRegressor(n_estimators=500, max_depth=10, learning_rate=0.05,
                         num_leaves=32, random_state=random_state)
rdforest = RandomForestRegressor(max_depth=10, n_estimators=300,random_state=random_state)

# get_scores takes bare estimators (no dict wrapping)
models = [gboost, xgboost, lightgbm, rdforest]

get_scores(models, train, y)

100%|██████████| 3/3 [00:02<00:00,  1.02it/s]


Unnamed: 0,RMSE
XGBRegressor,131733.913453
LGBMRegressor,121744.07927
GradientBoostingRegressor,119086.806877


In [315]:
# Save the averaging-blend submission.
# NOTE: the blend is built from the module-level `models` list; the `model`
# argument passed here is a single estimator.
ab_save_submission(model, train, y, test, rmsle='0.160837')

100%|██████████| 4/4 [00:34<00:00,  8.56s/it]


6468


array([ 506131.23152961,  471666.36517738, 1360642.38108483, ...,
        453949.05627313,  339230.63987576,  436905.84529462])

# 최종 점수

In [117]:
# Train one model on the full training data and save its submission file
model = GradientBoostingRegressor(max_depth=6, n_estimators=55,max_features=5,learning_rate=0.2,random_state=random_state)

save_submission(model, train, y, test, 'gboost', rmsle='0.172109')

/aiffel/aiffel/kaggle_kakr_housing/data/submission_lgbm_RMSLE_0.160837.csv저장 완료!
[CV] END learning_rate=0.05, max_depth=10, n_estimators=100, num_leaves=32; total time=   0.7s
[CV] END learning_rate=0.05, max_depth=10, n_estimators=300, num_leaves=32; total time=   2.1s
[CV] END learning_rate=0.05, max_depth=10, n_estimators=500, num_leaves=32; total time=   2.7s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=32; total time=   0.7s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=100, num_leaves=32; total time=   0.6s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=300, num_leaves=32; total time=   1.6s
[CV] END learning_rate=0.1, max_depth=10, n_estimators=500, num_leaves=32; total time=   3.3s
[CV] END learning_rate=0.05, max_depth=10, n_estimators=100, num_leaves=64; total time=   1.0s
[CV] END learning_rate=0.05, max_depth=10, n_estimators=300, num_leaves=64; total time=   2.3s
[CV] END learning_rate=0.05, max_depth=10, n_estimators=500, num_lea

# 마무리

여러 시도를 해봤는데 지금은 목표 스코어(110000)를 넘지는 못할 것 같다. 아직 EDA를 어떻게 해야 할지 크게 감이 오지는 않는다. 일단 그리드서치로 하이퍼파라미터 탐색을 위주로 성능을 높여봤고 에버리지 블랜딩도 이용해봤다. 파라미터 조정으로는 한계가 있고 다른 방향의 조정이 필요해 보인다. 앞으로 배워나가야 할 것 같다. 