# 세팅부분

In [182]:
from google.colab import files
uploaded = files.upload()

Saving train (1).csv to train (1).csv
Saving test.csv to test.csv


In [118]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# 1. 데이터 전처리

### 데이터 준비

In [183]:
data = pd.read_csv("train.csv")
sub = pd.read_csv("test.csv")


In [184]:
print(sub.shape)

(6468, 20)


In [185]:
y = data['price']

del data['price']

##### train데이터와 test 데이터셋에 전처리 해주기 위해 train데이터셋과 test데이터셋 concat하기

In [186]:
#나중에 다시 분리하기 위해 train데이터셋 만의 길이 저장해두기
train_len = len(data) 
data = pd.concat((data,sub),axis=0) # 즉 행으로 이어 붙인다.
print(train_len)
print(len(data))
data.head()

15035
21503


Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,20141013T000000,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,1,20150225T000000,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,2,20150218T000000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,3,20140627T000000,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,4,20150115T000000,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


### 간단한 전처리

#### id 칼럼 제거

In [187]:
sub_id = data['id'][train_len:]
del data['id']

#### date 칼럼을 연도와 달만 가져오기

In [188]:
data['date'] = data['date'].apply(lambda i :i[:6]).astype(int)

#### 데이터 분포를 그려보고 한쪽으로 몰려있는 값들에 log함수를 취해주자.

In [189]:
skew_columns = ['bedrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_lot15', 'sqft_living15']
for c in skew_columns:
  data[c] = np.log1p(data[c].values)

#### 전처리 후 데이터 다시 나눠주기

In [190]:
train = data.iloc[:train_len,:]
sub = data.iloc[train_len:,:]


In [191]:
print(train.shape)
print(y.shape)

(15035, 19)
(15035,)


#### train데이터셋의 타겟값들에 log 해주기

In [192]:
y = np.log1p(y)

# 2. LGBMRegressor 하이퍼 파라미터 정의 및 그리드탐색으로 최적 조합 찾기

In [193]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor


def rmse(y_test,y_pred):
    return np.sqrt(mean_squared_error(np.expm1(y_test),np.expm1(y_pred)))


random_state = 2020

#### 하이퍼 파라미터 정의

In [213]:
param_grid = {
    'n_estimators' : range(5, 50, 10),
  'max_features' : range(16, 18),
  'max_depth': range(3, 10),
  'learning_rate' : np.linspace(0.01, 0.2, 10)
}

#### 그리드 서치 함수 정의 -> 어떤 파라미터가 제일 나은 결과를 도출하는지 찾기

In [195]:
def my_GridSearch(model, train, y, param_grid,verbose=2, n_jobs=5):
    model = LGBMRegressor(random_state=random_state)
    grid_model = GridSearchCV(model,
                         param_grid=param_grid,\
                         scoring='neg_mean_squared_error',\
                         cv=5,verbose=1,n_jobs=5)
    grid_model.fit(train,y)
    params = grid_model.cv_results_['params']
    score = grid_model.cv_results_['mean_test_score']
    results = pd.DataFrame(params)
    results['score'] = score
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results.sort_values(by=['RMSLE'])
    
    return results

In [214]:
model = LGBMRegressor(random_state=random_state)
my_GridSearch(model,train,y,param_grid,verbose=2,n_jobs=5)

Fitting 5 folds for each of 700 candidates, totalling 3500 fits


Unnamed: 0,learning_rate,max_depth,max_features,n_estimators,score,RMSLE
0,0.01,3,16,5,-0.260805,0.510691
1,0.01,3,16,15,-0.230038,0.479623
2,0.01,3,16,25,-0.204301,0.451997
3,0.01,3,16,35,-0.182595,0.427311
4,0.01,3,16,45,-0.164219,0.405239
...,...,...,...,...,...,...
695,0.20,9,17,5,-0.075154,0.274143
696,0.20,9,17,15,-0.034082,0.184613
697,0.20,9,17,25,-0.029941,0.173034
698,0.20,9,17,35,-0.028658,0.169287


#### 결과 정의 함수 정의 -> 제일 성능이 좋은 모델로 결과 도출하기

In [208]:
def save_submission(model, train, y, test, model_name, rmsle):
    model.fit(train,y)
    prediction = model.predict(test)
    prediction = np.expm1(prediction)
    #data_dir = os.getenv("HOME")+'/aiffel/kaggle_kakr_housing/data'
    #submission_path = join(data_dir,'sample_submission.csv')
    submission = pd.read_csv("sample_submission.csv")
    print(submission.shape)
    print(prediction.shape)
    submission['price'] = prediction
    submission_csv_path = 'submission_{}_RMSLE_{}.csv'.format(model_name,rmsle)
    submission.to_csv(submission_csv_path,index=False)
    print('{} saved!!'.format(submission_csv_path))

In [198]:
print(sub.shape)

(6468, 19)


In [209]:
save_submission(model,train,y,sub,'lgbm',rmsle='0.0.164391')

(6468, 2)
(6468,)
submission_lgbm_RMSLE_0.0.164391.csv saved!!


# 3. XGBRegressor 하이퍼 파라미터 정의 및 그리드탐색으로 최적 조합 찾기

# 4. RandomForestRegressor 하이퍼 파라미터 정의 및 그리드탐색으로 최적 조합 찾기

# 5. Baseline 커널에서 활용했던 블렌딩 방법 활용하기