In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
#데이터 불러오기
data = pd.read_csv('./Data/kc_house_data.csv')
data

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.00,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.7210,-122.319
2,5631500400,20150225T000000,180000.0,2,1.00,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.00,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.00,1.0,0,3,8,1987,0,98074,47.6168,-122.045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,3.0,0,3,8,2009,0,98103,47.6993,-122.346
21609,6600060120,20150223T000000,400000.0,4,2.50,2.0,0,3,8,2014,0,98146,47.5107,-122.362
21610,1523300141,20140623T000000,402101.0,2,0.75,2.0,0,3,7,2009,0,98144,47.5944,-122.299
21611,291310100,20150116T000000,400000.0,3,2.50,2.0,0,3,8,2004,0,98027,47.5345,-122.069


In [3]:
'''
id: 집 고유아이디
date: 집이 팔린 날짜 
price: 집 가격 (타겟변수)
bedrooms: 주택 당 침실 개수
bathrooms: 주택 당 화장실 개수
floors: 전체 층 개수
waterfront: 해변이 보이는지 (0, 1)
condition: 집 청소상태 (1~5)
grade: King County grading system 으로 인한 평점 (1~13)
yr_built: 집이 지어진 년도
yr_renovated: 집이 리모델링 된 년도
zipcode: 우편번호
lat: 위도
long: 경도
'''

'\nid: 집 고유아이디\ndate: 집이 팔린 날짜 \nprice: 집 가격 (타겟변수)\nbedrooms: 주택 당 침실 개수\nbathrooms: 주택 당 화장실 개수\nfloors: 전체 층 개수\nwaterfront: 해변이 보이는지 (0, 1)\ncondition: 집 청소상태 (1~5)\ngrade: King County grading system 으로 인한 평점 (1~13)\nyr_built: 집이 지어진 년도\nyr_renovated: 집이 리모델링 된 년도\nzipcode: 우편번호\nlat: 위도\nlong: 경도\n'

In [6]:
ncar = data.shape[0]
nvar = data.shape[1]
print(ncar,nvar)

21613 14


### 의미 없는 변수 제거

In [8]:
data = data.drop(['id','date','zipcode','lat','long'],axis=1)

KeyError: "['id' 'date' 'zipcode' 'lat' 'long'] not found in axis"

In [9]:
data

Unnamed: 0,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated
0,221900.0,3,1.00,1.0,0,3,7,1955,0
1,538000.0,3,2.25,2.0,0,3,7,1951,1991
2,180000.0,2,1.00,1.0,0,3,6,1933,0
3,604000.0,4,3.00,1.0,0,5,7,1965,0
4,510000.0,3,2.00,1.0,0,3,8,1987,0
...,...,...,...,...,...,...,...,...,...
21608,360000.0,3,2.50,3.0,0,3,8,2009,0
21609,400000.0,4,2.50,2.0,0,3,8,2014,0
21610,402101.0,2,0.75,2.0,0,3,7,2009,0
21611,400000.0,3,2.50,2.0,0,3,8,2004,0


### 범주형 변수를 이진형 변수로 변환
- 범주형 변수는 waterfront 컬럼 뿐이며, 이진 분류이기 때문에 0,1로 표현한다.
- 데이터에서 0,1로 표현되어 있으므로 과정 생략

### 설명변수와 타겟변수를 분리, 학습데이터와 평가데이터 분리

In [10]:
feature_columns = list(data.columns.difference(['price']))
x = data[feature_columns]
y = data['price']
train_x, test_x, train_y, test_y = train_test_split(x,y,test_size=0.3, random_state=42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(15129, 8) (6484, 8) (15129,) (6484,)


### 학습 데이터를 선형 회귀 모형에 적합 후 평가 데이터로 검증

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

regression_model = LinearRegression()
linear_model1 = regression_model.fit(train_x, train_y)
pred1 = linear_model1.predict(test_x)
print(sqrt(mean_squared_error(pred1,test_y)))

239804.29670858145


### 학습 데이터를 의사결정나무모형에 적합 후 평가 데이터로 검증

In [16]:
from sklearn.tree import DecisionTreeRegressor
decision_tree_model = DecisionTreeRegressor() 
tree_model1 = decision_tree_model.fit(train_x, train_y) 
predict1 = tree_model1.predict(test_x)
print("RMSE: {}".format(sqrt(mean_squared_error(predict1, test_y)))) 

RMSE: 295334.3459614948


### Bagging을 이용하여 의사결정나무모형에 적합 후 평가

In [18]:
from sklearn.ensemble import BaggingRegressor
bagging_decision_tree_model1 = BaggingRegressor(base_estimator = decision_tree_model, 
                                                n_estimators = 5, 
                                                verbose = 1) 
tree_model2 = bagging_decision_tree_model1.fit(train_x, train_y) 
predict2 = tree_model2.predict(test_x) 
print("RMSE: {}".format(sqrt(mean_squared_error(predict2, test_y)))) 

RMSE: 237603.12514402534


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [19]:
bagging_decision_tree_model2 = BaggingRegressor(base_estimator = decision_tree_model, 
                                                n_estimators = 30, 
                                                verbose = 1) 
tree_model3 = bagging_decision_tree_model2.fit(train_x, train_y) 
predict3 = tree_model3.predict(test_x) 
print("RMSE: {}".format(sqrt(mean_squared_error(predict3, test_y)))) 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RMSE: 234207.25019389612


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
