# 9. 모델링

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터 준비

In [2]:
# Load the class1 dataset from CSV.
# NOTE(review): absolute, machine-specific path - the notebook cannot run on another
# machine until this points at a configurable data directory.
# NOTE(review): the result table shown near the end of the notebook contains an
# 'Unnamed: 0' column, which suggests the CSV carries a saved index that is being
# used as a model feature - confirm, and consider index_col=0 if so.
data_class1 = pd.read_csv("C:/Users/Yoo/Documents/Capstone/data_class1.csv")

In [3]:
def data_split(data, train_period=(0, 19), test_period=(20, 23), target='price'):
    """Split a frame into train/test features and targets using its 'time' column.

    Rows whose ``time`` value falls inside ``train_period`` form the training
    set and rows inside ``test_period`` form the test set. Both periods are
    inclusive on both ends. The ``target`` column is removed from the feature
    frames and returned separately.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'time'`` column and the ``target`` column.
    train_period : tuple, default (0, 19)
        ``(low, high)`` bounds of ``time`` for the training rows.
    test_period : tuple, default (20, 23)
        ``(low, high)`` bounds of ``time`` for the test rows.
    target : str, default 'price'
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_rows = data[data['time'].between(*train_period)]
    test_rows = data[data['time'].between(*test_period)]

    X_train = train_rows.drop(columns=[target])
    X_test = test_rows.drop(columns=[target])
    y_train = train_rows[target]
    y_test = test_rows[target]

    return (X_train, X_test, y_train, y_test)

In [4]:
# Split by the 'time' column: values 0-19 for training, 20-23 for testing (see data_split).
X_train, X_test, y_train, y_test = data_split(data_class1)

In [5]:
# Address / identifier columns that are kept out of the model features.
# This is a list, not a set: pandas rejects a set as a DataFrame indexer
# (deprecated in 1.5, TypeError in 2.x), and a list keeps the column order stable.
address_list = ['도로명시군구코드', '도로명건물본번호코드', '도로명건물부번호코드', '도로명', '아파트', '행정동', 'cluster', 'class']

# Keep the address columns of the test rows so they can be re-attached to the predictions later.
X_address = X_test[address_list]

# Drop the address columns from both feature frames.
# Index.difference returns the remaining column names sorted, so train and test
# end up with the same column order.
X_train = X_train[X_train.columns.difference(address_list)]
X_test = X_test[X_test.columns.difference(address_list)]

In [6]:
def reg_accuracy(y_test, y_pred, percent):
    """Print the share of predictions that land within ``percent`` % of the target.

    Predictions are rounded to the nearest integer first. A prediction counts
    as a hit when its relative error ``|y_test - y_pred| / |y_test| * 100`` is
    strictly below ``percent``, or when the rounded prediction equals the
    target exactly. The printed value is ``hits / len(y_test)``.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted values, same length as ``y_test``.
    percent : float
        Tolerance in percent (e.g. ``3`` for 3 %).

    Returns
    -------
    None
        The score is printed, not returned.
    """
    actual = np.asarray(y_test)
    rounded = np.asarray(y_pred).round().astype('int64')

    # A zero target makes the relative error inf/nan; silence the warning and
    # let such rows count only through the exact-match term below.
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_error_pct = np.abs((actual - rounded) / actual) * 100

    hits = (rel_error_pct < percent) | (rounded == actual)
    print(float(np.mean(hits)))

## 랜덤 포레스트

In [7]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [8]:
from sklearn.ensemble import RandomForestRegressor
 
# Random forest with default hyperparameters; fixed seed, n_jobs=-1 for parallel fitting.
rnd_reg = RandomForestRegressor(n_jobs=-1, random_state=42)
rnd_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [9]:
# Random forest predictions for the test rows.
y_pred_rnd = rnd_reg.predict(X_test)

In [10]:
# Share of random forest predictions within 3% of the actual price.
reg_accuracy(y_test,y_pred_rnd,3)

0.419585346215781


In [11]:
# Test-set RMSE of the random forest (same units as the price column).
np.sqrt(mean_squared_error(y_test, y_pred_rnd))

8285.308437412876

## 엑스트라 트리

In [12]:
from sklearn.tree import ExtraTreeRegressor

# NOTE(review): sklearn.tree.ExtraTreeRegressor is a single extremely randomized
# tree. If the ensemble was intended for this section, that class is
# sklearn.ensemble.ExtraTreesRegressor - confirm which one is wanted.
ext_reg = ExtraTreeRegressor(random_state=42)
ext_reg.fit(X_train, y_train)

ExtraTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                   max_features='auto', max_leaf_nodes=None,
                   min_impurity_decrease=0.0, min_impurity_split=None,
                   min_samples_leaf=1, min_samples_split=2,
                   min_weight_fraction_leaf=0.0, random_state=42,
                   splitter='random')

In [13]:
# Extra-tree predictions for the test rows.
y_pred_ext = ext_reg.predict(X_test)

In [14]:
# Share of extra-tree predictions within 3% of the actual price.
reg_accuracy(y_test,y_pred_ext,3)

0.31672705314009664


In [15]:
# Test-set RMSE of the extra-tree model.
np.sqrt(mean_squared_error(y_test, y_pred_ext))

13211.982707280076

## XG부스트

In [20]:
import xgboost

# XGBoost regressor with default hyperparameters and a fixed seed.
xgb_reg = xgboost.XGBRegressor(random_state=42)
xgb_reg.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [21]:
# XGBoost predictions for the test rows.
y_pred_xgb = xgb_reg.predict(X_test)

In [22]:
# Share of XGBoost predictions within 3% of the actual price.
reg_accuracy(y_test,y_pred_xgb,3)

0.21457326892109502


In [23]:
# Test-set RMSE of the XGBoost model.
np.sqrt(mean_squared_error(y_test, y_pred_xgb))

13806.097267349496

## LGBM

In [10]:
from lightgbm import LGBMRegressor

# LightGBM regressor with default hyperparameters and a fixed seed.
# The grid search further down tunes this same estimator.
lgb_reg = LGBMRegressor(random_state=42)
lgb_reg.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [11]:
# LightGBM (default hyperparameters) predictions for the test rows.
y_pred_lgb = lgb_reg.predict(X_test)

In [12]:
# Share of LightGBM predictions within 3% of the actual price.
reg_accuracy(y_test,y_pred_lgb,3)

0.31237721021611004


In [13]:
# Test-set RMSE of the default LightGBM model.
np.sqrt(mean_squared_error(y_test, y_pred_lgb))

22694.752741919743

In [20]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the LightGBM regressor: 2 * 2 * 3 * 3 = 36 combinations.
parameters = {'n_estimators':[1500,1750], 
              'learning_rate':[0.05, 0.1], 
              'max_depth':[2,3,4],
              'min_child_samples':[7,9,10]}

In [None]:
# Exhaustive search over `parameters` with 3-fold cross-validation, scored by negative MSE.
# NOTE(review): the name says "dtree", but the estimator being tuned is the LightGBM regressor.
# NOTE(review): the train/test split is made on the 'time' column; if rows are
# time-ordered, plain K-fold may validate on earlier rows than it trains on - confirm.
grid_dtree = GridSearchCV(lgb_reg, param_grid=parameters, cv=3, scoring='neg_mean_squared_error', n_jobs=1)
grid_dtree.fit(X_train, y_train)

In [None]:
# Tabulate the per-combination cross-validation results and preview the first rows.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.head(3)

In [17]:
# Report the best parameter combination and its cross-validated RMSE.
# best_score_ is a negative MSE (see the scoring argument), so abs + sqrt turns it into an RMSE.
print('GridSearchCV 최적 파라미터:', grid_dtree.best_params_)
print('GridSearchCV 최고 RMSE: {0:.4f}'.format(np.sqrt(np.abs(grid_dtree.best_score_))))

GridSearchCV 최적 파라미터: {'learning_rate': 0.1, 'max_depth': 3, 'min_child_samples': 9, 'n_estimators': 1500}
GridSearchCV 최고 RMSE: 18864.6065


In [18]:
# Predict the test rows with the best model found by the grid search.
# This overwrites y_pred_lgb from the default LightGBM model above.
estimator = grid_dtree.best_estimator_
y_pred_lgb = estimator.predict(X_test)

# Test-set RMSE of the tuned model.
np.sqrt(mean_squared_error(y_test, y_pred_lgb))

22216.019993980823

In [19]:
# Build the result table: test features, actual price ('real'), tuned-LightGBM
# prediction ('pred'), and the address columns set aside earlier.
# assign() returns a new frame, so X_test itself is left untouched. The previous
# alias (X_class1_result = X_test) added 'real'/'pred' to X_test in place, which
# broke any later predict(X_test) call and made the cell unsafe to re-run.
# 'real' is aligned on the row index; 'pred' is positional and must have one
# value per test row.
X_class1_result = pd.concat(
    [X_test.assign(real=y_test, pred=y_pred_lgb), X_address],
    axis=1,
)

In [28]:
# Preview the first rows of the combined result table.
X_class1_result.head(3)

Unnamed: 0,A_benefit,A_bottom,A_par_all,A_par_apt,A_pop_all,A_pop_for,A_top,D_esch,D_factory,D_hos,...,pred,real,도로명,도로명건물본번호코드,아파트,cluster,class,도로명시군구코드,도로명건물부번호코드,행정동
6,491,0,15777,5269,27040,2994,1,0,0,0,...,195175.648531,208500,압구정로,151,신현대12차,신사동_대,1,11680,0,신사동
20,69,0,25539,6021,15800,329,1,0,0,0,...,170632.15378,155000,학동로88길,5,진흥아파트,삼성1동_대,1,11680,0,삼성1동
28,491,0,15777,5269,27040,2994,1,0,0,0,...,190377.822055,180000,압구정로11길,17,미성1차,신사동_대,1,11680,0,신사동


In [30]:
# Write the result table to an Excel file in the working directory, without the row index.
X_class1_result.to_excel('class1_result.xlsx', index = False)