In [47]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
import warnings  
warnings.filterwarnings('ignore')
from matplotlib import font_manager, rc
import platform

if platform.system() == 'Windows':
# 윈도우인 경우
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/HMFMMUEX.ttc").get_name()
    rc('font', family=font_name)
else:    
# Mac 인 경우
    rc('font', family='AppleGothic')

%matplotlib inline

In [49]:
data = pd.read_excel('1year.xlsx')
data_ = data.drop(['본번'],axis = 1)
df = pd.DataFrame(data_,index = data['본번'])
df

Unnamed: 0_level_0,전용면적(㎡),거래금액(만원),층,건축년도
본번,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
639,59.7550,11500,1,1999
639,59.7550,11500,1,1999
639,59.7550,11500,1,1999
639,59.7550,11500,1,1999
639,59.7550,11500,1,1999
...,...,...,...,...
870,102.1170,29000,10,1992
870,102.1170,29000,10,1992
373,59.4100,12000,3,1991
688,84.6339,21000,20,2005


In [50]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score

# feature, target 데이터 분리
y_target = df['거래금액(만원)']
X_data = df.drop(['거래금액(만원)'], axis=1, inplace=False)

# train, test 데이터 분리
X_train , X_test , y_train , y_test = train_test_split(X_data , y_target ,test_size=0.3, random_state=156)

# Linear Regression 
lr = LinearRegression()

# 학습
lr.fit(X_train , y_train )

# 예측
y_preds = lr.predict(X_test)

# 평가
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

print('MSE : {0:.3f} , RMSE : {1:.3F}'.format(mse , rmse))
print('Variance score : {0:.3f}'.format(r2_score(y_test, y_preds)))

MSE : 44533632.815 , RMSE : 6673.352
Variance score : 0.666


In [51]:
# 회귀 계수를 큰 값 순으로 정렬하기 위해 Series로 생성. index가 컬럼명에 유의
coeff = pd.Series(data=np.round(lr.coef_, 1), index=X_data.columns )
coeff.sort_values(ascending=False)

건축년도       553.8
층          426.8
전용면적(㎡)    234.4
dtype: float64

In [52]:
from sklearn.model_selection import cross_val_score

# features, target 데이터 정의
y_target = df['거래금액(만원)']
X_data = df.drop(['거래금액(만원)'], axis=1, inplace=False)

# 선형 회귀 객체 생성
lr = LinearRegression()
lr

LinearRegression()

In [53]:
# 5 folds 의 개별 Negative MSE scores
neg_mse_scores = cross_val_score(lr, X_data, y_target, scoring="neg_mean_squared_error", cv = 5)
neg_mse_scores

array([-40425298.78121763, -45792001.30083339, -44554361.16129865,
       -41935873.57667971, -51697549.66769119])

In [54]:
# RMSE를 구하기 위해서는 MSE값들에 -1을 곱한 후 평균을 내면 된다.
rmse_scores  = np.sqrt( -1 * neg_mse_scores )
rmse_scores

array([6358.0892398 , 6766.97874245, 6674.90532976, 6475.79134753,
       7190.10081068])

In [55]:
# 5 folds 의 평균 RMSE
avg_rmse = np.mean(rmse_scores)
avg_rmse

6693.173094043166

In [56]:
# cross_val_score(scoring="neg_mean_squared_error")로 반환된 값은 모두 음수 
print(' 5 folds 의 개별 Negative MSE scores: ', np.round(neg_mse_scores, 2))
print(' 5 folds 의 개별 RMSE scores : ', np.round(rmse_scores, 2))
print(' 5 folds 의 평균 RMSE : {0:.3f} '.format(avg_rmse))

 5 folds 의 개별 Negative MSE scores:  [-40425298.78 -45792001.3  -44554361.16 -41935873.58 -51697549.67]
 5 folds 의 개별 RMSE scores :  [6358.09 6766.98 6674.91 6475.79 7190.1 ]
 5 folds 의 평균 RMSE : 6693.173 
