In [1]:
%matplotlib inline
import warnings
from matplotlib import font_manager, rc

# Suppress warning messages for the rest of the notebook
warnings.filterwarnings(action='ignore')

# Use the Malgun Gothic font so Korean (Hangul) text renders in plots.
# The path below is Windows-specific; when the file cannot be loaded, keep
# matplotlib's default font instead of aborting the notebook in its first cell.
try:
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
except (OSError, RuntimeError):
    # Font file missing or unreadable on this machine
    font_name = None

if font_name is not None:
    rc('font', family=font_name)

In [2]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

# Load the Boston housing data and combine features and target in one frame.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release to run.
boston = load_boston()
dfX = pd.DataFrame(data=boston.data, columns=boston.feature_names)
dfy = pd.DataFrame(data=boston.target, columns=["MEDV"])
df = pd.concat(objs=[dfX, dfy], axis="columns")
# Split the rows into training and validation sets at a 7:3 ratio
N = len(df)
ratio = 0.7
np.random.seed(0)  # fixed seed so the same rows are drawn on every run
# Draw the training row positions without replacement
idx_train = np.random.choice(np.arange(N), int(ratio * N), replace=False)
# Every position not drawn for training goes to the validation set.
# np.setdiff1d returns a sorted array, so the row order does not depend on
# the iteration order of a Python set.
idx_test = np.setdiff1d(np.arange(N), idx_train)

df_train = df.iloc[idx_train]
df_test = df.iloc[idx_test]

In [3]:
import statsmodels.api as sm

# Fit an ordinary least squares model of MEDV on all feature columns
model = sm.OLS.from_formula(
    formula=f"MEDV ~ {'+'.join(boston.feature_names)}",
    data=df_train,
)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.728
Model:                            OLS   Adj. R-squared:                  0.718
Method:                 Least Squares   F-statistic:                     70.06
Date:                Thu, 09 Mar 2023   Prob (F-statistic):           8.57e-88
Time:                        10:59:14   Log-Likelihood:                -1043.0
No. Observations:                 354   AIC:                             2114.
Df Residuals:                     340   BIC:                             2168.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     35.0719      5.932      5.913      0.0

In [4]:
# Score the fitted model on the validation data set
pred = result.predict(df_test)
# Residual sum of squares: squared prediction errors, summed
rss = (df_test["MEDV"] - pred).pow(2).sum()
# Total sum of squares: squared deviations of y from its mean, summed
tss = (df_test["MEDV"] - df_test["MEDV"].mean()).pow(2).sum()
# Coefficient of determination: explanatory power of the model
rsquared = 1 - rss / tss
rsquared

0.7519796502601109

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation, the rest for training
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
)
(df_train.shape, df_test.shape)

((354, 14), (152, 14))

In [6]:
# Split features (X) and target (y) into training and validation parts
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(
    dfX,
    dfy,
    test_size=0.3,
    random_state=0,
)
(dfX_train.shape, dfy_train.shape, dfX_test.shape, dfy_test.shape)

((354, 13), (354, 1), (152, 13), (152, 1))

In [7]:
from sklearn.model_selection import KFold

# 5-fold cross-validation with shuffled folds and a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = np.zeros(cv.get_n_splits())  # one validation R2 per fold
for i, (idx_train, idx_test) in enumerate(cv.split(df)):
    # Row positions of this fold -> training / validation frames
    df_train, df_test = df.iloc[idx_train], df.iloc[idx_test]

    # Refit OLS on the training part only
    model = sm.OLS.from_formula(f"MEDV ~ {'+'.join(boston.feature_names)}", data=df_train)
    result = model.fit()

    # Validation R2 = 1 - RSS / TSS, computed on the held-out part
    pred = result.predict(df_test)
    rss = (df_test["MEDV"] - pred).pow(2).sum()
    tss = (df_test["MEDV"] - df_test["MEDV"].mean()).pow(2).sum()
    rsquared = 1 - rss / tss

    scores[i] = rsquared
    print(f"학습용 R2 = {result.rsquared:.3f}, 검증용 R2 = {rsquared:.3f}")

학습용 R2 = 0.773, 검증용 R2 = 0.589
학습용 R2 = 0.729, 검증용 R2 = 0.778
학습용 R2 = 0.749, 검증용 R2 = 0.668
학습용 R2 = 0.757, 검증용 R2 = 0.668
학습용 R2 = 0.705, 검증용 R2 = 0.840


In [8]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# 5-fold cross-validation, this time scored with scikit-learn's metric helpers
cv = KFold(n_splits=5, shuffle=True, random_state=0)
# Per-fold results: scores1 = R2, scores2 = MSE, scores3 = MAE
scores1, scores2, scores3 = (np.zeros(cv.get_n_splits()) for _ in range(3))
for i, (idx_train, idx_test) in enumerate(cv.split(df)):
    # Row positions of this fold -> training / validation frames
    df_train, df_test = df.iloc[idx_train], df.iloc[idx_test]

    # Refit OLS on the training part only
    model = sm.OLS.from_formula(f"MEDV ~ {'+'.join(boston.feature_names)}", data=df_train)
    result = model.fit()

    pred = result.predict(df_test)
    # Coefficient of determination
    rsquared = r2_score(df_test["MEDV"], pred)
    # Mean squared error: average of the squared errors
    mse = mean_squared_error(df_test["MEDV"], pred)
    # Mean absolute error: average of the absolute errors
    mae = mean_absolute_error(df_test["MEDV"], pred)
    scores1[i], scores2[i], scores3[i] = rsquared, mse, mae

print(scores1, scores2, scores3, sep="\n")


[0.58922238 0.77799144 0.66791979 0.6680163  0.83953317]
[33.44898    18.65881615 21.23463289 29.22251557 16.57369039]
[3.84290922 3.38979394 3.07473854 3.6463452  3.03058651]


In [9]:
from sklearn.base import BaseEstimator, RegressorMixin
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Mixins are listed before BaseEstimator, the order scikit-learn recommends.
class StatsmodelsOLS(RegressorMixin, BaseEstimator):
    """scikit-learn style wrapper around a statsmodels formula OLS model.

    Lets a statsmodels OLS regression be used with scikit-learn utilities
    such as ``cross_val_score``.

    Parameters
    ----------
    formula : str
        Model formula handed to ``smf.ols``, e.g. ``"MEDV ~ CRIM+ZN"``.
    """

    def __init__(self, formula):
        self.formula = formula
        # State filled in by fit(); None until then
        self.model = None
        self.data = None
        self.result = None

    def fit(self, dfX, dfy):
        """Fit the OLS model on the given features and target.

        Parameters
        ----------
        dfX : pandas.DataFrame
            Feature columns referenced on the right-hand side of the formula.
        dfy : pandas.DataFrame
            Target column(s); expected to contain the formula's response
            variable.

        Returns
        -------
        StatsmodelsOLS
            This estimator, so calls can be chained as scikit-learn expects.
        """
        # The formula API needs features and target in a single frame
        self.data = pd.concat([dfX, dfy], axis=1)
        self.model = smf.ols(self.formula, data=self.data)
        self.result = self.model.fit()
        return self

    def predict(self, new_data):
        """Predict the response for ``new_data`` with the fitted model.

        Parameters
        ----------
        new_data : pandas.DataFrame
            Frame holding the feature columns used in the formula.

        Returns
        -------
        The predictions produced by the fitted statsmodels result.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet. It subclasses AttributeError
            and ValueError, so code catching AttributeError still works.
        """
        if self.result is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This StatsmodelsOLS instance is not fitted yet; call fit() before predict()."
            )
        return self.result.predict(new_data)

In [10]:
from sklearn.model_selection import cross_val_score

model = StatsmodelsOLS("MEDV ~ " + "+".join(boston.feature_names))
cv = KFold(5, shuffle=True, random_state=0)
# Keep the per-fold R2 scores in a variable: a bare call in the middle of a
# cell is thrown away, because only the cell's last expression is displayed.
r2_scores = cross_val_score(model, dfX, dfy, scoring="r2", cv=cv)

# Evaluate with mean squared error (scikit-learn reports it negated)
result = cross_val_score(model, dfX, dfy, scoring='neg_mean_squared_error', cv=cv)
# Flip the sign back to positive and take the square root: per-fold RMSE
rmse_score = np.sqrt(-result)
rmse_score

array([5.78350932, 4.31958518, 4.60810513, 5.40578538, 4.07107976])