# 선형 회귀식의 계수를 찾는 법 - OLS vs. SGD
- 보스턴 집값 데이터 활용 (RM vs. Price)

### 필요한 모듈 import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 필요한 라이브러리 import 
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# 1. LinearRegression 모델을 사용한 경우

In [2]:
# Experiment 1: ordinary least squares (LinearRegression) on the single
# feature RM, evaluated on a held-out test split.
#
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on scikit-learn < 1.2.
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the dataset and wrap the feature matrix in a DataFrame.
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Use only the RM column as the (2-D) feature matrix; the target is boston.target.
X = df[['RM']]
y = boston.target

# Hold out 30% of the data for evaluation and train on the remaining 70%.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Create and fit the model.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Coefficients and intercept: attributes with a trailing underscore are
# set by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression line. "{:+.3f}" prints the intercept with its own sign,
# so a negative intercept is not rendered as "+ -30.571".
print("y = {:.3f}X {:+.3f}".format(reg.coef_[0], reg.intercept_))

# Predict on the held-out split.
y_pred = reg.predict(X_test)

# MSE, RMSE and R^2 on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", np.round(mse, 3))
print("RMSE: ", np.round(rmse, 3))
print("R2: ", np.round(r2, 3))

[8.46109164] -30.571032410898336
y = 8.461092X + -30.571
MSE: 36.517
RMSE:  6.043
R2:  0.602


# 2. SGDRegressor with hyperparameter

In [16]:
# Experiment 2: SGDRegressor on the raw (unscaled) feature with a constant
# learning rate, evaluated on the same train/test split.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# SGD shuffles the training samples each epoch; fixing random_state makes the
# fitted coefficients and the metrics below reproducible across runs.
reg = SGDRegressor(
    max_iter=1000,
    eta0=0.01,
    learning_rate='constant',
    random_state=1,
)

# Fit the model.
reg.fit(X_train, y_train)

# Coefficients and intercept: attributes with a trailing underscore are
# set by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression line. SGDRegressor stores the intercept as a 1-element
# array, hence the [0]. "{:+.3f}" prints the intercept with its own sign.
print("y = {:.3f}X {:+.3f}".format(reg.coef_[0], reg.intercept_[0]))

# Predict on the held-out split.
y_pred = reg.predict(X_test)

# MSE, RMSE and R^2 on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", np.round(mse, 3))
print("RMSE: ", np.round(rmse, 3))
print("R2: ", np.round(r2, 3))

[5.17527414] [-17.62358358]
y = 5.175274X + -17.624
MSE: 108.448
RMSE:  10.414
R2:  -0.183


In [8]:
# Print the SGDRegressor documentation (constructor signature and defaults).
help(SGDRegressor)

Help on class SGDRegressor in module sklearn.linear_model._stochastic_gradient:

class SGDRegressor(BaseSGDRegressor)
 |  SGDRegressor(loss='squared_loss', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False)
 |  
 |  Linear model fitted by minimizing a regularized empirical loss with SGD
 |  
 |  SGD stands for Stochastic Gradient Descent: the gradient of the loss is
 |  estimated each sample at a time and the model is updated along the way with
 |  a decreasing strength schedule (aka learning rate).
 |  
 |  The regularizer is a penalty added to the loss function that shrinks model
 |  parameters towards the zero vector using either the squared euclidean norm
 |  L2 or the absolute norm L1 or a combination of both (Elastic Net). If the
 | 

# 3. SGDRegressor with scaling

In [18]:
# Experiment 3: SGDRegressor after standardizing the feature by hand.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Standardize using statistics computed on the training split only, and apply
# the same shift/scale to the test split. The scaled data goes into separate
# variables so X_train / X_test keep the raw values: other cells that use
# them are then unaffected by whether this cell has been run.
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)
X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

# SGD shuffles the training samples each epoch; fixing random_state makes the
# fitted coefficients and the metrics below reproducible across runs.
reg = SGDRegressor(
    max_iter=1000,
    eta0=0.01,
    learning_rate='constant',
    random_state=1,
)

# Fit the model on the standardized feature.
reg.fit(X_train_scaled, y_train)

# Coefficients and intercept: attributes with a trailing underscore are
# set by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression line (in standardized-feature units). "{:+.3f}" prints
# the intercept with its own sign.
print("y = {:.3f}X {:+.3f}".format(reg.coef_[0], reg.intercept_[0]))

# Predict on the standardized held-out split.
y_pred = reg.predict(X_test_scaled)

# MSE, RMSE and R^2 on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", np.round(mse, 3))
print("RMSE: ", np.round(rmse, 3))
print("R2: ", np.round(r2, 3))

[5.61959368] [23.3737657]
y = 5.619594X + 23.374
MSE: 38.766
RMSE:  6.226
R2:  0.577


# 4. SGDRegressor with StandardScaler()

In [21]:
# Experiment 4: SGDRegressor after standardizing the feature with
# scikit-learn's StandardScaler.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform to
# the test split.
# NOTE: this rebinds X_train / X_test to the scaled arrays; re-run the
# train/test split cell before re-running any cell that expects raw values.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Larger iteration budget and tighter tolerance than the previous runs.
# SGD shuffles the training samples each epoch; fixing random_state makes the
# fitted coefficients and the metrics below reproducible across runs.
reg = SGDRegressor(
    max_iter=1000000,
    eta0=0.01,
    tol=0.0001,
    learning_rate='constant',
    random_state=1,
)

# Fit the model.
reg.fit(X_train, y_train)

# Coefficients and intercept: attributes with a trailing underscore are
# set by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression line (in standardized-feature units). "{:+.3f}" prints
# the intercept with its own sign.
print("y = {:.3f}X {:+.3f}".format(reg.coef_[0], reg.intercept_[0]))

# Predict on the held-out split.
y_pred = reg.predict(X_test)

# MSE, RMSE and R^2 on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", np.round(mse, 3))
print("RMSE: ", np.round(rmse, 3))
print("R2: ", np.round(r2, 3))

[5.08469365] [22.66851096]
y = 5.084694X + 22.669
MSE: 39.674
RMSE:  6.299
R2:  0.567
