In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.datasets import load_linnerud    # built-in data
from sklearn.model_selection import train_test_split    # train, test 데이터 분할
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso # 선형 회귀 분석
from sklearn.metrics import r2_score, mean_squared_error

In [17]:
# chins, situps, jumps를 이용해 weight 예측 모델

X, y =  load_linnerud(return_X_y=True)
# X : chins, situps, jumps
# Y : weight, waist, pulse
y = y[:,0]

print(type(X), type(y))
print(X.shape, y.shape)
print(X)
print(y)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(20, 3) (20,)
[[  5. 162.  60.]
 [  2. 110.  60.]
 [ 12. 101. 101.]
 [ 12. 105.  37.]
 [ 13. 155.  58.]
 [  4. 101.  42.]
 [  8. 101.  38.]
 [  6. 125.  40.]
 [ 15. 200.  40.]
 [ 17. 251. 250.]
 [ 17. 120.  38.]
 [ 13. 210. 115.]
 [ 14. 215. 105.]
 [  1.  50.  50.]
 [  6.  70.  31.]
 [ 12. 210. 120.]
 [  4.  60.  25.]
 [ 11. 230.  80.]
 [ 15. 225.  73.]
 [  2. 110.  43.]]
[191. 189. 193. 162. 189. 182. 211. 167. 176. 154. 169. 166. 154. 247.
 193. 202. 176. 157. 156. 138.]


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

for m in [LinearRegression(), SGDRegressor(max_iter=10000)] :
    m.fit(X_train, y_train)

    print(m.coef_, m.intercept_)
    print('train score', m.score(X_train, y_train))
    print('test score', m.score(X_test, y_test))
    print('=========================================================')

[ 0.49402866 -0.27433127  0.05463569] 212.78877148077424
train score 0.31399852800761685
test score -0.32991213695066324
[-1.95727798e+10 -1.83570084e+11 -1.39185482e+11] [1.46156232e+10]
train score -2.908594356116873e+24
test score -2.1515525799887522e+24


In [19]:
# overfit, underfit, bias, variance
# bias : 실제값에서 멀어진 정도
# variance : 예측된 값들이 얼마나 떨어져있는가를 나타냄


# overfit : low bias, high variance
# underfit : high bias, low variance


# 과적합을 피해야 한다! 
# 과적합을 방해하는 작업(예방하는 작업) 필요 -> 정규화(Regularization)를 통한 과적합 감소
# Ordinary Least Squares : 에러값을 최소화(OLS)
# Ridge regression : 에러값을 최소화한 값(OLS)에 L2 norm을 더한 것!
# Lasso : 계수(1/2n)* 에러값의 최소화한 값(OLS)에 L1 norem을 더함 (L1 norm의 계수가 알파(alpha) => 노이즈의 정도)

In [20]:
# m = Ridge(alpha=0.1)  # L2 norm
m = Lasso(alpha=0.1)    # L1 norm

m.fit(X_train, y_train)

print(m.coef_, m.intercept_)
print('train score', m.score(X_train, y_train))
print('test score', m.score(X_test, y_test))

[ 0.48264555 -0.27360712  0.05471788] 212.7760120253623
train score 0.31399672117445876
test score -0.32520242685054224


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

for m in [LinearRegression(), SGDRegressor(max_iter=10000), Ridge(alpha=0.1), Lasso(alpha=0.1)] :
    m.fit(X_train, y_train)

    print(m.coef_, m.intercept_)
    print('train score', m.score(X_train, y_train))
    print('test score', m.score(X_test, y_test))
    print('=========================================================')

[ 0.49402866 -0.27433127  0.05463569] 212.78877148077424
train score 0.31399852800761685
test score -0.32991213695066324
[-6.63369887e+10  6.53209344e+08  2.19490784e+10] [7.04864071e+10]
train score -4.184693139974682e+21
test score -8.106067148795169e+20
[ 0.49365994 -0.27430916  0.05464038] 212.78840575465998
train score 0.3139985261328352
test score -0.32975689230914385
[ 0.48264555 -0.27360712  0.05471788] 212.7760120253623
train score 0.31399672117445876
test score -0.32520242685054224
