In [212]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.datasets import load_linnerud    # built-in data
from sklearn.model_selection import train_test_split    # train, test 데이터 분할
from sklearn.linear_model import LinearRegression, SGDRegressor # 선형 회귀 분석
from sklearn.metrics import r2_score, mean_squared_error

In [213]:
def my_line(x, coef=1.0, intercept=1.0) :
  return x*coef+intercept

def err(y, y_pred) :
  return y - y_pred

def sqerr(y, y_pred) :
  return ((y-y_pred)**2).sum()


# rsq의 결과값이 작을수록 예측 모델이 더 좋음을 나타냄 (평균값 대비 모델이 좋은지 판단하는 척도)
def rsq(y, y_pred) :
  return 1 - (sqerr(y, y_pred) / sqerr(y, y.mean()))

In [214]:
linnerud = load_linnerud()
data = linnerud['data']
target = linnerud['target']
descr = linnerud['DESCR']

# print(data)
# print(target)
print(descr)

.. _linnerrud_dataset:

Linnerrud dataset
-----------------

**Data Set Characteristics:**

    :Number of Instances: 20
    :Number of Attributes: 3
    :Missing Attribute Values: None

The Linnerud dataset is a multi-output regression dataset. It consists of three
exercise (data) and three physiological (target) variables collected from
twenty middle-aged men in a fitness club:

- *physiological* - CSV containing 20 observations on 3 physiological variables:
   Weight, Waist and Pulse.
- *exercise* - CSV containing 20 observations on 3 exercise variables:
   Chins, Situps and Jumps.

.. topic:: References

  * Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris:
    Editions Technic.



In [215]:
# 데이터셋 가져오기
y, X = load_linnerud(return_X_y=True)

print(type(X), type(y))
print(X.shape, y.shape)
print(X) # weight, Waist, Pulse  // 신체 정도
print(y) # Chins, Situps, Jumps  // 운동 능력

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(20, 3) (20, 3)
[[191.  36.  50.]
 [189.  37.  52.]
 [193.  38.  58.]
 [162.  35.  62.]
 [189.  35.  46.]
 [182.  36.  56.]
 [211.  38.  56.]
 [167.  34.  60.]
 [176.  31.  74.]
 [154.  33.  56.]
 [169.  34.  50.]
 [166.  33.  52.]
 [154.  34.  64.]
 [247.  46.  50.]
 [193.  36.  46.]
 [202.  37.  62.]
 [176.  37.  54.]
 [157.  32.  52.]
 [156.  33.  54.]
 [138.  33.  68.]]
[[  5. 162.  60.]
 [  2. 110.  60.]
 [ 12. 101. 101.]
 [ 12. 105.  37.]
 [ 13. 155.  58.]
 [  4. 101.  42.]
 [  8. 101.  38.]
 [  6. 125.  40.]
 [ 15. 200.  40.]
 [ 17. 251. 250.]
 [ 17. 120.  38.]
 [ 13. 210. 115.]
 [ 14. 215. 105.]
 [  1.  50.  50.]
 [  6.  70.  31.]
 [ 12. 210. 120.]
 [  4.  60.  25.]
 [ 11. 230.  80.]
 [ 15. 225.  73.]
 [  2. 110.  43.]]


In [216]:
# weight, waist, pulse를 이용해 situps 갯수를 예측하는 회귀모델

# situp에 대한 정보만 추출 (1차원 배열로 만듦)
y = y[:, 1] 

In [217]:
# LinearRegression 사용 coef, intercept, r2 값 확인

# data split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# create model
m = LinearRegression()

# fit
m.fit(X_train, y_train)


# check coef, intercept, r2
print(m.coef_)
print(m.intercept_)

train_score_lr = m.score(X_train, y_train)
train_r2_lr = r2_score(y_train, m.predict(X_train))
print(train_score_lr, train_r2_lr)

test_score_lr = m.score(X_test, y_test)
test_r2_lr = r2_score(y_test, m.predict(X_test))
print(test_score_lr, test_r2_lr)

# _, axe = plt.subplots()
# axe.scatter(X_train[:,0], y_train)
# axe.plot(X_train[:,0], m.predict(X_train), c='b')

[  0.32839127 -15.85832621   1.71141571]
544.6419986110332
0.3732027444047571 0.3732027444047571
0.29678137835789054 0.29678137835789054


In [218]:
# SGDRegressor 사용 coef, intercept, r2 값 확인

# data split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# create model
m = SGDRegressor()

# fit
m.fit(X_train, y_train)


# check coef, intercept, r2
print(m.coef_)
print(m.intercept_)

train_score_sg = m.score(X_train, y_train)
train_r2_sg = r2_score(y_train, m.predict(X_train))
print(train_score_sg, train_r2_sg)

test_score_sg = m.score(X_test, y_test)
test_r2_sg = r2_score(y_test, m.predict(X_test))
print(test_score_sg, test_r2_sg)

# _, axe = plt.subplots()
# axe.scatter(X_train[:,0], y_train)
# axe.plot(X_train[:,0], m.predict(X_train), c='b')

[-3.30670739e+11  9.11880432e+10 -3.20051131e+11]
[1.18137871e+09]
-1.8291352227815113e+24 -1.8291352227815113e+24
-1.0048103435113629e+24 -1.0048103435113629e+24
