In [2]:
from sklearn.datasets import load_diabetes

In [None]:
# scikit-learn이 제공하는 연습용 데이터 가져오기 
ds = load_diabetes()

In [None]:
# 받은 데이터의 내용 목록 확인 ( 데이터가 Key-Value로 구성되어서 Key값을 통해 확인 )
ds.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [None]:
# 데이터 설명 보기
print( ds['DESCR'] )

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [None]:
# y값 확인 : 연속형 값이므로 회귀 모델 적용 가능
ds['target']

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [29]:
# X 값의 컬럼명 확인
ds['feature_names']

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [9]:
ds['data'] # X values

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]], shape=(442, 10))

In [10]:
import pandas as pd

pd.DataFrame(ds['data'], columns=ds['feature_names'])

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [None]:
# X의 값이 평균 0 동일한 값의 표준편차 분포의 데이터로 정규화된 것 확인
ds['data'].min(axis=0), ds['data'].max(axis=0), ds['data'].mean(axis=0), ds['data'].std(axis=0)

(array([-0.10722563, -0.04464164, -0.0902753 , -0.1123988 , -0.12678067,
        -0.11561307, -0.10230705, -0.0763945 , -0.12609712, -0.13776723]),
 array([0.11072668, 0.05068012, 0.17055523, 0.13204362, 0.15391371,
        0.19878799, 0.18117906, 0.18523444, 0.13359728, 0.13561183]),
 array([-1.44429466e-18,  2.54321451e-18, -2.25592546e-16, -4.85408596e-17,
        -1.42859580e-17,  3.89881064e-17, -6.02836031e-18, -1.78809958e-17,
         9.24348582e-17,  1.35176953e-17]),
 array([0.04756515, 0.04756515, 0.04756515, 0.04756515, 0.04756515,
        0.04756515, 0.04756515, 0.04756515, 0.04756515, 0.04756515]))

In [14]:
from sklearn.linear_model import LinearRegression

In [None]:
# 선형 회귀 모델 만들고 훈련(학습)
lr = LinearRegression()
lr.fit(ds['data'], ds['target'])

In [None]:
# 모델 성능 평가 ( 결정계수 : 0 ~ 1 범위의 값으로 1에 가까울수록 좋은 모델 )
lr.score(ds['data'], ds['target'])

0.5177484222203499

In [None]:
# 학습된 모델을 사용해서 예측 값 계산
y_predict = lr.predict(ds['data'])
y_predict[:10]

array([206.11667725,  68.07103297, 176.88279035, 166.91445843,
       128.46225834, 106.35191443,  73.89134662, 118.85423042,
       158.80889721, 213.58462442])

In [None]:
# (예측값과 실제값의 차)의 합 계산 (오차)
error = np.absolute(y_predict - ds['target']).sum()

# 오차의 평균 계산 ( 오차 합을 데이터 갯수로 나누기 )
error / y_predict.shape[0]

np.float64(43.27745202531506)

In [28]:
ds['target'].mean()

np.float64(152.13348416289594)

In [33]:
import statsmodels.api as sm

In [36]:
X = sm.add_constant(ds['data']) # X값 등록
y = ds['target']

model = sm.OLS(y, X)
result = model.fit()

In [41]:
result.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.518
Model:,OLS,Adj. R-squared:,0.507
Method:,Least Squares,F-statistic:,46.27
Date:,"Mon, 21 Apr 2025",Prob (F-statistic):,3.8299999999999998e-62
Time:,10:50:43,Log-Likelihood:,-2386.0
No. Observations:,442,AIC:,4794.0
Df Residuals:,431,BIC:,4839.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,152.1335,2.576,59.061,0.000,147.071,157.196
x1,-10.0099,59.749,-0.168,0.867,-127.446,107.426
x2,-239.8156,61.222,-3.917,0.000,-360.147,-119.484
x3,519.8459,66.533,7.813,0.000,389.076,650.616
x4,324.3846,65.422,4.958,0.000,195.799,452.970
x5,-792.1756,416.680,-1.901,0.058,-1611.153,26.802
x6,476.7390,339.030,1.406,0.160,-189.620,1143.098
x7,101.0433,212.531,0.475,0.635,-316.684,518.770
x8,177.0632,161.476,1.097,0.273,-140.315,494.441

0,1,2,3
Omnibus:,1.506,Durbin-Watson:,2.029
Prob(Omnibus):,0.471,Jarque-Bera (JB):,1.404
Skew:,0.017,Prob(JB):,0.496
Kurtosis:,2.726,Cond. No.,227.0
